In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

### Examining the Data

In [None]:
# Load the prepared users dataset from a pickle file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.

users_data = pd.read_pickle("../../data/users_data_final.pkl")

In [None]:
# preview the first five rows (five is the default for head)

users_data.head()

In [None]:
# dimensions of the data as a (rows, columns) tuple

users_data.shape

In [None]:
# Count the distinct user ids present in the dataframe.

np.unique(users_data["user_id"]).size

# as expected, this matches the number of rows in the data

In [None]:
# summarise every column: non-null count and dtype
users_data.info()

### Feature Selection: Drop the Date Joined Variable

In [None]:
# If you did not find a way to engineer a feature from the date joined
# variable, remove the column.

users_data.drop(columns=["date_joined"], inplace=True)


### Feature Engineering: Numerical Data

In [None]:
# identify all the numerical features



In [None]:
# replace missing numerical values with 0



In [None]:
# identify highly correlated variables
# look at the correlations: can you see any pairs of features that are highly correlated?



In [None]:
# identify outliers; let's start with age
# draw a box plot for age



In [None]:
# draw a histogram for age



In [None]:
# there is very minimal skew, let's assume that age is normally distributed and remove the outliers



In [None]:
# subset the dataframe, so we only consider ages which are greater than LB and less than UB



### Feature Engineering: Categorical Data

In [None]:
# identify all the categorical features



In [None]:
# We want to look at the distribution of e.g. jobs across all users
# We could use the value_counts function to get a count of unique values



In [None]:
# count the missing values in each column

users_data_cleaned.isna().sum()

# question: which feature has the most missing data?

In [None]:
# drop the feature that has a high percentage of missing data



In [None]:
# Replace missing values in the categorical columns with "Unknown".
# The fill is done with a single DataFrame-level call: calling
# fillna(..., inplace=True) on a selected column (df[col]) is chained
# assignment, which raises a FutureWarning on pandas 2.x and leaves the
# dataframe unchanged when Copy-on-Write is enabled.

users_data_cleaned.fillna(
    dict.fromkeys(["job", "education", "contact", "device"], "Unknown"),
    inplace=True,
)


In [None]:
# before building our model, we need to encode the categorical data
# there are lots of ways to do that, but we will use the pd.get_dummies function
# let's start with marital



In [None]:
# drop any feature that has more than 10 categories



### Building the first machine learning model

In [None]:
# check the shape of the cleaned data
# NOTE(review): the cleaning cells above operate on `users_data_cleaned`,
# while this cell and the modelling cells below use `users_data` - confirm
# which dataframe is intended here.

users_data.shape

In [None]:
# check that every remaining column has a numerical dtype

users_data.info()

In [None]:
# Define the target, our y
# we want to predict total_amount_usd, so plot its distribution first

users_data["total_amount_usd"].hist().set_title("Distribution of the total_amount_usd");

# The distribution is skewed; we could transform the variable in later iterations


In [None]:

# Outliers exist, in next iterations we may want to remove them

In [None]:
# Step 1. Define the target and the features
# A supervised machine learning algorithm needs both: it uses historical data
# to uncover relationships between the features of the dataset and the target.

target_data = users_data["total_amount_usd"]
features = users_data.drop(columns=["total_amount_usd"])


In [None]:
# Step 2. Split the data into a training set and a test set
# The training set is what the machine learning algorithm learns from;
# the test set is held back to evaluate the predictions.

# scikit-learn's train_test_split does the split for us
# test_size can be adjusted, but it's best practice to train the model on 70 - 80% of the data

X_train, X_test, y_train, y_test = train_test_split(
    features, target_data, test_size=0.2, random_state=42
)


In [None]:
# show the dimensions of the training features and training target

print("Training Data", X_train.shape, y_train.shape, sep="\n")

In [None]:
# show the dimensions of the test features and test target

print("\nTest Data", X_test.shape, y_test.shape, sep="\n")

In [None]:
# Step 3. Instantiate the Linear Regression model from sklearn
# (the target is a continuous amount, so this is a regression problem)

lr = LinearRegression()

In [None]:
# Step 4. Fit the model on the training data
# NOTE(review): no feature scaling is applied in the cells shown here,
# so the model is fitted on the unscaled training features.

lr.fit(X_train, y_train) 

# This is your machine learning model!

In [None]:
# Step 5. Predict the target for the unseen test data
# (the predictions are compared with y_test in the cells below)

y_pred = lr.predict(X_test) 


In [None]:
# compare the first five predictions with the first five actual values
print("first five predicted total amounts:", y_pred[:5])
print("first five actual total amounts:", y_test[:5].tolist())

In [None]:
# We will use R^2 to evaluate the model performance.

r2_score(y_true=y_test, y_pred=y_pred)

# R^2 (coefficient of determination) is a regression score.
# The best possible score is 1.0, and it can be negative (because the model can be arbitrarily worse).

# Pretty close to 0. Let's see whether we can improve the score over the next few weeks.
