In [7]:
#### Importing Libraries ####

import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import time

dataset = pd.read_csv('~/Desktop/COVID/Udemy/Udemy Machine learning case studies/Fintech app subscribers/new_appdata10.csv')

In [8]:
#### Data Pre-Processing ####

# Splitting Independent and Response Variables. Drop the response variable from the dataset
response = dataset["enrolled"]
dataset = dataset.drop(columns="enrolled")


In [9]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset, response,
                                                    test_size = 0.2,
                                                    random_state = 0)

In [10]:
## Balancing the Training Set
#import random
#y_train.value_counts()
#
#pos_index = y_train[y_train.values == 1].index
#neg_index = y_train[y_train.values == 0].index
#
#if len(pos_index) > len(neg_index):
#    higher = pos_index
#    lower = neg_index
#else:
#    higher = neg_index
#    lower = pos_index
#
#random.seed(0)
#higher = np.random.choice(higher, size=len(lower))
#lower = np.asarray(lower)
#new_indexes = np.concatenate((lower, higher))
#
#X_train = X_train.loc[new_indexes,]
#y_train = y_train[new_indexes]


In [11]:
# Removing Identifiers as we dont need them now, but will need to associate the 
# ID's with teh results later, so store them for future use
train_identity = X_train['user']
X_train = X_train.drop(columns = ['user'])
test_identity = X_test['user']
X_test = X_test.drop(columns = ['user'])

In [12]:
# Feature Scaling / Normalization (we dont want a particular variable to have a greater
# influence on the model simply because it's absolute values are large. Therefore 
# we normalize)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()

# Scale the X_train data, returning a numpy array, losing the column name and index, which we care about since these 
# are needed to identify users and we need the column names for our model

# So we save the scaled data to its to a new dataframe. fit_transform changes the
# scaled arrays back into the desired format
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))

# we only transform our test set, becuase it has already been fitted to the training set
X_test2 = pd.DataFrame(sc_X.transform(X_test))

# Make sure the new datasets have the columns set to the original datasets
X_train2.columns = X_train.columns.values
X_test2.columns = X_test.columns.values

#We then assign the same indexes from the original datasets
X_train2.index = X_train.index.values
X_test2.index = X_test.index.values

#Now we make our new datasets the ones we work with
X_train = X_train2
X_test = X_test2

ValueError: could not convert string to float: '2013-01-09 20:42:47.430'

In [13]:
#### Model Building ####

# Fitting Model to the Training Set
from sklearn.linear_model import LogisticRegression

# Add a penalty L1, making the model a regular model to an L1 regularization model
# This is because of the funneling of variables we did earlier. The L1 response, penalizes andy fields
# that have strong correlation to the response variable.

# This is an important variable for working with mobile app screens since there may be a screen which always 
# appears before the enrolment screen. This may end up having a huge coefficient of 
# influence on the model. We want to discourage this.

classifier = LogisticRegression(random_state = 0, penalty = 'l1')
classifier.fit(X_train, y_train)




ValueError: could not convert string to float: '2013-03-12 18:22:38.045'

In [14]:
# Predicting Test Set
y_pred = classifier.predict(X_test)

NotFittedError: This LogisticRegression instance is not fitted yet

In [15]:
# Evaluating Results
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
precision_score(y_test, y_pred) # tp / (tp + fp)
recall_score(y_test, y_pred) # tp / (tp + fn)
f1_score(y_test, y_pred)

NameError: name 'y_pred' is not defined

In [None]:
df_cm = pd.DataFrame(cm, index = (0, 1), columns = (0, 1))
plt.figure(figsize = (10,7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g')
print("Test Data Accuracy: %0.4f" % accuracy_score(y_test, y_pred))


# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score

#we want 10 folds (cv)
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("SVM Accuracy: %0.3f (+/- %0.3f)" % (accuracies.mean(), accuracies.std() * 2))

# Analyzing Coefficients
pd.concat([pd.DataFrame(dataset.drop(columns = 'user').columns, columns = ["features"]),
           pd.DataFrame(np.transpose(classifier.coef_), columns = ["coef"])
           ],axis = 1)


#### Model Tuning ####

## Grid Search (Round 1)
from sklearn.model_selection import GridSearchCV

# Select Regularization Method
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Combine Parameters
parameters = dict(C=C, penalty=penalty)

grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           n_jobs = -1)
t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters


## Grid Search (Round 2)

# Select Regularization Method
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = [0.1, 0.5, 0.9, 1, 2, 5]

# Combine Parameters
parameters = dict(C=C, penalty=penalty)

grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = "accuracy",
                           cv = 10,
                           n_jobs = -1)
t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds" % (t1 - t0))

rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters
grid_search.best_score_


#### End of Model ####


# Formatting Final Results
final_results = pd.concat([y_test, test_identity], axis = 1).dropna()
final_results['predicted_reach'] = y_pred
final_results = final_results[['user', 'enrolled', 'predicted_reach']].reset_index(drop=True)
