In [2]:
# importing libraries 
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
# importing the dataset
floodlight_data = pd.read_csv('../data/part2/complete_dataset.csv')

In [3]:
# info on data such as data type 
floodlight_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287521 entries, 0 to 287520
Data columns (total 20 columns):
floodlightOpenId                 287521 non-null object
participantCreatedOn             287521 non-null object
participantIsControl             287521 non-null bool
participantCountryOfResidence    287521 non-null object
participantSex                   287521 non-null object
participantBirthYear             287521 non-null int64
participantWeightLbs             287521 non-null float64
participantHeightCms             287521 non-null float64
testName                         287521 non-null object
testCode                         287521 non-null object
testMetricName                   287521 non-null object
testMetricCode                   287521 non-null object
testStartedAt                    287521 non-null object
testEndedAt                      287521 non-null object
testResultMetricId               287521 non-null int64
testResultMetricCreatedOn        287521 non-null ob

In [160]:
# Statistic on the data
floodlight_data.describe()

Unnamed: 0,participantBirthYear,participantWeightLbs,participantHeightCms,testResultMetricId,testResultMetricTimestamp1,testResultMetricTimestamp2,testResultMetricValue,Unnamed: 19
count,287521.0,287521.0,287521.0,287521.0,0.0,0.0,287521.0,0.0
mean,1971.339325,171.369493,168.115317,163581.601466,,,28.601507,
std,12.225201,54.048453,21.287611,94241.026701,,,243.872635,
min,1900.0,3.0,1.0,66.0,,,0.0,
25%,1963.0,135.0,163.0,72587.0,,,0.14,
50%,1971.0,161.0,170.0,171387.0,,,1.0,
75%,1979.0,190.0,178.0,244971.0,,,6.21,
max,2020.0,530.0,250.0,317566.0,,,45418.5,


#### data asessment:

1. 3 columns are empty. So need to drop it.
2. Many values are missing. 
3. Data types are not appropriate for multiple columns. Need to change it to datetime datatype and int data type
4. Many variable are catgorical like Sex, ParticipantControl, Country. So need to do encoding like label encoding, and one hot encoding
5. Some columns contains same info such as testname and testcode. Need to delete columns that have repetitive info.
6. Need to reduce features by combining and cross-featuring. For example, getting BMI from weight and height, Age from birth years and testStartedAt etc.

**Overall, we need to perform lots of feature engineering on the raw floodlight data to build classifier.**

In [4]:
''' this function is about data processing for building model: 
    1) dropping empty columns
    2) changing data types for multiple columns
    3) dropping columns that contain same info and unnecessary for building model
    
    Arguments:
    floodlight_data -- pandas df of floodlight data
    
    Return:
    processed_floodlight_data -- this is processed floodlight df  
'''

def data_processing(floodlight_data):
    
    # list of columns to drop
    drop_col_list = ['testResultMetricTimestamp1','testResultMetricTimestamp2','Unnamed: 19', 'testCode', 'testMetricCode']
    processed_floodlight_data = floodlight_data.drop(drop_col_list,axis=1,errors='ignore')
    
    # Apply datetime datatype to datetime columns
    date_col_list = ['participantCreatedOn', 'testStartedAt', 'testEndedAt', 'testResultMetricCreatedOn']
    processed_floodlight_data[date_col_list] = processed_floodlight_data[date_col_list].apply(pd.to_datetime)
    
    # Change to int data type 
    int_col_list = ['participantBirthYear', 'participantWeightLbs', 'participantHeightCms', 'testResultMetricId']
    processed_floodlight_data[int_col_list] = processed_floodlight_data[int_col_list].apply(pd.to_numeric, downcast='integer')
    
    
    return processed_floodlight_data

In [5]:
''' Feature engineering function steps:
    1. Creating new feature from combining 'testName' and 'testMetricName'.
    2. Creating new feature Body Mass Index (BMI) from height and weight. 
       Removing height and weight, keeping only BMI.
    3. Creating new feature Age from birth year and testStartedAt. 
       Removing birth year and testStartedAt, keeping age only.
    4. Creating new feature 'participantTestTime' from 'testEndedAt' and 'testStartedAt'
    5. Implement one hot encoding for 'participantCountryOfResidence'. 
       Because country is categorical variable. 
    6. Categorical column grouping - because appearantly participant has participated in multiple test
    7. Change Boolean True/False values to 1/0 for 'participantSex' 
       and 'participantIsControl'
    8. Filling missing values with mean for 'testResultMetricValue'
    9. Standardizing/normalizing the testResult Metric value
    
    Arguments:
    processed_floodlight_data -- this is output of 'data_processing(floodlight_data)'
    
    Return:
    floodlight_data_with_features -- dataframe with feature engineering 
    
'''

def feature_engineering(processed_floodlight_data):
    
    
    # Combining testName and testMetricName into one feature 'test_long_name'
    processed_floodlight_data['test_long_name'] = processed_floodlight_data['testName']+"_"+processed_floodlight_data['testMetricName']
    floodlight_data_with_features = processed_floodlight_data.drop(['testName','testMetricName'], axis=1, errors='ignore')
    
    
    ''' creating new feature BMI: 
    1. converting participant height and weight into kg and M
    2. calculating BMI 
    3. deleting height and weight columns and keeping only BMI
    '''
    floodlight_data_with_features['participantHeightM'] = floodlight_data_with_features['participantHeightCms'].apply(lambda x: x/100)
    floodlight_data_with_features['participantWeightKg'] = floodlight_data_with_features['participantWeightLbs'].apply(lambda x: x*0.454)
    floodlight_data_with_features['participantBMI'] = floodlight_data_with_features['participantWeightKg']/(floodlight_data_with_features['participantHeightM'] **2)
    floodlight_data_with_features.drop(['participantHeightM', 'participantWeightKg', 'participantHeightCms', 'participantWeightLbs'], axis=1, inplace=True, errors='ignore')
    
    
    
    # calculate new feature Age from birth year and testStartedAt
    floodlight_data_with_features['participantAge'] = [floodlight_data_with_features.testStartedAt[i].year - floodlight_data_with_features.participantBirthYear[i] 
                    for i in range(0, len(floodlight_data_with_features))]
    
    
    
    # total test time computed from 'testEndedAt' and 'testStartedAt'
    floodlight_data_with_features['participantTestTime'] = (floodlight_data_with_features['testEndedAt'] - floodlight_data_with_features['testStartedAt']).apply(lambda x: x.seconds)
    floodlight_data_with_features.drop(['testStartedAt', 'testEndedAt', 'participantBirthYear'], axis = 1, inplace=True, errors='ignore')
    
    
    '''One hot encoding for 'participantCountryOfResidence'
    'participantCountryOfResidence' is categorical variable. 
     We have categories with countries abbreviation such as IT, US, CA, PL etc
     We don't need ordinal ranking. So, lets do One hot encoding instead of label encoding.
    '''
    # creating one hot encoding object
    onehotencoder = OneHotEncoder()
    
    # reshaping 1D country array to 2D and as fit_transform expect 2D input and the fitting the input into object
    participantCountry = onehotencoder.fit_transform(floodlight_data_with_features.participantCountryOfResidence.\
                                                     values.reshape(-1,1)).toarray()
    
    # creating one hot encoded dataframe
    OneHot_df = pd.DataFrame(participantCountry, columns = ["participantCountry_"+str(int(i))\
                                                           for i in range(floodlight_data_with_features.shape[1])]) 
    
    # concatenation of original df and one hot encoded df
    floodlight_data_with_features = pd.concat([floodlight_data_with_features, OneHot_df], axis=1)
    
    # dropping column participantCountryOfResidence
    floodlight_data_with_features.drop(['participantCountryOfResidence'], axis = 1, inplace=True, errors='ignore')
    
    
    
    # getting list of categorical columns to group and then categorial grouping
    columns_to_group = [col for col in floodlight_data_with_features.columns if col.startswith('participant')]
    floodlight_data_with_features = floodlight_data_with_features.pivot_table(index=['floodlightOpenId']+columns_to_group, columns = ['test_long_name'], values= ["testResultMetricValue"])
    floodlight_data_with_features = floodlight_data_with_features.reset_index()
    
    
    
    # Change True/False to 1/0
    floodlight_data_with_features['participantSex'] = floodlight_data_with_features['participantSex'].replace({'female': 1, 'male': 0})
    floodlight_data_with_features['participantIsControl'] = floodlight_data_with_features['participantIsControl'].astype(int)
    
    
    
    # filling NA values of testResultMetricValue with its mean
    floodlight_data_with_features['testResultMetricValue']= floodlight_data_with_features['testResultMetricValue'].\
    fillna((floodlight_data['testResultMetricValue'].mean()))
    
    
    # standardizing/normalizing the testResult Metric value
    floodlight_data_with_features["testResultMetricValue"] = preprocessing.scale(floodlight_data_with_features["testResultMetricValue"], \
                                                                             with_mean = True, with_std = True)
    
    return floodlight_data_with_features

In [6]:
def training_testing_set(floodlight_data):
    '''
    Steps for creating training and test dataset for classifier
    1) It calls two functions:
       i) data_processing
       ii) feature_engineering   
    2) Target label encoding 
    3) Building train and test set using 'train_test_split function'
    
    Arguments: 
    floodlight_data -- pandas df of floodlight data
    
    Return:
    X_train -- Training input data set, 
    X_test -- Test input data set
    Y_train -- Training target data 
    Y_test -- Test target data
    
    ''' 
    # data processing: calling above data_processing function
    processed_floodlight_data = data_processing(floodlight_data)
    
    # feature engineering: calls above feature_engineering function
    floodlight_data_with_features = feature_engineering(processed_floodlight_data)
    
    # creating input data set from feature engineering data
    X = floodlight_data_with_features.iloc[:, 3::]
    
    # target label data
    Y = floodlight_data_with_features['participantIsControl']
    
    # Encoding target labels of "participantIsControl"
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Y)  
    
    # Build training and test set 
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
    
    print("train X shape: " + str(X_train.shape))
    print("test X shape: " + str(X_test.shape))
    print("Y_train shape: " + str(Y_train.shape))
    print("Y_test shape: " + str(Y_test.shape))
    
    return X_train, X_test, Y_train, Y_test

## Logistic Regression

In [127]:
'''Logistics Regression model
'''
# getting train and test dataset
X_train, X_test, Y_train, Y_test = training_testing_set(floodlight_data)
    
# Logistic regression model
model = LogisticRegression(solver="liblinear", C=1000)
model.fit(X_train, Y_train)
prediction = model.predict(X_test)
 
# printing accuracy of train and test dataset    
print('Logistic Regression classifier accuracy on training set: {:.4}%'.format(model.score(X_train, Y_train)*100))
print('Logistic Regression classifier accuracy on test set: {:.4}%'.format(model.score(X_test, Y_test)*100))

train X shape: (17587, 37)
test X shape: (4397, 37)
Y_train shape: (17587,)
Y_test shape: (4397,)
Logistic Regression classifier accuracy on training set: 74.86%
Logistic Regression classifier accuracy on test set: 75.37%


Our classifier based on Logistics Regression is giving almost 76% accuracy on test data set. We try another ML model like Random Forest which gives better results when many features are involved in the data.

## Random Forest

In [7]:
'''Random Forest'''

# getting train and test dataset
X_train, X_test, Y_train, Y_test = training_testing_set(floodlight_data)

'''random forest
   hyperparamers: 1) n_estimators; 2) max_depth
'''
forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
forest.fit(X_train, Y_train)
forest_predict = forest.predict(X_test)

# printing accuracy of train and test dataset
print('Random Forest classifier training accuracy: {:.4}%'.format(forest.score(X_train, Y_train)*100))
print('Random Forest classifier testing accuracy: {:.4}%'.format(forest.score(X_test, Y_test)*100))

train X shape: (17587, 37)
test X shape: (4397, 37)
Y_train shape: (17587,)
Y_test shape: (4397,)
Random Forest classifier training accuracy: 75.39%
Random Forest classifier testing accuracy: 75.16%


**Random Forest is giving almost same result as Logistics Regression**


Random forest has two hyperparameter *n_estimator* and *max_depth*. Lets try to tune these two hyperparameters and see if we can improve the accuracy of our random forest classifier.


### Hyperparameter tuning of random forest

*This will take some time to get best parameter for random forest. So, have some patience :)*

In [131]:
# This will give us tuned hyperparameters for best accuracy of random forest classifier

# in param_grid, you can enter range of 'n_estimators' and 'max_depth'
# n_estimators = no. of trees in the forest
# max_depth = max. number of levels in each decision tree
# max_features = number of features considered for best splitting of a node
# criterion = The function to measure the quality of a split. “gini” is for Gini impurity and “entropy” is for information gain.
# cv = number of folds to use for cross validation
param_grid = { 
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [10, 11, 12, 13, 14, 15, 20, 25, 30],
    'criterion' :['gini', 'entropy']
}

grid_forest = GridSearchCV(estimator=forest, param_grid=param_grid, cv= 5)
grid_forest.fit(X_train, Y_train)

grid_forest.best_params_

{'criterion': 'entropy',
 'max_depth': 30,
 'max_features': 'auto',
 'n_estimators': 200}

Let's try these tuned hyperparameters for our Random Forest. *max_depth ( max number of levels in each decision tree)* was only mentioned up to 30 for hypertuning. So, we could try more *max_depth* and see if it gets better result than 30 with parameters we got above.     

### Tuned Random Forest

In [20]:
'''Tuned random forest hyperparamers: 
   1) n_estimators = 200; 
   2) max_depth = 60
   3) max_features='auto
   4) criterion='entropy'
'''
# getting train and test dataset
#X_train, X_test, Y_train, Y_test = training_testing_set(floodlight_data)

# max_depth = 60 worked slighly better for test set than max_depth = 30
tuned_forest = RandomForestClassifier(max_features='auto', n_estimators=200, max_depth=60, \
                                      criterion='entropy', random_state=42)
tuned_forest.fit(X_train, Y_train)
tuned_forest_predict = tuned_forest.predict(X_test)

# printing accuracy of train and test dataset
print('Accuracy of Tuned Random Forest classifier for training set: {:.4}%'.format(tuned_forest.score(X_train, Y_train)*100))
print('Accuracy of Tuned Random Forest classifier for test set: {:.4}%'.format(tuned_forest.score(X_test, Y_test)*100))

Accuracy of Tuned Random Forest classifier for training set: 99.99%
Accuracy of Tuned Random Forest classifier for test set: 87.4%


**With our tuned hyperparameters, Random Forest is performing very well compared to Logistics Regression. We almost got 100% accuracy for training dataset. And 87.4% accuracy for testing dataset.**

## Random Forest is the Winner!