# The dataset and use case
- The dataset covers a number of basic tasks performed by MS patients and control participants, published through an open data platform. All of the data is collected through the iOS Floodlight app
- Build a classifier for accurately differentiating the MS from the control patients



# Load the dataset and setup environment

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
## check the current working directory (displayed as the cell output)
os.getcwd()

In [None]:
### change to the input directory and list its contents
### NOTE: os.chdir changes the working directory for every later cell, so the
### relative path passed to pd.read_csv afterwards resolves against /kaggle/input
os.chdir("/kaggle/input")
os.listdir()

In [None]:
# Load the raw export. The path is absolute so this cell works regardless of
# the current working directory (e.g. when cells are re-run out of order).
data = pd.read_csv(os.path.join("/kaggle/input", "complete_dataset.csv"))

### quick look of the dataset

In [None]:
# Preview the first rows of the raw table
data.head()

In [None]:
# (number of rows, number of columns) of the raw table
data.shape

In [None]:
# Column dtypes and non-null counts; used to spot sparsely populated columns
data.info()

Data quality assessment
- 3 columns cannot be used for building the model due to lack of data
- there is a large number of empty values, as many tests were not completed by the users
- overall, the data quality is not very high

# Cleaning and manipulating the dataset
-  drop columns with too many null values
-  participantCreatedOn,testStartedAt, testEndedAt, testResultMetricCreatedOn should be datetime data type 
-  participantBirthYear, participantWeightLbs, participantHeightCms, testResultMetricId should be integer
-  the code columns (testMetricCode, testCode) are redundant with the corresponding name columns (testMetricName, testName)

In [None]:
## Drop the unnamed column and the two timestamp columns, then check that it succeeded.
## `columns=` is used because pandas 2.0 no longer accepts the axis as a positional
## argument to DataFrame.drop (the old `drop(label, 1)` form raises TypeError there).
data = data.drop(
    columns=[
        "Unnamed: 19",
        "testResultMetricTimestamp1",
        "testResultMetricTimestamp2",
    ]
)
data.info()

In [None]:
# Summary statistics of the remaining columns
data.describe()

In [None]:
# Convert the four timestamp columns with pd.to_datetime.
timestamp_columns = (
    'participantCreatedOn',
    'testStartedAt',
    'testEndedAt',
    'testResultMetricCreatedOn',
)
data = data.assign(
    **{name: pd.to_datetime(data[name]) for name in timestamp_columns}
)

In [None]:
# Cast the birth year, weight, height and metric-id columns to int.
integer_columns = [
    "participantBirthYear",
    "participantWeightLbs",
    "participantHeightCms",
    "testResultMetricId",
]
data = data.astype({name: int for name in integer_columns})


In [None]:
# Re-check the dtypes after the datetime and integer conversions
data.info()

In [None]:
# Print the distinct values of each name/code column so they can be compared.
for label_column in ('testMetricCode', 'testMetricName', 'testName', 'testCode'):
    print(np.unique(data[label_column]))
### the name and code columns carry the same information

In [None]:
# The code columns duplicate the name columns, so only the names are kept.
redundant_code_columns = ['testCode', 'testMetricCode']
data = data.drop(columns=redundant_code_columns)

In [None]:
data['test_fullname'] = data['testName'] +"-"+ data["testMetricName"]
data = data.drop((["testName", "testMetricName"]), axis =1)

In [None]:
# Remaining column names after the clean-up steps
data.columns

In [None]:
# Count non-null participantCreatedOn entries per participant id and show the largest counts.
rows_per_participant = data.groupby("floodlightOpenId")["participantCreatedOn"].count()
rows_per_participant.sort_values(ascending=False).head()

# Further manipulating 
- establish a pivot table


In [None]:
# Count rows per (id, control flag, sex, birth year) profile and keep the top
# rows returned by head() after sorting by count, largest first.
profile_keys = ['floodlightOpenId', 'participantIsControl',
                'participantSex', 'participantBirthYear']
profile_counts = data.groupby(profile_keys).size().reset_index(name='count')
grouped_data = profile_counts.sort_values('count', ascending=False).head()

In [None]:
import re
# Split the column names by prefix: names starting with "part" vs. names
# starting with "test". (str.startswith would give the same result.)
reg_part = re.compile(r"^part")
reg_test = re.compile(r"^test")
part_data_cols = [name for name in data.columns if reg_part.search(name)]
test_data_cols = [name for name in data.columns if reg_test.search(name)]
for column_group in (part_data_cols, test_data_cols):
    print(column_group)

In [None]:
# One row per distinct (floodlightOpenId + participant columns) combination and
# one column per test_fullname; pivot_table's default aggregation (mean)
# collapses repeated measurements of the same metric.
participant_index = ['floodlightOpenId', *part_data_cols]
pivot_data = data.pivot_table(
    values=["testResultMetricValue"],
    index=participant_index,
    columns=['test_fullname'],
)
pivot_data.head()

In [None]:
# Move the pivot index levels (participant id and attributes) back into regular columns
measurement_data = pivot_data.reset_index()

In [None]:
# Dtypes and non-null counts of the pivoted table
measurement_data.info()

# Explore data

- Explore the MS distribution with regard to gender
- Explore the MS distribution with regard to region
- Explore the MS distribution with regard to patient birth year

In [None]:
# Keep only the MS patients. participantIsControl is True for the healthy
# control group, so the MS rows are the ones where the flag is False
# (the previous `== True` filter selected the controls instead).
# astype(bool) makes the `~` negation safe even if the column has object dtype.
is_control = measurement_data["participantIsControl"].astype(bool)
MS_data = measurement_data[~is_control]
# The flag is constant within this subset, so it is dropped.
MS_data = MS_data.drop((['participantIsControl']),axis=1)

In [None]:
# Bar chart of participant counts per sex within MS_data
MS_data["participantSex"].value_counts().plot(kind ="bar")
## Author's observation: within this group, females slightly outnumber males
## NOTE(review): re-check this observation against the rows MS_data actually contains

In [None]:
# Bar chart of participant counts per country of residence within MS_data
MS_data["participantCountryOfResidence"].value_counts().plot(kind = "bar")

In [None]:
# Bar chart of participant counts per birth year within MS_data
# (value_counts orders the bars by frequency, not chronologically)
MS_data["participantBirthYear"].value_counts().plot(kind = "bar")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Relabel the control flag for the plot legend: True -> 'Healthy', False -> 'MS'.
# An explicit lookup is used instead of a truthiness test so the cell can be
# re-run safely: values that are already 'Healthy'/'MS' are left untouched
# (a truthiness test would turn every non-empty string into 'Healthy').
control_labels = {True: 'Healthy', False: 'MS'}
measurement_data['participantIsControl'] = measurement_data['participantIsControl'].map(
    lambda flag: control_labels.get(flag, flag)
)
## Use a pair plot to look at every participant column, coloured by group
sns.pairplot(data=measurement_data[part_data_cols], hue = "participantIsControl")

# Preprocessing
- fill NA values
- change participantIsControl and participantSex to 0 or 1

In [None]:
# Number of missing values per column
measurement_data.isnull().sum()

In [None]:
# Summary statistics of the pivoted test-metric columns
measurement_data["testResultMetricValue"].describe()

In [None]:
# Impute every test-metric column with that column's own mean.
metric_values = measurement_data["testResultMetricValue"]
measurement_data["testResultMetricValue"] = metric_values.fillna(metric_values.mean())
# Encode sex as a female indicator (1 for "female", 0 for anything else) and
# rename the column to say so.
is_female = measurement_data["participantSex"] == "female"
measurement_data["participantSex"] = is_female.astype("int64")
measurement_data.rename(columns = {"participantSex": "participantIsFemale"}, inplace= True)
# Encode the group label: "MS" -> 0, anything else -> 1.
is_not_ms = measurement_data["participantIsControl"] != "MS"
measurement_data["participantIsControl"] = is_not_ms.astype("int64")

##  Further feature engineering
- Scale data
- use label encoder

In [None]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:
# Preview the table after imputation and encoding
measurement_data.head()

In [None]:
## standardize the test-metric columns on a copy of the table
## (preprocessing.scale centres to zero mean and scales to unit variance by default)
## NOTE: this is fitted on all rows, i.e. before the train/test split further down
df = measurement_data.copy()
metric_block = df["testResultMetricValue"]
df["testResultMetricValue"] = preprocessing.scale(metric_block)

In [None]:
# Preview the standardized test-metric columns
df["testResultMetricValue"].head()

In [None]:
# Target vector: the encoded group label (0 = MS, 1 = control).
y_col = df["participantIsControl"]
# Feature matrix: every column from position 4 onwards. An explicit copy is
# taken because columns of df_X are overwritten later; assigning into a bare
# iloc slice triggers pandas' SettingWithCopyWarning and may not stick.
# NOTE(review): the slice is positional - confirm that it excludes the target
# column and the identifier columns for the actual column order of df.
df_X = df.iloc[:, 4:].copy()
df_X.head()

In [None]:
from sklearn import preprocessing
# Single LabelEncoder instance; it is re-fitted on each column it is applied to
le = preprocessing.LabelEncoder()
# Participant attribute columns that are label-encoded in the following step
part_cols = ["participantIsFemale", "participantBirthYear", "participantWeightLbs", "participantHeightCms"]


In [None]:
# Replace each participant column with the integer codes from LabelEncoder.fit_transform
# (DataFrame.apply calls it once per column, so the encoder is re-fitted each time).
# NOTE(review): for numeric columns this replaces the values by their rank among the
# sorted unique values - confirm that this is intended rather than plain scaling.
df_X[part_cols] = df_X[part_cols].apply(le.fit_transform)
df_X.head()

# Build the models with machine learning algorithms
- partition the data
- use logistic regression and random forest algorithms to build two classifier models

## Model evaluation
- use accuracy as evaluation metric
- confusion matrix

In [None]:
# Hold out 20% of the rows for testing; random_state=0 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(df_X, y_col, test_size=0.2, random_state=0)

In [None]:
# Fit a logistic regression with weak regularisation (C=1000) and report the
# accuracy on the held-out rows; y_pred is kept for the confusion matrix.
logreg = LogisticRegression(solver="liblinear", C=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print(f'Accuracy of logistic regression classifier on test set: {test_accuracy:.2f}')

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the current y_pred (rows: true labels, columns: predicted labels)
confusion_matrix(y_test, y_pred)

Accuracy = 0.78
False positive rate = 0.16

In [None]:
##now try with randomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest baseline with 200 trees. random_state is fixed (matching the
# seeded train/test split) so the accuracy and confusion matrix quoted in the
# notebook are reproducible from run to run.
clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(X_train, y_train)
# Overwrites the logistic-regression predictions stored in y_pred.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy = 0.74
False positive rate = 0.18

# Build model with a deep learning algorithm - neural network


In [None]:
# Convert the training labels into a single-column DataFrame
# NOTE: re-running this cell fails, because a DataFrame has no to_frame method
y_train = y_train.to_frame()

In [None]:
# Convert the test labels into a single-column DataFrame
y_test = y_test.to_frame()

In [None]:
# Number of feature columns; build_classifier uses this as the network's input dimension
X_train.shape[1]

The model contains 3 layers. The first two layers use ReLU as the activation function; the last layer uses a sigmoid function, as this is a binary classifier. The loss function is binary cross-entropy.

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential # initialize neural network library
from keras.layers import Dense # build our layers library
def build_classifier():
    """Return a compiled 8-4-1 dense network for binary classification.

    The input width is read from the global ``X_train``. The two hidden layers
    use ReLU, the single output unit uses a sigmoid, and the model is compiled
    with the Adam optimizer, binary cross-entropy loss and accuracy as metric.
    """
    n_features = X_train.shape[1]
    layers = [
        Dense(units=8, kernel_initializer='uniform', activation='relu', input_dim=n_features),
        Dense(units=4, kernel_initializer='uniform', activation='relu'),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network
# Wrap the Keras model as a scikit-learn estimator and run 3-fold
# cross-validation on the training split (100 epochs per fit).
classifier = KerasClassifier(build_fn = build_classifier, epochs = 100)
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 3)
mean = accuracies.mean()
# Standard deviation and variance are reported separately; previously the
# standard deviation was stored and printed under the name "variance".
std = accuracies.std()
variance = accuracies.var()
print("Accuracy mean: "+ str(mean))
print("Accuracy std: "+ str(std))
print("Accuracy variance: "+ str(variance))

The accuracy is 0.70, which is lower than that of the logistic regression and random forest models

# Conclusion
It turned out that in this case (small dataset, simple binary classification task), the deep learning algorithm does not outperform basic logistic regression.