# 1.  Analyse ML Python

<img src='https://github.com/retkowsky/images/blob/master/AzureMLservicebanniere.png?raw=true'>

## 1. Informations

In [1]:
# Record the Python interpreter version this notebook was executed with.
import sys
sys.version

'3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31) \n[GCC 7.3.0]'

In [2]:
# Print the current timestamp so the run can be dated.
import datetime
now = datetime.datetime.now()
print(now)

2020-03-12 09:42:51.761505


In [3]:
import pandas as pd
import numpy as np
import pickle
import os

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

## 2. Chargement des données

Framingham Heart study dataset

https://www.kaggle.com/amanajmera1/framingham-heart-study-dataset

Attributes/columns:

- male: 0 = Female; 1 = Male
- age: Age at exam time
- education: 1 = Some High School; 2 = High School or GED; 3 = Some College or Vocational School; 4 = college
- currentSmoker: 0 = nonsmoker; 1 = smoker
- cigsPerDay: number of cigarettes smoked per day (estimated average)
- BPMeds: 0 = Not on Blood Pressure medications; 1 = Is on Blood Pressure medications
- prevalentStroke: 0 = No previous stroke; 1 = Previous stroke
- prevalentHyp: 0 = Not hypertensive; 1 = Hypertensive
- diabetes: 0 = No; 1 = Yes
- totChol in mg/dL
- sysBP in mmHg
- diaBP in mmHg
- BMI: Body Mass Index calculated as: Weight (kg) / Height(meter-squared)
- heartRate: Beats/Min (Ventricular)
- glucose in mg/dL

- TenYearCHD (target label): did the person develop heart disease (CHD) during the 10-year study period? 0 = No; 1 = Yes


In [4]:
# Start from a clean slate: drop any previously downloaded copy of the dataset
# so the download that follows writes a fresh 'framingham.csv'.
try:
    os.remove("framingham.csv")
except FileNotFoundError:
    pass  # no earlier copy, nothing to clean up

In [5]:
# Download the Framingham CSV from GitHub into the current working directory.
!wget "https://raw.githubusercontent.com/retkowsky/WorkshopMLOps/master/framingham.csv"

--2020-03-12 09:42:53--  https://raw.githubusercontent.com/retkowsky/WorkshopMLOps/master/framingham.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.36.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.36.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191803 (187K) [text/plain]
Saving to: ‘framingham.csv’


2020-03-12 09:42:53 (16.8 MB/s) - ‘framingham.csv’ saved [191803/191803]



In [6]:
# Load the downloaded CSV into a pandas DataFrame.
df = pd.read_csv('framingham.csv')

In [7]:
# List the column names to check them against the attribute description.
df.columns

Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [8]:
# Preview the first five rows.
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [9]:
# Summary statistics; a 'count' below the row total reveals columns with missing values.
df.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,4240.0,4240.0,4135.0,4240.0,4211.0,4187.0,4240.0,4240.0,4240.0,4190.0,4240.0,4240.0,4221.0,4239.0,3852.0,4240.0
mean,0.429245,49.580189,1.979444,0.494104,9.005937,0.029615,0.005896,0.310613,0.025708,236.699523,132.354599,82.897759,25.800801,75.878981,81.963655,0.151887
std,0.495027,8.572942,1.019791,0.500024,11.922462,0.169544,0.076569,0.462799,0.15828,44.591284,22.0333,11.910394,4.07984,12.025348,23.954335,0.358953
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,90.0,28.04,83.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [10]:
# Pairwise correlations between all columns, including the TenYearCHD target.
df.corr()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
male,1.0,-0.029014,0.017415,0.197026,0.317143,-0.052504,-0.00455,0.005853,0.015693,-0.070413,-0.035879,0.058199,0.081871,-0.116932,0.005979,0.088374
age,-0.029014,1.0,-0.166356,-0.213662,-0.192959,0.123052,0.057679,0.306799,0.101314,0.262554,0.394053,0.205586,0.136096,-0.012843,0.122356,0.225408
education,0.017415,-0.166356,1.0,0.018528,0.008323,-0.010898,-0.035142,-0.081726,-0.038749,-0.023613,-0.129496,-0.061919,-0.137747,-0.054182,-0.035843,-0.054248
currentSmoker,0.197026,-0.213662,0.018528,1.0,0.769774,-0.048927,-0.03298,-0.10371,-0.044285,-0.046488,-0.130281,-0.107933,-0.167857,0.062686,-0.056726,0.019448
cigsPerDay,0.317143,-0.192959,0.008323,0.769774,1.0,-0.046155,-0.032711,-0.066645,-0.037089,-0.026479,-0.088797,-0.056715,-0.093293,0.075564,-0.058886,0.057755
BPMeds,-0.052504,0.123052,-0.010898,-0.048927,-0.046155,1.0,0.11737,0.261067,0.05206,0.080623,0.254194,0.194122,0.100702,0.01523,0.051197,0.087519
prevalentStroke,-0.00455,0.057679,-0.035142,-0.03298,-0.032711,0.11737,1.0,0.074791,0.006955,0.000105,0.057,0.045153,0.025909,-0.017674,0.01844,0.061823
prevalentHyp,0.005853,0.306799,-0.081726,-0.10371,-0.066645,0.261067,0.074791,1.0,0.077752,0.163632,0.696656,0.61584,0.301344,0.146815,0.086656,0.177458
diabetes,0.015693,0.101314,-0.038749,-0.044285,-0.037089,0.05206,0.006955,0.077752,1.0,0.040348,0.111265,0.05026,0.087068,0.048986,0.61763,0.097344
totChol,-0.070413,0.262554,-0.023613,-0.046488,-0.026479,0.080623,0.000105,0.163632,0.040348,1.0,0.208734,0.164698,0.115992,0.091127,0.046538,0.082369


In [11]:
# (rows, columns) of the raw dataset.
df.shape

(4240, 16)

Préparation des données

In [12]:
# For current smokers with a missing cigsPerDay, substitute the mean consumption
# observed among smokers. Rows of non-smokers are left untouched.
smoke = df['currentSmoker'].eq(1)
smoker_cigs = df.loc[smoke, 'cigsPerDay']
df.loc[smoke, 'cigsPerDay'] = smoker_cigs.fillna(smoker_cigs.mean())

In [13]:
# Impute the remaining missing values, one replacement value per column:
# a constant for the categorical-like columns, the column mean for the
# continuous measurements.
#
# The filled column is assigned back to the DataFrame on purpose.
# `df[col].fillna(..., inplace=True)` is chained assignment through an inplace
# method: it raises a FutureWarning on recent pandas and has no effect at all
# once Copy-on-Write is enabled, which would leave NaNs in the data and make
# the model fit fail.
fill_values = {
    'BPMeds': 0,                          # assume no blood-pressure medication
    'glucose': df['glucose'].mean(),
    'totChol': df['totChol'].mean(),
    'education': 1,                       # lowest education level
    'BMI': df['BMI'].mean(),
    'heartRate': df['heartRate'].mean(),
}
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

## 3. Modélisation Random Forest

In [14]:
# The last column of the frame is the label to forecast; every other column
# is used as a predictor.
target_column = df.columns[-1]
result = df[target_column]
features = df.drop(columns=target_column)

Partitionnement des données

In [15]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, result, test_size = 0.2, random_state = 14)

Random Forest

In [16]:
# Shallow forest (100 trees of depth 2) fitted on all features; its feature
# importances are what the feature-selection cells rely on.
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [17]:
# Create a selector object that uses the random forest classifier to keep only
# the features whose importance is at least 0.12 (the threshold is inclusive)
sfm = SelectFromModel(clf, threshold=0.12)

# Train the selector; with prefit=False it fits its own copy of clf on the training data
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=0.12)

In [18]:
# Names of all candidate features, in column order.
feat_labels = features.columns.tolist()

# Walk the selector's boolean mask and print the name of every retained feature.
for label, is_selected in zip(feat_labels, sfm.get_support()):
    if is_selected:
        print(label)

age
prevalentHyp
sysBP
glucose


In [19]:
# Importance of each feature according to the shallow forest, plus the spread
# of that importance across the individual trees.
importances = clf.feature_importances_
per_tree_importances = np.vstack([tree.feature_importances_ for tree in clf.estimators_])
std = per_tree_importances.std(axis=0)

# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for rank, feature_index in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_index, importances[feature_index]))

Feature ranking:
1. feature 1 (0.242214)
2. feature 10 (0.200736)
3. feature 14 (0.152858)
4. feature 7 (0.139117)
5. feature 11 (0.105004)
6. feature 0 (0.034898)
7. feature 12 (0.034277)
8. feature 4 (0.022181)
9. feature 5 (0.016595)
10. feature 8 (0.015724)
11. feature 9 (0.014800)
12. feature 13 (0.009745)
13. feature 2 (0.007321)
14. feature 6 (0.004530)
15. feature 3 (0.000000)


In [20]:
# Keep only the important (selected) features; X_important_train.shape[1] gives their count.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [21]:
# Final model: a large forest trained on the selected features only, using all CPU cores.
# NOTE(review): 10,000 trees of unbounded depth make the exported pickle very large
# (about 860 MB in the recorded run) — consider fewer trees or a max_depth.
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

## 4. Métriques du modèle

In [22]:
def _print_section(title):
    """Print a section title framed by two separator lines."""
    separator = "============================"
    print(separator)
    print(title)
    print(separator)


# Hard class predictions of the final model on the held-out test set.
predictions_y_4 = clf_important.predict(X_important_test)

_print_section("Classification Report")
print(classification_report(y_test, predictions_y_4))
print("")

_print_section("Confusion Matrix")
print(confusion_matrix(y_test, predictions_y_4))
print("")

# Area under the ROC curve, computed from the predicted probability of the
# second class (column 1 of predict_proba).
_print_section("ROC")
prob_y_4 = clf_important.predict_proba(X_important_test)
prob_y_4 = list(prob_y_4[:, 1])
print(roc_auc_score(y_test, prob_y_4))
print("")

# The accuracy is left as the last expression so the notebook displays it.
_print_section("Accuracy score")
accuracy_score(y_test, predictions_y_4)

Classification Report
              precision    recall  f1-score   support

           0       0.86      0.95      0.91       724
           1       0.27      0.10      0.14       124

   micro avg       0.83      0.83      0.83       848
   macro avg       0.56      0.53      0.52       848
weighted avg       0.77      0.83      0.79       848


Confusion Matrix
[[691  33]
 [112  12]]

ROC
0.6730306540723578

Accuracy score


0.8290094339622641

## 5. Export du modèle de ML

In [23]:
# Create the export directory; exist_ok=True makes the cell safe to re-run.
os.makedirs('./outputs', exist_ok=True)

In [24]:
# Path of the pickled model inside the export directory.
filename = './outputs/modele.pkl'

In [25]:
# Serialize the trained model to disk. The context manager guarantees the file
# handle is flushed and closed (even if pickling raises), so the file is
# complete before anything reads it back.
with open(filename, 'wb') as model_file:
    pickle.dump(clf_important, model_file)
print("OK")

OK


In [27]:
# List the exported pickle with its size to confirm the export succeeded.
!ls ./outputs/*.pkl -l

-rwxrwxrwx 1 root root 861609483 Mar 12 09:43 ./outputs/modele.pkl


## 6. Tests du modèle

In [28]:
# Reload the exported model to check that it survives a save/load round trip.
# The context manager closes the file handle once the model is loaded.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [29]:
# Patient profile, in the order of the four features the model was trained on.
age, prevalentHyp, sysBP, glucose = 61, 1, 150, 103

# predict expects a 2-D input: one row per patient.
sample = [[age, prevalentHyp, sysBP, glucose]]
prediction = loaded_model.predict(sample)
print("Résultat :", prediction)

Résultat : [1]


In [30]:
# Patient profile, in the order of the four features the model was trained on.
age, prevalentHyp, sysBP, glucose = 43, 1, 180, 99

# predict expects a 2-D input: one row per patient.
sample = [[age, prevalentHyp, sysBP, glucose]]
prediction = loaded_model.predict(sample)
print("Résultat :", prediction)

Résultat : [0]


In [31]:
# Patient profile, in the order of the four features the model was trained on.
age, prevalentHyp, sysBP, glucose = 63, 0, 138, 85

# predict expects a 2-D input: one row per patient.
sample = [[age, prevalentHyp, sysBP, glucose]]
prediction = loaded_model.predict(sample)
print("Résultat :", prediction)

Résultat : [1]


In [32]:
# Patient profile, in the order of the four features the model was trained on.
age, prevalentHyp, sysBP, glucose = 52, 1, 141, 75

# predict expects a 2-D input: one row per patient.
sample = [[age, prevalentHyp, sysBP, glucose]]
prediction = loaded_model.predict(sample)
print("Résultat :", prediction)

Résultat : [0]


In [33]:
# Score several patients in a single call: one row per patient, each row holding
# age, prevalentHyp, sysBP and glucose.
results = loaded_model.predict([[61, 1, 150, 103],[43, 1, 180, 99],[63,0,138,85]])
results

array([1, 0, 1])

In [34]:
import json

# Round-trip the inputs through JSON before scoring (presumably mirroring how a
# deployed scoring script would receive them — confirm against the service).
to_be_scored_json = {
    "data": [
        [61, 1, 150, 103],
        [43, 1, 180, 99],
        [63, 0, 138, 85],
    ]
}
input_data_json = json.dumps(to_be_scored_json)

# Decode the payload and turn the nested lists into a 2-D array for the model.
to_be_scored_list = np.array(json.loads(input_data_json)["data"])
predictions = loaded_model.predict(to_be_scored_list)

# Convert to a plain list so the output is JSON-serializable.
print(predictions.tolist())

[1, 0, 1]


> Fin