## Create a notebook within which you identify a high-quality predictive model using the pre-processed data

In [192]:
# importing the required packages and modules
# data handling modules
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# model handling modules
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegressionCV
# metrix module for model evalution 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.svm import SVC
from sklearn.metrics import fbeta_score, make_scorer, recall_score,precision_score
import warnings
warnings.simplefilter("ignore")

np.random.seed(1)


## load pre processed data 

In [138]:
X_train = pd.read_csv('obes_train_X.csv')
y_train = pd.read_csv('obes_train_y.csv') 
X_test = pd.read_csv('obes_test_X.csv')
y_test = pd.read_csv('obes_test_y.csv') 

In [139]:
print("X_train shape",X_train.shape)
print("y_train shape",y_train.shape)
print("X_test shape",X_test.shape)
print("y_test shape",y_test.shape)

X_train shape (1477, 23)
y_train shape (1477, 1)
X_test shape (634, 23)
y_test shape (634, 1)


## create dataframe to store all metric value of models

In [140]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

## Logistic model

In [141]:
log_reg_model = LogisticRegressionCV(max_iter=2000,solver='newton-cg')
lrCVfit= log_reg_model.fit(X_train,np.ravel(y_train))
lr_y_pre=lrCVfit.predict(X_test)

In [142]:
# model_preds = dtRSCV.predict(X_test)
# print(len(model_preds))
c_matrix = confusion_matrix(y_test, lr_y_pre)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"LR", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## logistic regression CV with  Randomized Search & Grid Search 

In [171]:
%%time
# parameters for grid & random search Cv for Logistic
score_measure =   make_scorer(recall_score, average='micro')
kfolds = 3

param = {
    'solver'  : ['liblinear','newton-cg', 'lbfgs'],
}

# call Logistic regression 
LR=LogisticRegressionCV()

# grid Search CV call 
gridCV=GridSearchCV(estimator = LR, param_grid=param, cv=kfolds,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True,error_score='raise')
gridCV.fit(X_train,np.ravel( y_train))

# random Search CV call

randomCV=RandomizedSearchCV(estimator =LR, param_distributions=param, cv=kfolds,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True,error_score='raise')

randomCV.fit(X_train,np.ravel(y_train))

print("**************Logistic Grid Search CV********************")
print(f"The best {score_measure} score is {gridCV.best_score_}")
print(f"with parameters: {gridCV.best_params_}")

print(f"Best Estimators in grid cv :{gridCV.best_estimator_}")


print("**************Logistic Random Search CV********************")
print(f"The best {score_measure} score is {randomCV.best_score_}")
print(f"with parameters: {randomCV.best_params_}")

print(f"Best Estimators in grid cv :{randomCV.best_estimator_}")



Fitting 3 folds for each of 3 candidates, totalling 9 fits




Fitting 3 folds for each of 3 candidates, totalling 9 fits




**************Logistic Grid Search CV********************
The best make_scorer(recall_score, average=micro) score is 0.9485355959036265
with parameters: {'solver': 'newton-cg'}
Best Estimators in grid cv :LogisticRegressionCV(solver='newton-cg')
**************Logistic Random Search CV********************
The best make_scorer(recall_score, average=micro) score is 0.9485355959036265
with parameters: {'solver': 'newton-cg'}
Best Estimators in grid cv :LogisticRegressionCV(solver='newton-cg')
Wall time: 2min 43s




# Decision tree 

In [143]:
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)
y_pred_dtree=dtree.predict(X_test)

In [144]:
c_matrix = confusion_matrix(y_test, y_pred_dtree)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"dtree", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## Decision tree Randomized search

In [173]:
score_measure =   make_scorer(recall_score, average='micro')
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(30,36),  
    'min_samples_leaf': np.arange(6,12),
    #'min_impurity_decrease': np.arange(0.0048, 0.0054, 0.0001),
    #'max_leaf_nodes': np.arange(162,168), 
    'max_depth': np.arange(15,21), 
    'criterion': ['entropy'],
    
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True,error_score='raise')

dtRSCV= rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestRecallTree = rand_search.best_estimator_



Fitting 5 folds for each of 216 candidates, totalling 1080 fits
The best make_scorer(recall_score, average=micro) score is 0.9092693540998626
... with parameters: {'min_samples_split': 30, 'min_samples_leaf': 6, 'max_depth': 17, 'criterion': 'entropy'}


## Decision tree GridSearchCV

In [174]:
score_measure =   make_scorer(recall_score, average='micro')
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(30,36),  
    'min_samples_leaf': np.arange(6,12),
    #'min_impurity_decrease': np.arange(0.0048, 0.0054, 0.0001),
    #'max_leaf_nodes': np.arange(162,168), 
    'max_depth': np.arange(15,21), 
    'criterion': ['entropy'],
    
}
dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

dtGCV = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
The best make_scorer(recall_score, average=micro) score is 0.908593678424187
... with parameters: {'criterion': 'entropy', 'max_depth': 18, 'min_samples_leaf': 6, 'min_samples_split': 30}


# SVC model

In [145]:
svm_lin_model = SVC(kernel="linear")
svmfit= svm_lin_model.fit(X_train,y_train)
y_pred_SVC=svmfit.predict(X_test)

  y = column_or_1d(y, warn=True)


In [146]:
c_matrix = confusion_matrix(y_test, y_pred_SVC)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"SVC", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### SVC with different kernek in Gridsearch

In [179]:
%%time
param_grid = {'C': [0.1, 1, 10, 50], 
              'gamma': [1, 0.1, 0.01, 0.001],
              'degree':[1,2,3],
              'kernel': ['linear','rbf','poly'],
              'degree':[2,3,4],
             } 
score_measure =   make_scorer(recall_score, average='micro')
kfolds = 3

SVC_grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
SVC_grid.fit(X_train,np.ravel( y_train))

print(SVC_grid.best_params_)
print(SVC_grid.best_estimator_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END C=0.1, degree=2, gamma=1, kernel=linear;, score=0.848 total time=   0.0s
[CV 2/5] END C=0.1, degree=2, gamma=1, kernel=linear;, score=0.814 total time=   0.0s
[CV 3/5] END C=0.1, degree=2, gamma=1, kernel=linear;, score=0.817 total time=   0.0s
[CV 4/5] END C=0.1, degree=2, gamma=1, kernel=linear;, score=0.797 total time=   0.0s
[CV 5/5] END C=0.1, degree=2, gamma=1, kernel=linear;, score=0.824 total time=   0.0s
[CV 1/5] END C=0.1, degree=2, gamma=1, kernel=rbf;, score=0.240 total time=   0.0s
[CV 2/5] END C=0.1, degree=2, gamma=1, kernel=rbf;, score=0.236 total time=   0.0s
[CV 3/5] END C=0.1, degree=2, gamma=1, kernel=rbf;, score=0.234 total time=   0.0s
[CV 4/5] END C=0.1, degree=2, gamma=1, kernel=rbf;, score=0.231 total time=   0.0s
[CV 5/5] END C=0.1, degree=2, gamma=1, kernel=rbf;, score=0.247 total time=   0.0s
[CV 1/5] END C=0.1, degree=2, gamma=1, kernel=poly;, score=0.976 total time=   3.4s
[CV 2/5]

In [147]:
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,LR,0.921569,0.948718,0.902439,0.925
0,dtree,0.925466,0.912088,0.954023,0.932584
0,SVC,0.885714,0.907895,0.884615,0.896104


# Analysis
- One of the major Health  problems is Obesity on world as per WHO  
- The purpose the process is to dedicate the obesity and level of obesity. 
- For this purpose we selected UCI Oobesity data https://archive.ics.uci.edu/ml/datasets/Estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition+#
- We selected Logistic regression Classfication,SVC, dtree. 
- from performance table we can say that dtree is giving the best score based on recall .95 
- dtree i good model for this process
