## Part III: Machine Learning Model Training

In [1]:
#import libraries
import pandas as pd

from sklearn.model_selection import train_test_split 
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
#prepare our independent and dependent variables
df = pd.read_csv("S://Stackup Dev//Data Science- Diabetes Prediction//diabetes_data_clean.csv")

X = df.drop('class', axis=1)
y = df['class']

In [3]:
#class
y

0      1
1      1
2      1
3      1
4      1
      ..
515    1
516    1
517    1
518    0
519    0
Name: class, Length: 520, dtype: int64

In [4]:
#split data into train and test

X_train, X_test, y_train, y_test = train_test_split(
                                    X, y, test_size=0.24, stratify = y )

In [5]:
#begin our model training
#start with DummyClassifier to establish baseline

dummy = DummyClassifier()
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

In [6]:
confusion_matrix(y_test, dummy_pred)

array([[ 0, 48],
       [ 0, 77]], dtype=int64)

In [7]:
print(classification_report(y_test, dummy_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        48
           1       0.62      1.00      0.76        77

    accuracy                           0.62       125
   macro avg       0.31      0.50      0.38       125
weighted avg       0.38      0.62      0.47       125



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
#start with Logistic Regression
logr = LogisticRegression(max_iter=10000)
logr.fit(X_train, y_train)
logr_pred = logr.predict(X_test)

In [9]:
confusion_matrix(y_test, logr_pred)

array([[46,  2],
       [ 6, 71]], dtype=int64)

In [10]:
print(classification_report(y_test, logr_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92        48
           1       0.97      0.92      0.95        77

    accuracy                           0.94       125
   macro avg       0.93      0.94      0.93       125
weighted avg       0.94      0.94      0.94       125



In [11]:
#try DecisionTree
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

In [12]:
confusion_matrix(y_test, tree_pred)

array([[44,  4],
       [ 6, 71]], dtype=int64)

In [13]:
print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90        48
           1       0.95      0.92      0.93        77

    accuracy                           0.92       125
   macro avg       0.91      0.92      0.92       125
weighted avg       0.92      0.92      0.92       125



In [14]:
#Random Forest

forest = RandomForestClassifier()
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

In [15]:
confusion_matrix(y_test, forest_pred)

array([[47,  1],
       [ 3, 74]], dtype=int64)

In [16]:
print(classification_report(y_test, forest_pred))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96        48
           1       0.99      0.96      0.97        77

    accuracy                           0.97       125
   macro avg       0.96      0.97      0.97       125
weighted avg       0.97      0.97      0.97       125



In [17]:
forest.feature_importances_

array([0.10129679, 0.10451728, 0.23416667, 0.17834348, 0.06335345,
       0.02119805, 0.02984481, 0.01797706, 0.03471583, 0.03017926,
       0.02467735, 0.03237081, 0.04130914, 0.02427586, 0.04100834,
       0.02076583])

In [20]:
X.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [21]:
pd.DataFrame({'feature': X.columns, 'importance': forest.feature_importances_}
            ).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
2,polyuria,0.234167
3,polydipsia,0.178343
1,ismale,0.104517
0,age,0.101297
4,sudden weight loss,0.063353
12,partial paresis,0.041309
14,alopecia,0.041008
8,visual blurring,0.034716
11,delayed healing,0.032371
9,itching,0.030179
