## Package Imports

In [68]:
import pandas as pd
import numpy as np
%matplotlib inline

In [69]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import warnings
# !pip uninstall ydata-profiling
# import ydata_profiling as pp


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


## Data Import

In [70]:
df = pd.read_csv('ml_data_drop.csv')

In [71]:
# pp.ProfileReport(df)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6388 entries, 0 to 6387
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Patient_ID  6388 non-null   float64
 1   Age         6388 non-null   float64
 2   SBP         6388 non-null   int64  
 3   BMI         6388 non-null   float64
 4   LDL         6388 non-null   float64
 5   HDL         6388 non-null   float64
 6   TG          6388 non-null   float64
 7   FBS         6388 non-null   float64
 8   Diabetes    6388 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 449.3 KB


In [73]:
df.shape

(6388, 9)

### Feature Selection:

In [74]:
column_names = df.columns
print(column_names)

Index(['Patient_ID', 'Age', 'SBP', 'BMI', 'LDL', 'HDL', 'TG', 'FBS',
       'Diabetes'],
      dtype='object')


## Classification Model Creation and training 

In [75]:
data = df

In [76]:
X = data.iloc[:,0:8]  #independent columns
y = data.iloc[:,-1] 

In [77]:
X.shape

(6388, 8)

In [78]:
y.shape

(6388,)

## DecisionTreeClassifier

In [79]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Decision Tree classifier
dt_model = DecisionTreeClassifier()

# Train the Decision Tree model on the training data
dt_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = dt_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.7769953051643192
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.79      0.80       703
           1       0.75      0.77      0.76       575

    accuracy                           0.78      1278
   macro avg       0.77      0.78      0.78      1278
weighted avg       0.78      0.78      0.78      1278

Confusion Matrix:
[[553 150]
 [135 440]]


## RandomForestClassifier

In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Random Forest classifier
rf_model = RandomForestClassifier()

# ... Your code to fit the RandomForestClassifier ...

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Make predictions with the RandomForestClassifier

# Train the Random Forest model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8286384976525821
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       703
           1       0.85      0.75      0.80       575

    accuracy                           0.83      1278
   macro avg       0.83      0.82      0.82      1278
weighted avg       0.83      0.83      0.83      1278

Confusion Matrix:
[[629  74]
 [145 430]]


## Crosss validation

In [81]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [82]:
# Create the Decision Tree model
dt_model = DecisionTreeClassifier()
dt_scores = cross_val_score(dt_model, X, y, cv=2)

# Create the Random Forest model
rf_model = RandomForestClassifier()
rf_scores = cross_val_score(rf_model, X, y, cv=2)


In [83]:
print("Mean Decision Tree Cross-Validation Score:", dt_scores.mean())
print("Mean Random Forest Cross-Validation Score:", rf_scores.mean())

print("Standard Deviation Decision Tree Cross-Validation Score:", dt_scores.std())
print("Standard Deviation Random Forest Cross-Validation Score:", rf_scores.std())


Mean Decision Tree Cross-Validation Score: 0.7514088916718847
Mean Random Forest Cross-Validation Score: 0.819818409517846
Standard Deviation Decision Tree Cross-Validation Score: 0.0
Standard Deviation Random Forest Cross-Validation Score: 0.0032874139010645176


## SVM

In [84]:
data = df

In [85]:
X = data.iloc[:,0:8]  #independent columns
y = data.iloc[:,-1] 

In [86]:
X.shape

(6388, 8)

In [87]:
y.shape

(6388,)

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# # Load your dataset (replace 'data.csv' with your dataset file)
# data = pd.read_csv('data.csv')

# # Assuming you have features in X and target variable in y
# X = data.drop('target', axis=1)
# y = data['target']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the SVM classifier with different options
# Example options: kernel='linear', C=1.0, gamma='scale', class_weight='balanced'
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', class_weight='balanced')

# svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', class_weight='balanced')


# C=1.0, kernel='rbf',

# Train the SVM model on the training data
svm_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.4757433489827856


## Create Pickle Model

In [89]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Random Forest classifier
rf_model = RandomForestClassifier()

# ... Your code to fit the RandomForestClassifier ...

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Make predictions with the RandomForestClassifier

# Train the Random Forest model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8278560250391236
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       703
           1       0.85      0.75      0.80       575

    accuracy                           0.83      1278
   macro avg       0.83      0.82      0.82      1278
weighted avg       0.83      0.83      0.83      1278

Confusion Matrix:
[[625  78]
 [142 433]]


#### Save the model to a pickle file using joblib

In [90]:
import pickle
from joblib import dump, load

In [91]:
dump(rf_model, 'diab_pickle.joblib')

['diab_pickle.joblib']

In [92]:
rf_model = load('diab_pickle.joblib') 

#### Save the model to a pickle file using joblib

In [93]:
import pickle
from joblib import dump, load

In [94]:
dump(rf_model, 'diab_pickle.pkl')

['diab_pickle.pkl']

In [95]:
rf_model = load('diab_pickle.pkl') 

## Prediction

In [32]:
# diab_pickle.pkl

In [99]:
import pandas as pd
import joblib

# Load the trained model from a pickle file
loaded_model = joblib.load('diab_pickle.pkl')

# Assuming you have features in X_train and target in y_train
features = data.drop('Diabetes', axis=1)  # Features used during training

# Get the specific row (e.g., row at index 0) as input data
specific_row_index = 2041
specific_row_data = features.iloc[specific_row_index]  # A single row as a Series

# Convert the row data to a 2D array (needed for prediction)
input_data = specific_row_data.values.reshape(1, -1)

# Make predictions using the loaded model
prediction = loaded_model.predict(input_data)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

print(specific_row_data)

    # If statement to check the age
if prediction == 0:
    print("Patient will not have diabetes.")
else:
    print("Patient will have diabetes.")

# print("Prediction:", prediction)


Patient_ID    1.001000e+15
Age           4.000000e+01
SBP           1.210000e+02
BMI           2.480000e+01
LDL           2.600000e+00
HDL           1.500000e+00
TG            8.000000e-01
FBS           5.100000e+00
Name: 2041, dtype: float64
Patient will not have diabetes.




In [138]:
import pandas as pd
import joblib

# Load the trained model from a pickle file
loaded_model = joblib.load('diab_pickle.pkl')

# Assuming you have features in X_train and target in y_train
X_train = df.drop('Diabetes', axis=1)  # Features used during training

# Get the specific row (e.g., row at index 0) as input data
specific_row_index = 2041
specific_row_data = X_train.iloc[specific_row_index]  # A single row as a Series

# Convert the row data to a 2D array (needed for prediction)
input_data = specific_row_data.values.reshape(1, -1)

# Make predictions using the loaded model
prediction = loaded_model.predict(input_data)

print("Prediction:", prediction)


Prediction: [0]


