In [1]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Read the csv file from the Resources folder into a Pandas DataFrame

data = Path("Diabetes_Prediction_Medical.csv")
df = pd.read_csv(data)
df.head()

Unnamed: 0,Diabetes_011,Stroke,HeartDiseaseorAttack,AnyHealthcare,GenHlth,MentHlth,PhysHlth
0,0,0,0,1,5,18,15
1,0,0,0,0,3,0,0
2,0,0,0,1,5,30,30
3,0,0,0,1,2,0,0
4,0,0,0,1,2,3,0


In [3]:
# Split the data into X (features) and y (target)
# The y variable should focus on the target column
y = df['Diabetes_011']

# The X variable should include all features except the target
X = df.drop(columns=['Diabetes_011'])

In [4]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

In [6]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [7]:
# Import the model for sklearn confusion matrix
from sklearn.metrics import confusion_matrix

# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[156127   2108]
 [ 27325   1938]]


In [8]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[52026   668]
 [ 9109   697]]


In [9]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.85      0.99      0.91    158235
           1       0.48      0.07      0.12     29263

    accuracy                           0.84    187498
   macro avg       0.67      0.53      0.52    187498
weighted avg       0.79      0.84      0.79    187498



In [10]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.85      0.99      0.91     52694
           1       0.51      0.07      0.12      9806

    accuracy                           0.84     62500
   macro avg       0.68      0.53      0.52     62500
weighted avg       0.80      0.84      0.79     62500

