In [1]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [4]:
# Read the csv file from the Resources folder into a Pandas DataFrame

data = Path("Diabetes_Prediction_Bio.csv")
df = pd.read_csv(data)
df.head()

Unnamed: 0,Diabetes_011,NoDocbcCost,Sex,Age,Education
0,0,0,0,9,4
1,0,1,0,7,6
2,0,1,0,9,4
3,0,0,0,11,3
4,0,0,0,11,5


In [5]:
# Split the data into X (features) and y (target)
# The y variable should focus on the target column
y = df['Diabetes_011']

# The X variable should include all features except the target
X = df.drop(columns=['Diabetes_011'])

In [6]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

In [8]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [9]:
# Import the model for sklearn confusion matrix
from sklearn.metrics import confusion_matrix

# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[158037    258]
 [ 29070    133]]


In [10]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[52542    92]
 [ 9819    47]]


In [11]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.84      1.00      0.92    158295
           1       0.34      0.00      0.01     29203

    accuracy                           0.84    187498
   macro avg       0.59      0.50      0.46    187498
weighted avg       0.77      0.84      0.77    187498



In [12]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.84      1.00      0.91     52634
           1       0.34      0.00      0.01      9866

    accuracy                           0.84     62500
   macro avg       0.59      0.50      0.46     62500
weighted avg       0.76      0.84      0.77     62500

