In [1]:
#import dependencies
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
#load files from resource into dataframe
data = Path('Resource/diabetes_data.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
#categorize type 1 and type2 diabete into patient with diabete
df1 = df[(df["Diabetes"]==0)]
df2 = df[(df["Diabetes"]==1)]
df2

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
35346,9.0,0.0,1.0,1.0,30.0,1.0,1.0,0.0,1.0,1.0,0.0,5.0,30.0,30.0,1.0,0.0,1.0,1.0
35347,13.0,1.0,0.0,1.0,25.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0
35348,11.0,0.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,1.0
35349,7.0,1.0,0.0,1.0,23.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
35350,13.0,0.0,0.0,1.0,27.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,6.0,0.0,1.0,1.0,37.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0
70688,10.0,1.0,1.0,1.0,29.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0
70689,13.0,0.0,1.0,1.0,25.0,0.0,1.0,0.0,1.0,0.0,0.0,5.0,15.0,0.0,1.0,0.0,1.0,1.0
70690,11.0,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,1.0


In [6]:
#make even balance amount of data for patients without diabete to patients with diabete
no_to_drop = len(df1["Diabetes"])-len(df2["Diabetes"])
df1 = df1.drop(df1.index[:no_to_drop])
df1.reset_index(inplace=True,drop=True)
df1.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#concat both dataframe for patient with diabete and without diabete
df = pd.concat([df1,df2], axis=0, ignore_index= True)
df

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,6.0,0.0,1.0,1.0,37.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0
70688,10.0,1.0,1.0,1.0,29.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0
70689,13.0,0.0,1.0,1.0,25.0,0.0,1.0,0.0,1.0,0.0,0.0,5.0,15.0,0.0,1.0,0.0,1.0,1.0
70690,11.0,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,1.0


In [9]:
sorted_corr = df.corr()['Diabetes'].sort_values(ascending=False)
sorted_corr

Diabetes                1.000000
GenHlth                 0.407612
HighBP                  0.381516
BMI                     0.293373
HighChol                0.289213
Age                     0.278738
DiffWalk                0.272646
PhysHlth                0.213081
HeartDiseaseorAttack    0.211523
Stroke                  0.125427
CholCheck               0.115382
MentHlth                0.087029
Smoker                  0.085999
Sex                     0.044413
Fruits                 -0.054077
Veggies                -0.079293
HvyAlcoholConsump      -0.094853
PhysActivity           -0.158666
Name: Diabetes, dtype: float64

In [10]:
#scale the dataframe beside diabete column
df_prep = df.drop(columns=["Diabetes"])
diabete_scaled = StandardScaler().fit_transform(df_prep)

In [11]:
#turn scaled dataset into dataframe
df_scaled = pd.DataFrame(diabete_scaled)
df_scaled.columns = df_prep.columns
df_scaled.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP
0,-1.607237,1.090046,-1.052798,0.159276,-0.542176,-0.951711,-0.41647,0.649925,-1.255371,0.517485,-0.211251,0.146304,0.15302,2.404008,-0.581554,-0.257473,0.880201
1,1.197681,1.090046,0.94985,0.159276,-0.542176,1.05074,-0.41647,-1.538638,0.796577,-1.932424,-0.211251,0.146304,-0.460058,-0.577451,-0.581554,3.883895,0.880201
2,1.548296,1.090046,-1.052798,0.159276,-0.542176,-0.951711,-0.41647,0.649925,0.796577,0.517485,-0.211251,-1.649743,-0.460058,0.416369,-0.581554,-0.257473,-1.136104
3,0.847066,1.090046,0.94985,0.159276,-0.261036,1.05074,-0.41647,0.649925,0.796577,0.517485,-0.211251,0.146304,-0.460058,-0.279305,-0.581554,-0.257473,0.880201
4,-0.204778,-0.917392,-1.052798,0.159276,-0.120466,1.05074,-0.41647,0.649925,0.796577,0.517485,-0.211251,-0.751719,-0.460058,-0.577451,-0.581554,-0.257473,-1.136104


In [13]:
#prepare X and y dataset for the model
y = df["Diabetes"]
X = df_scaled
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(53019, 17)

In [14]:
# create logistic regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=1000,
                                random_state=1)

#fit the model
classifier.fit(X_train, y_train)

In [15]:
# create prediction
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(100)

Unnamed: 0,Prediction,Actual
0,0.0,0.0
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
4,0.0,0.0
...,...,...
95,0.0,0.0
96,1.0,1.0
97,0.0,0.0
98,0.0,0.0


In [16]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.7460533016465796

In [17]:
# Display the confusion matrix for the test dataset.
confusion_matrix(y_test, predictions)

array([[6421, 2415],
       [2073, 6764]], dtype=int64)

# Second

In [11]:
y = df['Diabetes']

# The X variable should include all features except the target
X = df.drop(columns=['Diabetes','MentHlth','Smoker','Sex','Fruits','Veggies','HvyAlcoholConsump','PhysActivity'])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

In [13]:
# create logistic regression model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=1000,
                                random_state=1)

#fit the model
classifier.fit(X_train, y_train)

In [14]:
# create prediction
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(100)

Unnamed: 0,Prediction,Actual
0,0.0,0.0
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
4,0.0,0.0
...,...,...
95,0.0,0.0
96,1.0,1.0
97,1.0,0.0
98,0.0,0.0


In [15]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.7442426300005658