In [1]:
import datetime
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump, load
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Set the seaborn theme ("ticks" style, "pastel" palette) for subsequent plots.
sns.set_theme(style="ticks", palette="pastel")

def remove_outliers(df, col: str):
    """Return a copy of ``df`` without the rows whose ``col`` value is an IQR outlier.

    A value is kept when it lies inside the inclusive range
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where Q1 and Q3 are the 25th and 75th
    percentiles of ``df[col]`` and ``IQR = Q3 - Q1``.

    Missing values are ignored when the quartiles are computed. Rows whose
    ``col`` value is missing are not inside the range and are therefore dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the numeric column used to detect outliers.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only the rows inside the range.
    """
    values = df[col]
    # Series.quantile skips NaN. np.quantile returns NaN as soon as one value is
    # missing, which made both limits NaN and filtered out every row.
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    l_lmt = q1 - 1.5 * iqr
    u_lmt = q3 + 1.5 * iqr
    # between() is inclusive on both ends and evaluates to False for NaN.
    return df[values.between(l_lmt, u_lmt)].copy()
    
# Read the sample dataset.
# The location can be overridden with the DIABETES_CSV environment variable so the
# notebook is not tied to a single machine; the default is the original path.
DATA_PATH = Path(os.environ.get(
    'DIABETES_CSV',
    '/Users/xiezhuoying/Desktop/NUS/DSSI/Day_2/dssi-py-main/diabetes.csv',
))
raw_df = pd.read_csv(DATA_PATH)
raw_df.head()

Unnamed: 0,sn,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,6,148.0,72.0,35,0,33.6,0.627,50,1
1,2,1,85.0,66.0,29,0,26.6,0.351,31,0
2,3,8,183.0,64.0,0,0,23.3,0.672,32,1
3,4,1,89.0,66.0,23,94,28.1,0.167,21,0
4,5,0,137.0,40.0,35,168,43.1,2.288,33,1


In [2]:
# Count the missing values in each column
raw_df.isna().sum()

sn                           0
Pregnancies                  0
Glucose                      5
BloodPressure               35
SkinThickness                0
Insulin                      0
BMI                         11
DiabetesPedigreeFunction     0
Age                          0
Outcome                      0
dtype: int64

In [3]:
# Impute the missing Glucose, BloodPressure and BMI values with the column mean.
# NOTE(review): the means are computed on the full dataset, so rows that later
# end up in the test split also contribute to the imputed values.
glucose_avg = raw_df['Glucose'].mean()
blood_pressure_avg = raw_df['BloodPressure'].mean()
bmi_avg = raw_df['BMI'].mean()

# Assign the filled columns back explicitly. Calling fillna(..., inplace=True) on
# raw_df[col] is chained assignment: it raises a FutureWarning on pandas 2.x and
# leaves raw_df unchanged once Copy-on-Write is enabled.
raw_df['Glucose'] = raw_df['Glucose'].fillna(glucose_avg)
raw_df['BloodPressure'] = raw_df['BloodPressure'].fillna(blood_pressure_avg)
raw_df['BMI'] = raw_df['BMI'].fillna(bmi_avg)

print(raw_df.isnull().sum())

sn                          0
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [4]:
# Summary statistics for every column of the dataset.
raw_df.describe(include='all')

Unnamed: 0,sn,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,384.5,3.845052,121.686763,72.405184,20.536458,79.799479,32.457464,0.471876,33.240885,0.348958
std,221.846794,3.369578,30.435949,12.096346,15.952218,115.244002,6.875151,0.331329,11.760232,0.476951
min,1.0,0.0,44.0,24.0,0.0,0.0,18.2,0.078,21.0,0.0
25%,192.75,1.0,99.75,64.0,0.0,0.0,27.5,0.24375,24.0,0.0
50%,384.5,3.0,117.0,72.202592,23.0,30.5,32.4,0.3725,29.0,0.0
75%,576.25,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,768.0,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Predictor columns passed to the model.
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
# Target column to predict.
label = 'Outcome'

In [8]:
# Hold out 20% of the rows for testing; random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    raw_df[features],
    raw_df[label],
    test_size=0.2,
    random_state=0,
)
X_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
603,7,150.0,78.0,29,126,35.2,0.692,54
118,4,97.0,60.0,23,0,28.2,0.443,22
247,0,165.0,90.0,33,680,52.3,0.427,23
157,1,109.0,56.0,21,135,25.2,0.833,23
468,8,120.0,72.405184,0,0,30.0,0.183,38


In [11]:
# Fit a random forest classifier on the training split.
# n_estimators is the number of trees in the forest; random_state fixes the seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
rf_model.fit(X_train, y_train)

# Predict on the held-out test split and report accuracy plus per-class metrics.
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7987012987012987
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       107
           1       0.68      0.64      0.66        47

    accuracy                           0.80       154
   macro avg       0.76      0.75      0.76       154
weighted avg       0.80      0.80      0.80       154

