# Good and Fast! (BreakoutRoom #1)

This team can utilize all data and any model, but are limited to **only using 3 features** in their final model.

In [1]:
import pandas as pd
df = pd.read_csv('Diabetes_Data/diabetes_full_train.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 3 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               638 non-null    int64  
 1   Glucose                   638 non-null    int64  
 2   BloodPressure             638 non-null    int64  
 3   SkinThickness             638 non-null    int64  
 4   Insulin                   638 non-null    int64  
 5   BMI                       638 non-null    float64
 6   DiabetesPedigreeFunction  638 non-null    float64
 7   Age                       638 non-null    int64  
 8   Outcome                   638 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 49.8 KB


In [8]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [4]:
for col in df.columns:
    print(df[col].value_counts())

1     111
0      95
2      85
3      61
4      59
5      46
6      42
7      39
8      29
9      26
10     20
13      8
11      7
12      6
14      2
15      1
17      1
Name: Pregnancies, dtype: int64
99     15
129    12
95     12
112    12
111    12
       ..
172     1
169     1
160     1
159     1
199     1
Name: Glucose, Length: 132, dtype: int64
70     52
74     42
72     37
64     37
78     36
68     35
60     34
0      31
76     30
80     29
62     28
82     25
66     23
90     21
88     19
58     18
86     17
84     17
54     10
56     10
50      9
52      9
92      8
94      6
65      6
75      6
85      6
48      5
44      4
96      4
106     3
100     3
108     2
104     2
46      2
98      2
30      2
61      1
55      1
102     1
110     1
40      1
38      1
24      1
122     1
Name: BloodPressure, dtype: int64
0     194
32     27
30     22
27     19
18     18
23     18
33     17
28     16
31     16
22     15
19     14
39     14
15     13
26     13
29     13
36     13
35 

In [7]:
# Do your magic!

X = df.drop(columns = 'Outcome')
y = df['Outcome']

#### Logistic Regression + Pipeline

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [15]:
numeric_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)]

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())]) # penalty = 'l2', class_weight = 'balanced'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model traing score: %.3f" % clf.score(X_train, y_train))
print("model test score: %.3f" % clf.score(X_test, y_test))

SyntaxError: invalid syntax (<ipython-input-15-714202ad26ff>, line 10)

#### Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, plot_roc_curve

In [17]:
clf = RandomForestClassifier(n_estimators=100,
                             criterion='gini',
                             max_features='auto',
                             oob_score=True)

In [21]:
X = df[['Glucose','Pregnancies','BMI']]
y = df['Outcome']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [33]:
scaler = StandardScaler()
X_train_ss = scaler.fit_transform(X_train)
X_test_ss = scaler.transform(X_test)

In [34]:
clf.fit(X_train_ss, y_train)
print(clf.score(X_train_ss, y_train))
print(clf.score(X_test_ss, y_test))

0.9980392156862745
0.7265625


In [37]:
train_pred = clf.predict_proba(X_train_ss)
test_pred = clf.predict_proba(X_test_ss)
train_score = roc_auc_score(y_train, train_pred[:,1])
test_score = roc_auc_score(y_test, test_pred[:,1])

print(f'Train ROC-AUC score:{train_score}')
print(f'Test ROC-AUC score:{test_score}')

Train ROC-AUC score:0.9999914477285167
Test ROC-AUC score:0.7700410396716825


#### Grid Search + Decision Tree

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
dt = DecisionTreeClassifier()
grid_params = {
    'max_depth': [3,4,5,6,7],
    'max_features':['auto','sqrt', 'log2'],
    'class_weight':[None, 'balanced'],
    'criterion':['gini','entropy']
}

grid_search = GridSearchCV(dt, grid_params, cv=4, scoring='roc_auc')
grid_search.fit(X_train, y_train)

grid_search.best_estimator_ # best_params_

dt_grid = grid_search.best_estimator_

In [3]:
# Then use your model to predict the outcomes of the holdout_df
holdout_df = pd.read_csv('Diabetes_data/holdout_df.csv')

In [14]:
# And store those outcomes in the 'Outcome' column of this submission_df 
submission_df = pd.read_csv('Diabetes_Data/submission_df.csv')