# Good and Fast! (BreakoutRoom #1)

This team can utilize all data and any model, but are limited to **only using 3 features** in their final model.

In [65]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc


df = pd.read_csv('Diabetes_Data/diabetes_full_train.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1


In [66]:
# I made something for y'all

def eval_classification(model, model_name,
                        X_tr, X_te, y_tr, y_te,
                        to_print=False):
    '''
    Finds predictions for train and test sets, then
    prints metrics for classification nicely

    Inputs:
    model : already-fit sklearn model
    model_name : string, name for index for output df
    X_tr : training X (can be scaled, that's fine)
    X_te : testing X
    y_tr : training target
    y_te : testing target
    to_print : boolean, will print output nicely if True

    Outputs:
    metric_df - pandas Dataframe showing output
    '''
    
    metrics = {"Accuracy": accuracy_score,
               "Recall": recall_score,
               "Precision": precision_score,
               "F1-Score": f1_score,
               "ROC-AUC": roc_auc_score}

    y_pred_tr = model.predict(X_tr)
    y_pred_te = model.predict(X_te)

    # Defining the column names based on the metric dict keys
    col_list = []  # Starting a list
    for name in metrics.keys():
        col_list.append(f"{name.lower()}_train")
        col_list.append(f"{name.lower()}_test")

    metric_df = pd.DataFrame(columns=col_list)

    for name, metric_function in metrics.items():
        tr_col = f"{name.lower()}_train"
        metric_df.at[model_name, tr_col] = metric_function(y_tr, y_pred_tr)
        te_col = f"{name.lower()}_test"
        metric_df.at[model_name, te_col] = metric_function(y_te, y_pred_te)
        
        
        # Adding to-print option to print the metrics nicely
        if to_print:
            print(f"{name}:"); print("="*len(name))
            print(f"TRAIN: {metric_function(y_tr, y_pred_tr):.4f}")
            print(f"TEST: {metric_function(y_te, y_pred_te):.4f}")
            print("*" * 15)
    
    return metric_df

In [36]:
y = df.Outcome

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Outcome']),
                                                            y, 
                                                            test_size=.33, 
                                                            random_state=42, 
                                                            stratify=y)


In [37]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [38]:
lr = LogisticRegression()
lr_lasso = LogisticRegression(penalty='l2', class_weight='balanced')
model = lr_lasso.fit(X_train_scaled, y_train)

In [40]:
from sklearn.metrics import roc_auc_score

train_pred = model.predict_proba(X_train_scaled)
test_pred = model.predict_proba(X_test_scaled)
train_score = roc_auc_score(y_train, train_pred[:,1])
test_score = roc_auc_score(y_test, test_pred[:,1])

print(f'Train ROC-AUC score:{train_score}')
print(f'Test ROC-AUC score:{test_score}')

Train ROC-AUC score:0.8491807287845439
Test ROC-AUC score:0.8326338928856916


In [42]:
# Check Coefficients
coef_dict = {}
for coef, feat in zip(list(model.coef_[0]),X_train.columns.tolist()):
    coef_dict[feat] = coef
    
# Convert to dataframe so you can sort it
coef_df = pd.DataFrame.from_dict(coef_dict, columns =['Coef'], orient='index')
coef_df.sort_values(by=['Coef'], ascending = False)


Unnamed: 0,Coef
Glucose,1.297736
Pregnancies,0.543117
BMI,0.431969
DiabetesPedigreeFunction,0.251678
Age,0.183171
SkinThickness,0.175259
Insulin,-0.306614
BloodPressure,-0.328374


In [45]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(df[['Glucose', 'Pregnancies', 'BMI']],y, 
                                                            test_size=.33, 
                                                            random_state=42, 
                                                            stratify=y)

In [46]:
X_train_new_scaled = scaler.fit_transform(X_train_new)
X_test_new_scaled = scaler.transform(X_test_new)

In [48]:
lr = LogisticRegression(class_weight='balanced')
model_new = lr.fit(X_train_new_scaled, y_train)

In [51]:
train_pred_new = model_new.predict_proba(X_train_new_scaled)
test_pred_new = model_new.predict_proba(X_test_new_scaled)
train_score_new = roc_auc_score(y_train, train_pred_new[:,1])
test_score_new = roc_auc_score(y_test, test_pred_new[:,1])

print(f'Train ROC-AUC score:{train_score_new}')
print(f'Test ROC-AUC score:{test_score_new}')

Train ROC-AUC score:0.8354487649792125
Test ROC-AUC score:0.8314348521183054


In [72]:
eval_classification(model, '8 features', X_train_scaled, X_test_scaled, y_train, y_test)

Unnamed: 0,accuracy_train,accuracy_test,recall_train,recall_test,precision_train,precision_test,f1-score_train,f1-score_test,roc-auc_train,roc-auc_test
8 features,0.770492,0.748815,0.751724,0.694444,0.637427,0.617284,0.689873,0.653595,0.765933,0.735711


In [73]:
eval_classification(model_new, '3 features', X_train_new_scaled, X_test_new_scaled, y_train, y_test)

Unnamed: 0,accuracy_train,accuracy_test,recall_train,recall_test,precision_train,precision_test,f1-score_train,f1-score_test,roc-auc_train,roc-auc_test
3 features,0.758782,0.748815,0.724138,0.652778,0.625,0.626667,0.670927,0.639456,0.750367,0.725669


In [74]:
# Then use your model to predict the outcomes of the holdout_df
holdout_df = pd.read_csv('Diabetes_data/holdout_df.csv')

In [78]:
X = holdout_df[['BMI', 'Pregnancies', 'Glucose']]

In [79]:
X_scaled = scaler.transform(X)

In [80]:
y_predict = model_new.predict(X_scaled)

In [87]:
pd.DataFrame(y_predict)

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,0
...,...
125,1
126,0
127,1
128,1


In [100]:
# And store those outcomes in the 'Outcome' column of this submission_df 
submission_df = pd.read_csv('Diabetes_Data/submission_df.csv', index_col='Unnamed: 0')

In [101]:
submission_df

Unnamed: 0,Outcome
540,
307,
745,
691,
564,
...,...
309,
467,
755,
305,


In [102]:
submission_df['Outcome'] = y_predict

In [103]:
submission_df

Unnamed: 0,Outcome
540,1
307,1
745,1
691,1
564,0
...,...
309,1
467,0
755,1
305,1
