In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Notebook-wide pandas display settings, applied in one call (set_option
# accepts alternating option-name / value pairs):
#   - show every column instead of eliding the middle ones,
#   - render floats with three decimals,
#   - allow 500 characters per line before wrapping.
pd.set_option(
    'display.max_columns', None,
    'display.float_format', lambda value: '%.3f' % value,
    'display.width', 500,
)

In [36]:
# Load the raw CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("./diabetes_data.csv")
# Work on a copy so `data` keeps the rows exactly as they were loaded.
df = data.copy()


In [37]:
# Preview the first rows to check the column names and how the values are encoded.
df.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
def grab_col_names(dataframe, cat_th=15, car_th=20):
    """
    Split the columns of a dataframe into categorical, numerical and
    categorical-looking but cardinal (high-cardinality) groups.

    A column is "text-like" when its dtype is object, a pandas string dtype or
    a pandas categorical dtype. Every other dtype is "numeric-like".
    Note: numeric-like columns with few unique values are reported as categorical.

    Parameters
    ------
        dataframe: dataframe
                Dataframe whose columns are classified
        cat_th: int, optional
                A numeric-like column with fewer than cat_th unique values
                is reported as categorical
        car_th: int, optional
                A text-like column with more than car_th unique values
                is reported as cardinal

    Returns
    ------
        cat_cols: list
                Text-like columns that are not cardinal, followed by the
                numeric-like columns with fewer than cat_th unique values
        num_cols: list
                Numeric-like columns with at least cat_th unique values
        cat_but_car: list
                Text-like columns with more than car_th unique values

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))


    Notes
    ------
        Every column lands in exactly one of the three returned lists, so
        len(cat_cols) + len(num_cols) + len(cat_but_car) == number of variables.
        Unique values are counted with Series.nunique(), which by default
        does not count missing values.

    """

    cat_cols = []      # text-like, low cardinality
    num_but_cat = []   # numeric-like, but few distinct values
    num_cols = []      # numeric-like, many distinct values
    cat_but_car = []   # text-like, high cardinality

    for col in dataframe.columns:
        series = dataframe[col]
        dtype = series.dtype
        # Count distinct values once per column; both thresholds reuse it.
        unique_count = series.nunique()

        # Checking only for object dtype sends 'category' and 'string' columns
        # down the numeric branch, so they are recognised explicitly here.
        is_text_like = (
            dtype == "O"
            or isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        )

        if is_text_like:
            if unique_count > car_th:
                cat_but_car.append(col)
            else:
                cat_cols.append(col)
        elif unique_count < cat_th:
            num_but_cat.append(col)
        else:
            num_cols.append(col)

    # Text-like columns come first, then the numeric-but-categorical ones.
    return cat_cols + num_but_cat, num_cols, cat_but_car

In [39]:
# Upper-case every column label (e.g. 'Diabetes' -> 'DIABETES'); the cells
# that follow refer to columns by their upper-case names.
df.columns = [col.upper() for col in df.columns]

In [40]:
# Confirm that the column labels are now upper-case.
df.head()

Unnamed: 0,AGE,SEX,HIGHCHOL,CHOLCHECK,BMI,SMOKER,HEARTDISEASEORATTACK,PHYSACTIVITY,FRUITS,VEGGIES,HVYALCOHOLCONSUMP,GENHLTH,MENTHLTH,PHYSHLTH,DIFFWALK,STROKE,HIGHBP,DIABETES
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# NOTE(review): repeats the preceding preview with no change to `df` in
# between — candidate for removal.
df.head()

Unnamed: 0,AGE,SEX,HIGHCHOL,CHOLCHECK,BMI,SMOKER,HEARTDISEASEORATTACK,PHYSACTIVITY,FRUITS,VEGGIES,HVYALCOHOLCONSUMP,GENHLTH,MENTHLTH,PHYSHLTH,DIFFWALK,STROKE,HIGHBP,DIABETES
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
from sklearn.pipeline import Pipeline

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
# The target is loaded as float (0.0 / 1.0 in the preview); cast it to
# integer class labels before modelling.
df['DIABETES'] = df['DIABETES'].astype(int)

In [50]:
# Sum of the labels; presumably the target is 0/1, which would make this the
# number of positive cases — verify against the data dictionary.
df['DIABETES'].sum()

np.int64(35346)

In [51]:
# pop() returns the target column AND removes it from `df` in place, so this
# cell raises KeyError if it is run a second time without rebuilding `df`.
y = df.pop('DIABETES')

In [52]:
# Display the target Series (length, dtype and a few values).
y

0        0
1        0
2        0
3        0
4        0
        ..
70687    1
70688    1
70689    1
70690    1
70691    1
Name: DIABETES, Length: 70692, dtype: int64

In [54]:
# `X` is just another name for `df` (no copy is made): any later in-place
# change to `df` is also visible through `X`.
X = df

In [55]:
# Fix the seed so the same rows land in train/test on every run; without it
# each kernel restart draws a different random split and the recorded scores
# cannot be reproduced.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

In [63]:
from sklearn.preprocessing import StandardScaler

# Unfitted scaler; it is used as the 'Scaling' step when the pipeline is assembled.
sd = StandardScaler()

In [64]:
from sklearn.linear_model import LogisticRegression

# Classifier with library-default hyper-parameters; used as the final 'lr' pipeline step.
lr = LogisticRegression()

In [65]:
# Chain scaling and classification into one estimator so both steps are
# fitted, applied and saved together.
pipeline = Pipeline([('Scaling', sd), ('lr', lr)])

In [66]:
# Fit every pipeline step on the training split only.
pipeline.fit(X_train, y_train)

In [67]:
# Score on the held-out split (for a scikit-learn classifier, score() is mean accuracy).
pipeline.score(X_test, y_test)

0.7475810558479036

In [68]:
# Same metric on the training split, for comparison with the held-out score.
pipeline.score(X_train, y_train)

0.7471661102623588

In [69]:
import joblib

In [70]:
# Persist the fitted pipeline (all steps in one file) to the working directory.
joblib.dump(pipeline, 'pipeline.joblib')

['pipeline.joblib']

In [71]:
# Reload the saved file to check the round trip.
# NOTE: joblib.load unpickles its input — only load files from a trusted source.
p = joblib.load('pipeline.joblib')

In [72]:
# The reloaded pipeline should give the same held-out score as the in-memory one.
p.score(X_test, y_test)

0.7475810558479036