## Introduction

The goal is to build a machine-learning model that predicts whether a given adult's yearly income is above or below $50K.

### 1. Load and inspect the data

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv("../data/income_data.csv")

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
# check Missing values in the dataset 
df.isnull().sum().sort_values(ascending = False)

In [None]:
round(df.isnull().sum().sort_values(ascending = False)/len(df)*100, 2)

In [None]:
# Missing values per column: absolute count and share of all rows (in percent),
# shown side by side with the most incomplete columns first.
total = df.isnull().sum().sort_values(ascending=False)
percent = round(total / len(df) * 100, 2)
pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

### 2. Visualize your data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Class balance of the target: one bar per income class.
fig, ax = plt.subplots()
ax = df.income_status.value_counts().plot.bar()
ax.set_title("Income status \n (0 < 50K, 1 >50K)")
ax.set_ylabel("Counts");

In [None]:
# Gender and income: number of adults per sex, split by income class.
ax = sns.countplot(x="sex",
                   hue="income_status",
                   data=df,
                   linewidth=2)
plt.title("Income status vs Sex \n (0 < 50K, 1 >50K)")
# Each row of this dataset is an adult, not a patient.
plt.ylabel("No of Adults")
# Trailing semicolon keeps the Legend repr out of the cell output.
plt.legend();

In [None]:
# Work class and income: number of adults per work class, split by income class.
ax = sns.countplot(x="workclass",
                   hue="income_status",
                   data=df,
                   linewidth=2)
plt.title("Income status vs workclass \n (0 < 50K, 1 >50K)")
# Each row of this dataset is an adult, not a patient.
plt.ylabel("No of Adults")
# Category names are long, so turn the tick labels vertical.
plt.xticks(rotation=90)
plt.legend();

In [None]:
# Marital status and income: number of adults per status, split by income class.
ax = sns.countplot(x="marital-status",
                   hue="income_status",
                   data=df,
                   linewidth=2)
plt.title("Income status vs marital-status \n (0 < 50K, 1 >50K)")
# Each row of this dataset is an adult, not a patient.
plt.ylabel("No of Adults")
# Category names are long, so turn the tick labels vertical.
plt.xticks(rotation=90)
plt.legend();

In [None]:
# Age distribution per income class, drawn as two overlaid kernel density estimates.
column = "age"
below_50k = df.loc[df['income_status'] == 0, column]
above_50k = df.loc[df['income_status'] == 1, column]
# NOTE(review): recent seaborn releases deprecate `shade` in favour of `fill`;
# switch once the minimum supported seaborn version is confirmed.
ax = sns.kdeplot(below_50k, color='gray', shade=True, label='$<50K$')
ax = sns.kdeplot(above_50k, color='g', shade=True, label='$>50K$')
plt.title('Age distribution:1 vs 0')
# A KDE curve shows an estimated density, not raw counts.
plt.ylabel("Density")
plt.xlabel("Years")
# Draw the legend explicitly so the two curves can be told apart.
plt.legend();

### 3. Build data-pipeline

In [None]:
## separate the target column from the feature columns
target_column = 'income_status'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Random 85% of the rows (fixed seed) form the training set; the rest is the test set.
X_train = X.sample(frac=0.85, random_state=200)
train_index = X_train.index
X_test = X.drop(train_index)
# Targets are selected by index label so they stay aligned with their feature rows.
y_train, y_test = y.loc[train_index], y.loc[X_test.index]

In [None]:
# Class balance of the held-out test target.
fig, ax = plt.subplots()
ax = y_test.value_counts().plot.bar()
ax.set_title("Test Income status \n (0 < 50K, 1 >50K)")
ax.set_ylabel("Counts");

In [None]:
## check that the training target has a class balance similar to the test target
fig, ax = plt.subplots()
ax = y_train.value_counts().plot.bar()
ax.set_title("Train Income status \n (0 < 50K, 1 >50K)")
ax.set_ylabel("Counts");

In [None]:
### data pipeline

In [None]:
# feature pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Columns routed to the categorical branch of the preprocessing pipeline.
# NOTE(review): "sex" is plotted earlier in this notebook but appears in neither
# list; the ColumnTransformer is built without a `remainder` argument, so the
# column is presumably dropped. Confirm the omission is intentional.
categ_column=["native-country", "workclass", "education", "marital-status", "race" ,
              "occupation", "relationship"]
# Columns routed to the numeric branch of the preprocessing pipeline.
numeric_column=["age", "education-num", 'fnlwgt', 'capital-gain', "capital-loss", "hours-per-week"]

In [None]:
def data_pipeline(numeric_feature, categorical_feature):
    """Build the unfitted preprocessing transformer placed in front of each model.

    Numeric columns are median-imputed and then standardised. Categorical
    columns are imputed with their most frequent value and then one-hot
    encoded, with ``handle_unknown='ignore'`` set on the encoder.

    Parameters
    ----------
    numeric_feature
        Names of the columns handled by the numeric branch.
    categorical_feature
        Names of the columns handled by the categorical branch.

    Returns
    -------
    ColumnTransformer
        Transformer with a ``'num'`` and a ``'cat'`` branch.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numeric_feature),
        ('cat', Pipeline(steps=categorical_steps), categorical_feature),
    ])

In [None]:
df_pipeline=data_pipeline(numeric_column, categ_column)

### 4. Build and train ML model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lg = LogisticRegression()

In [None]:
clf = Pipeline(steps=[('preprocessor', df_pipeline), ('classifier', lg)])

In [None]:
clf.fit(X_train, y_train)

### 5. Evaluate your model

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score

In [None]:
y_pred_train = clf.predict(X_train)
y_pred_test  = clf.predict(X_test)

In [None]:
# Confusion matrices of the predictions on the training and the test split.
cm_train = confusion_matrix(y_train.values, y_pred_train)
cm_test = confusion_matrix(y_test.values, y_pred_test)

# F1 scores on the same two splits; a large gap between them hints at overfitting.
f_1_tra = f1_score(y_true=y_train.values, y_pred=y_pred_train)
f_1_test = f1_score(y_true=y_test.values, y_pred=y_pred_test)

In [None]:
print(f"Train score:{f_1_tra}: Test score:{f_1_test}")

In [None]:
print(f"Train cm:\n {cm_train}")

In [None]:
print(f"Test cm:\n {cm_test}")

### 6. Cross validation and model selection

In [None]:
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from time import time

In [None]:
# Candidate classifiers to compare, keyed by the short name used in the log
# lines and as bar labels in the score plot.
# NOTE(review): no random_state is set on RF/GB/DT, so scores may differ between runs.
models = {"KNN": KNeighborsClassifier(),
          "LG":LogisticRegression(),
          "RF":RandomForestClassifier(n_estimators=100),
          "GB":GradientBoostingClassifier(n_estimators=100),
          "DT":DecisionTreeClassifier()
          }

In [None]:
def find_best_model(X, y, models, data_pipeline,  score='f1', cv=3):
    """Cross-validate every candidate model behind the same preprocessing step.

    Parameters
    ----------
    X, y
        Features and target used for cross-validation.
    models
        Mapping of display name to unfitted estimator.
    data_pipeline
        Preprocessing transformer placed before each estimator.
    score
        Scoring name handed to ``cross_val_score``.
    cv
        Number of stratified folds.

    Returns
    -------
    tuple of dict
        ``(mean_score, std_score, model_time)``, each keyed by model name;
        ``model_time`` holds the elapsed seconds of the cross-validation.
    """
    folds = model_selection.StratifiedKFold(n_splits=cv)
    mean_score, std_score, model_time = {}, {}, {}

    for model_name, model in models.items():
        print("fit {} model".format(model_name))
        candidate = Pipeline(steps=[('preprocessor', data_pipeline),
                                    ('classifier', model)])

        # Only the cross-validation itself is timed.
        started = time()
        fold_scores = model_selection.cross_val_score(candidate, X, y, cv=folds, scoring=score)
        elapsed = time() - started

        mean_score[model_name] = fold_scores.mean()
        std_score[model_name] = fold_scores.std()
        model_time[model_name] = elapsed

        print("{0}:score: {1:.3f}+-{2:.3f}:time {3:.3f}".format(
            model_name, mean_score[model_name], std_score[model_name], model_time[model_name]))

    return mean_score, std_score, model_time

In [None]:
mean_score, std_score, model_time  = find_best_model(X_train, y_train, models, df_pipeline)

In [None]:
def plotbar(plot_name, names, result, title, ylabel, threshold=0.65):
    """Draw a bar chart with one bar per model on the current matplotlib figure.

    Parameters
    ----------
    plot_name
        Not used by the function; kept so existing calls keep working.
    names
        Tick label for each bar.
    result
        Bar heights, in the same order as ``names``.
    title
        Title of the plot.
    ylabel
        Label of the y-axis.
    threshold
        Height of the red horizontal reference line (default 0.65, the value
        that used to be hard-coded). Pass ``None`` to draw no line.
    """
    position = np.arange(len(names))
    plt.bar(position, result, align='center', color='g')
    if threshold is not None:
        plt.axhline(threshold, color="r", lw=1)
    plt.xticks(position, names, rotation=90)
    plt.ylabel(ylabel)
    # The y-axis is fixed to [0, 1] so separate plots share the same scale.
    plt.ylim(0, 1)
    plt.title(title)

In [None]:
plotbar("cv_score", list(models.keys()), list(mean_score.values()), "F-1 Score Performance", "F-Score \n (Higher is better)")

### 7. Parameter search

In [None]:
# Hyper-parameter grids. Keys use the "<pipeline step>__<parameter>" form so the
# grid search can reach the estimator stored in the 'classifier' pipeline step.
lg_parameters = {"classifier__C": np.logspace(0, 4, 10),
                 "classifier__penalty": ['l1', 'l2']
}

# The loss is intentionally not part of the grid: "deviance" was the only value
# searched and is the classifier's default, and newer scikit-learn releases
# renamed it to "log_loss", so naming it explicitly breaks across versions.
gb_parameters = {
    "classifier__learning_rate": [0.01, 0.025, 0.075, 0.1],
    "classifier__max_depth": [3, 5, 8],
    "classifier__max_features": ["log2", "sqrt"],
    "classifier__n_estimators": [10, 100]
    }

parameters = {"LG": lg_parameters, "GB": gb_parameters}
models = {
          # liblinear supports both the 'l1' and the 'l2' penalty searched above;
          # the lbfgs default of newer scikit-learn releases rejects 'l1'.
          "LG": LogisticRegression(solver='liblinear'),
          "GB": GradientBoostingClassifier(),
          }

In [None]:
try:
    import joblib
except ImportError:
    # Older scikit-learn environments only ship the vendored copy.
    from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV

def parameters_search(X_train, y_train, X_test, y_test,  models, parameters, data_pipeline,  score='f1', cv=3):
    """Grid-search each model, score its best estimator and save it to disk.

    Parameters
    ----------
    X_train, y_train
        Data the grid search is fitted on.
    X_test, y_test
        Held-out data used only to score the best estimator.
    models
        Mapping of model name to unfitted estimator.
    parameters
        Mapping of model name to its parameter grid; must contain every key
        of ``models``.
    data_pipeline
        Preprocessing transformer placed before each estimator.
    score
        Scoring name used by the grid search to rank parameter combinations.
    cv
        Number of cross-validation folds used by the grid search.

    Returns
    -------
    tuple of dict
        ``(tra_score, test_score, best_parameters)``, each keyed by model
        name. The two score dicts hold the F1 score of the best estimator on
        the training and the test data.

    Side effects
    ------------
    Writes each best estimator to ``../models/<model name>.pkl``.
    """
    import os

    tra_score = {}
    test_score = {}
    best_parameters = {}

    # Create the output directory up front so a missing folder cannot discard
    # the result of a long grid search at the very last step.
    os.makedirs('../models', exist_ok=True)

    for model_name, model in models.items():
        print("fit {} model".format(model_name))
        clf = Pipeline(steps=[('preprocessor', data_pipeline),('classifier', model)])
        # Honour the caller's `score` instead of a hard-coded metric, so model
        # selection optimises the same quantity that is reported below.
        clfs = GridSearchCV(clf, parameters[model_name], cv=cv, scoring=score)
        clfs.fit(X_train, y_train)
        be = clfs.best_estimator_
        y_pred_train = be.predict(X_train)
        y_pred_test  = be.predict(X_test)
        tra_score[model_name] = f1_score(y_true=y_train.values, y_pred=y_pred_train)
        test_score[model_name] = f1_score(y_true=y_test.values, y_pred=y_pred_test)
        best_parameters[model_name] = clfs.best_params_
        print("{0}: Tra best score: {1:.3f}:Test best score {2:.3f}".format(model_name,tra_score[model_name],
                                            test_score[model_name]))
        joblib.dump(be, '../models/{}.pkl'.format(model_name))

    return tra_score, test_score, best_parameters



In [None]:
tra_score, test_score, best_parameters=parameters_search(X_train, y_train, X_test, y_test,  models, parameters, df_pipeline,  score='f1', cv=3)

best_parameters  

In [None]:
best_parameters["GB"]