### train_model.py (5 points) 
When this is called using `python train_model.py` on the command line, it will take in the training dataset CSV, perform the necessary data cleaning and imputation, and fit a classification model to the dependent variable Y. There must be data-check steps and clear commenting for each step inside the .py file. The output of running this file is the random forest model saved as a .pkl file in the local directory. Remember that the thought process and the reasons for choosing the final model must be clearly documented in this section.

In [1]:
# The following libraries will be used to answer this question.
import pandas as pd
import pickle
import os
from os import path

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# The libraries emit notices about upcoming API changes. Only those categories
# are silenced, so genuine problems (data conversion, convergence, user
# warnings, ...) remain visible instead of being hidden by a blanket "ignore".
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# A function to fit the classifier
def fit_model(X, y):
    """Fit an imputation + random-forest pipeline on a 70:30 train/test split.

    The held-out 30% of the data is written to ``X_test.csv`` and
    ``y_test.csv`` in the current working directory.

    Args:
        X: Feature table. After splitting, the test part must support
            ``to_csv`` (e.g. a pandas DataFrame).
        y: Target values aligned with ``X`` (e.g. a pandas Series).

    Returns:
        The fitted ``Pipeline`` (imputer followed by random forest).

    Raises:
        Exception: Any error raised while building the pipeline, splitting,
            saving the test split or fitting is re-raised unchanged after a
            short hint has been printed.
    """
    try:
        # Any NaN still present in X is replaced by the most frequent value
        # of its column (axis=0).
        # NOTE(review): `Imputer` was removed in scikit-learn 0.22; newer
        # versions need `sklearn.impute.SimpleImputer(strategy='most_frequent')`
        # together with a matching change to the file-level import.
        imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

        # The assignment asks for a random forest; default hyperparameters are used.
        rfc = RandomForestClassifier()

        # Chain imputation and the classifier so both are fitted (and later
        # pickled) as a single object.
        pipeline = Pipeline([('imputation', imputer), ('random_forest', rfc)])

        # 70:30 split with a fixed seed so the saved test set is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=104)
        X_test.to_csv('X_test.csv')
        y_test.to_csv('y_test.csv')

        # Finally, fit the pipeline on the training part only.
        return pipeline.fit(X_train, y_train)

    except Exception:
        # Print the hint, then re-raise: previously execution fell through to
        # `return(model)` with `model` unbound, which replaced the real error
        # with an unrelated UnboundLocalError.
        print("Check pipeline setup, fit or csv file save.")
        raise

In [3]:
# The following function cleans the dataset.
def clean_data(df):
    """Turn the raw passenger table into a model-ready feature table.

    Steps:
      1. Data check: every raw column this function relies on must exist.
      2. One-hot encode ``Sex``, ``Embarked`` and ``Pclass``; one level of
         each (``'female'``, ``'C'`` and ``1``) is dropped as the baseline.
      3. Derive ``is_alone`` (1 when ``SibSp + Parch + 1 == 1``, else 0).
      4. Fill missing ``Age`` values with the mean age.
      5. Drop identifiers, free-text columns, ``Cabin``, and the raw columns
         that have been replaced by derived features.

    Args:
        df: Raw data containing at least ``PassengerId``, ``Name``, ``Ticket``,
            ``Cabin``, ``SibSp``, ``Parch``, ``Age``, ``Sex``, ``Embarked``
            and ``Pclass``. Any other column is passed through untouched.

    Returns:
        A new DataFrame; the caller's frame is not modified.

    Raises:
        KeyError: If a required column is missing, or if a baseline dummy
            level (``'female'``, ``'C'``, ``1``) does not occur in the data.
    """
    dummy_vars = ['Sex', 'Embarked', 'Pclass']
    drop_vars = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'female', 'C', 1]
    required = dummy_vars + ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age']

    # Data check: fail early with one clear message instead of an error from
    # somewhere in the middle of the transformations.
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError("Input data is missing required column(s): {}".format(missing))

    # Sex, Passenger Class and Embarked are categorical, so they are dummified.
    # `join` returns a new frame, so from here on the caller's data is safe.
    for col in dummy_vars:
        df = df.join(pd.get_dummies(df[col]))

    # Family size (siblings/spouses + parents/children + the passenger) is only
    # needed to flag passengers travelling alone, so it is kept as a local
    # Series rather than added to the frame and dropped again.
    family_size = df['SibSp'] + df['Parch'] + 1
    df['is_alone'] = (family_size == 1).astype('int64')

    # Age has several missing values; fill them with the mean age. Assigning
    # the result back (instead of chained `inplace=True`) also works when
    # pandas Copy-on-Write is active.
    df['Age'] = df['Age'].fillna(df['Age'].mean())

    # Drop the raw categorical columns, the baseline dummy levels and the
    # columns that are not used as features.
    return df.drop(dummy_vars + drop_vars, axis=1)

In [4]:
# The following function reads the data and runs the two functions above and saves the model object.
def train_model(x, model_path='model.pkl'):
    """Read ``<x>.csv``, clean it, fit the model and pickle it to disk.

    Args:
        x: Path of the training CSV without the ``.csv`` extension
            (e.g. ``'train'`` reads ``train.csv``).
        model_path: Where the pickled model is written. Defaults to
            ``'model.pkl'`` in the current working directory.

    Raises:
        KeyError: If the CSV has no ``Survived`` column to use as the target.
    """
    # Read the raw data.
    df = pd.read_csv(x + ".csv")

    # Data check: the dependent variable must be present before any work is done.
    if 'Survived' not in df.columns:
        raise KeyError("Training data must contain the target column 'Survived'.")

    clean_df = clean_data(df)

    # Separate the dependent variable from the features.
    y = clean_df['Survived']
    X = clean_df.drop('Survived', axis=1)

    # Fit the pipeline on the cleaned data.
    model = fit_model(X, y)

    # Save the model object locally; `with` closes the file even if pickling fails.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Check that the object is saved locally.
    print("Model Pickle Created: " + str(path.exists(model_path)))

In [5]:
# Train and save the model when this file is run directly (`python train_model.py`)
# or executed as a notebook cell, but not when it is merely imported.
if __name__ == "__main__":
    train_model('train')

Model Pickle Created: True
