In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings("ignore")
import json
import pickle

## Model Experimentation

In [2]:
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv'

In [3]:
mydata = pd.read_csv(url)

In [4]:
mydata.shape

(891, 12)

In [None]:
.25 = 0 
M<anpreet surviaval prob = .75
feature importance 
shap

In [5]:
mydata.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Features to be Used (Keeping them less for simplicity) :
* Pclass
* Sex
* Age
* SibSp
* Parch
* Fare
*Embarked

In [6]:
input_feat_list = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Survived']

In [8]:
mydata[input_feat_list].head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1


In [9]:
mydummies = pd.get_dummies(mydata[['Sex','Embarked']])

In [10]:
mydata = pd.concat([mydata[input_feat_list],mydummies],axis=1)

In [11]:
mydata.drop(columns=['Sex','Embarked'],inplace=True)

In [12]:
mydata.head(5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,1,0,1,0,0
2,3,26.0,0,0,7.925,1,1,0,0,0,1
3,1,35.0,1,0,53.1,1,1,0,0,0,1
4,3,35.0,0,0,8.05,0,0,1,0,0,1


In [13]:
mydata.isna().any()

Pclass        False
Age            True
SibSp         False
Parch         False
Fare          False
Survived      False
Sex_female    False
Sex_male      False
Embarked_C    False
Embarked_Q    False
Embarked_S    False
dtype: bool

In [14]:
mydata['Age'].fillna(mydata[mydata['Age'].notnull()]['Age'].median(),inplace=True)

In [15]:
mydata.isna().any()

Pclass        False
Age           False
SibSp         False
Parch         False
Fare          False
Survived      False
Sex_female    False
Sex_male      False
Embarked_C    False
Embarked_Q    False
Embarked_S    False
dtype: bool

In [16]:
train, test = mydata[['Pclass','Age','SibSp','Parch','Fare','Sex_female','Sex_male','Embarked_C','Embarked_Q','Embarked_S']], mydata['Survived']


In [17]:
x_train,x_test,y_train,y_test = train_test_split(train,test, test_size=.2,stratify=test)

In [18]:
x_train.shape, x_test.shape

((712, 10), (179, 10))

In [19]:
x_train.isna().any()

Pclass        False
Age           False
SibSp         False
Parch         False
Fare          False
Sex_female    False
Sex_male      False
Embarked_C    False
Embarked_Q    False
Embarked_S    False
dtype: bool

In [20]:
name_zero_var = []
[name_zero_var.append(name) for name in x_train.columns if (round(np.var(x_train.loc[:,name]),2)==0)]
feature_set = list(set(x_train.columns) - set(name_zero_var))

In [21]:
name_zero_var

[]

In [22]:
mycorr = x_train.corr(method='pearson').abs()
upper = mycorr.where(np.triu(np.ones(mycorr.shape),k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column]>.95)]                 
feature_set = list(set(feature_set) - set(to_drop))

In [24]:
to_drop

['Sex_male']

In [23]:
feature_set

['Sex_female',
 'Embarked_S',
 'Age',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Pclass',
 'SibSp']

In [25]:
rf = RandomForestClassifier(n_estimators=1000,max_depth=10, 
                            max_features='sqrt', n_jobs=-1,
                            class_weight='balanced',
                            criterion='entropy', random_state=1)

In [26]:
rf_model=rf.fit(x_train[feature_set],y_train)

In [27]:
y_pred = rf_model.predict_proba(x_test[feature_set])[:,1]

In [28]:
roc_auc_score(y_test,y_pred)

0.8605401844532279

In [29]:
gbm = GradientBoostingClassifier(n_estimators=300,max_depth=4,
                                 learning_rate=.1,random_state=100)

In [30]:
gbm_model = gbm.fit(x_train[feature_set],y_train)

In [31]:
y_pred = gbm_model.predict_proba(x_test[feature_set])[:,1]

In [32]:
roc_auc_score(y_test,y_pred)

0.841699604743083

## Create a Custom Pipeline 

In [36]:
class DataPrep(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        return None;
        
        
    def transform(self,data,y=None):
        
        ## Handle the Null
        data['Age'].fillna(data[data['Age'].notnull()]['Age'].median(),inplace=True)
                
        return data
        
    def fit(self,data,y=None):
        return self
        

In [37]:
mypipe = Pipeline([ ('Prepare the Data',DataPrep()), 
                   ('Random Forest Model',rf_model) 
                  ],verbose=True)

### Save the pipeline, model and columns

In [40]:
pickle.dump(mypipe,open('mypipe_survival.pkl',"wb"))

In [101]:
pickle.dump(rf,open('rf_survival.pkl',"wb"))

In [91]:
model_columns = list(x_train[feature_set].columns)
model_columns

['Sex_female',
 'Embarked_S',
 'Age',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Pclass',
 'SibSp']

In [71]:
pickle.dump(model_columns, open('model_survival_columns.pkl',"wb"))

## Checking pipeline working 

In [12]:
rf = pickle.load(open('rf_survival.pkl','rb'))
model_columns = pickle.load(open('model_survival_columns.pkl','rb'))


In [4]:
check_json = {
    
            'Pclass' :3,
            'Sex':"female",
            'Age':28,
            'SibSp':0,
            'Parch':1,
            'Fare':54.23,
            'Embarked' :"Q"
}

In [5]:
pd.DataFrame(check_json,index=[0])

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,female,28,0,1,54.23,Q


In [6]:
check_json_2 = {
            'Pclass' :[0,1],
            'Sex':["female","male"],
            'Age':[12,45],
            'SibSp':[1,1],
            'Parch':[0,1],
            'Fare':[0,0],
            'Embarked' :["S","Q"]
}

In [7]:
pd.DataFrame(check_json_2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,female,12,1,0,0,S
1,1,male,45,1,1,0,Q


In [9]:
query = pd.get_dummies(pd.DataFrame(check_json_2))

In [10]:
query

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_Q,Embarked_S
0,0,12,1,0,0,1,0,0,1
1,1,45,1,1,0,0,1,1,0


In [13]:
## Load the columns that were used to train the model and 
## put 0 in other columns whose values are not present in query (in the previous step)

query = query.reindex(columns=model_columns, fill_value=0) 


In [14]:
## Now query is read to be consumed by the pipeline.

query

Unnamed: 0,Sex_female,Embarked_S,Age,Parch,Fare,Embarked_C,Embarked_Q,Pclass,SibSp
0,1,1,12,0,0,0,0,0,1
1,0,0,45,1,0,0,1,1,1


In [16]:
rf.predict_proba(query)[:,1]

array([0.76677792, 0.2631243 ])

In [17]:
type(rf.predict_proba(query)[:,1])

numpy.ndarray

In [18]:
from flask import jsonify

In [24]:
result = (rf.predict_proba(query)[:,1])

In [28]:
result

array([0.76677792, 0.2631243 ])

### IMPORTANT NOTE

* I can also do these steps in DataPrep class of my piepline. I would have to pass the model_columns as a variable in 'Prepare the Data',DataPrep() part of the pipleine as 'Prepare the Data',DataPrep(model_columns)

* So, class DataPrep would like :

    class DataPrep(BaseEstimator, TransformerMixin):
    
        def __init__(self,model_columns):
            self.model_columns = model_columns
            return None;
        
        
        def transform(self,data,y=None):
            data = pd.get_dummies(pd.DataFrame(data,index=[0]))
            data = data.reindex(columns=self.model_columns, fill_value=0) 

            data['Age'].fillna(data[data['Age'].notnull()]['Age'].median(),inplace=True)
            
            return data
        