In [1]:
import pandas as pd
import numpy as np
import os

from pandas.api.types import is_categorical_dtype, is_numeric_dtype
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score,f1_score

In [2]:
os.getcwd()

'C:\\MyInterest\\INSOFE_DataScience\\Datasets\\Titanic_Competition'

In [3]:
#USER DEFINED FUNCTIONS

#Function to read a file
def readfile(filename, filetype, header_idx):
    if(filetype=='csv'):
        data = pd.read_csv(filename, header=header_idx)
    return data

#Function to get the basic stats from the data
def describe_data(dataset):
    print("Dimensions \n")
    print(dataset.shape)
    print('\n\n')
    print("Column names\n")
    print(dataset.columns)
    print('\n\n')
    print("Data Types\n")
    print(dataset.dtypes)
    print('\n\n')
    print("Unique values in each level\n")
    for i in dataset.columns:
        print("{} - {}".format(i,len(dataset[i].unique())))
    print('\n\n')
    print("Summary \n")
    print(dataset.describe())
    print('\n\n')
    print("Top 5 rows\n")
    print(dataset.head())
    print('\n\n')
    
#Function to drop the not useful columns from the dataset
def drop_columns(dataset,col_list):
    dataset.drop(col_list,axis=1,inplace=True)
    return dataset
    
#Finding the count of missing values
def findNAs(dataset):
    print(dataset.isnull().sum())
    
#Function to convert the column datatypes
def datatype_transformer(col_list, coltype, dataset):
    for i in col_list:
        dataset[i] = dataset[i].astype(coltype)
    return dataset

#Function to splitting Numerical & Categorical Columns
def col_split(dataset):
    category_cols=list(dataset.select_dtypes('category').columns)
    numeric_cols=list(dataset.select_dtypes(['int64','float64']).columns)
    #category_cols.pop() #to remove the target_col
    return category_cols, numeric_cols

#Function to remove the target col from the list
def remove_target_frm_collist(cat_cols,num_cols,target_col,dataset):
    if(is_categorical_dtype(dataset[target_col])):
        for idx,col in enumerate(cat_cols):
            if(col==target_col):
                cat_cols.pop(idx)
    elif(is_numeric_dtype(dataset[target_col])):
        for idx,col in enumerate(num_cols):
            if(col==target_col):
                num_cols.pop(idx)
    

#Function to do Train:Validation Split
def train_validate_split(dataset,target_col,train_percentage):
    y = dataset[target_col]
    x = dataset.drop(target_col, axis=1)
    X_train, X_val, Y_train, Y_val = train_test_split(x, y, train_size=train_percentage)
    return X_train, X_val, Y_train, Y_val

#Function to create Transformers for Numerical & Categorical data
def create_transformers(num_imputestrategy,cat_imputestrategy,num_cols,cat_cols):
    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=num_imputestrategy)),
        ('scaler', StandardScaler())
    ])

    cat_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=cat_imputestrategy)),
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ])

    num_cat_combiner = ColumnTransformer(transformers=[
        ('num',num_transformer,num_cols),
        ('cat',cat_transformer,cat_cols)
    ])
    return num_cat_combiner


In [4]:
train = readfile("train.csv",'csv',0)
test=readfile("test.csv",'csv',0)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
describe_data(train)

Dimensions 

(891, 12)



Column names

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')



Data Types

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



Unique values in each level

PassengerId - 891
Survived - 2
Pclass - 3
Name - 891
Sex - 2
Age - 89
SibSp - 7
Parch - 7
Ticket - 681
Fare - 248
Cabin - 148
Embarked - 4



Summary 

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.00

In [6]:
describe_data(test)

Dimensions 

(418, 11)



Column names

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')



Data Types

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



Unique values in each level

PassengerId - 418
Pclass - 3
Name - 418
Sex - 2
Age - 80
SibSp - 7
Parch - 8
Ticket - 363
Fare - 170
Cabin - 77
Embarked - 3



Summary 

       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576
min     892.000000    1.000000    0.170000    0.000

In [7]:
cols_to_drop = ['PassengerId','Name']
train = drop_columns(train,cols_to_drop)
test_passengerid = test['PassengerId']
test = drop_columns(test,cols_to_drop)

In [8]:
train.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [9]:
test.dtypes

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [187]:
findNAs(train)

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64


In [188]:
findNAs(test)

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64


In [189]:
category_columns = ['Pclass','Sex','SibSp','Parch','Embarked','Survived']

train = datatype_transformer(category_columns,'category',train)
test = datatype_transformer(category_columns[0:5],'category',test)

#Cabin & Ticket columns are yet to be considered

In [190]:
train.dtypes

Survived    category
Pclass      category
Sex         category
Age          float64
SibSp       category
Parch       category
Ticket        object
Fare         float64
Cabin         object
Embarked    category
dtype: object

In [191]:
test.dtypes

Pclass      category
Sex         category
Age          float64
SibSp       category
Parch       category
Ticket        object
Fare         float64
Cabin         object
Embarked    category
dtype: object

In [192]:
cat_cols,num_cols = col_split(train)
print(cat_cols)
print(num_cols)

['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
['Age', 'Fare']


In [193]:
print(cat_cols)
print(num_cols)

remove_target_frm_collist(cat_cols,num_cols,'Survived',train)

print(cat_cols)
print(num_cols)


['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
['Age', 'Fare']
['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
['Age', 'Fare']


In [194]:
#To check class imbalance
train['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [195]:
X_train,X_val,Y_train,Y_val = train_validate_split(train, 'Survived',0.7)

In [196]:
print(X_train.shape)
print(X_val.shape)
print(Y_train.shape)
print(Y_val.shape)

(623, 9)
(268, 9)
(623,)
(268,)


In [197]:
num_cat_combiner = create_transformers('median','most_frequent',num_cols,cat_cols)

In [198]:
model = Pipeline(steps=[
    ('preprocessor',num_cat_combiner),
    ('rf_clf',RandomForestClassifier())
])

In [199]:
model.fit(X=X_train,y=Y_train)



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [200]:
Y_train_pred = model.predict(X_train)
Y_val_pred = model.predict(X_val)

In [201]:
print(accuracy_score(Y_train,Y_train_pred))
print(accuracy_score(Y_val,Y_val_pred))
print(f1_score(Y_train,Y_train_pred))
print(f1_score(Y_val,Y_val_pred))

0.9678972712680578
0.7761194029850746
0.9567099567099568
0.7


In [202]:
grid_params_rf = [{'rf_clf__criterion': ['gini', 'entropy'],
                   'rf_clf__n_estimators': [10,30,50],
                   'rf_clf__min_samples_leaf': [10,15,20],
                   'rf_clf__max_depth': [100,150],
                   'rf_clf__min_samples_split': [20,30]}]

In [203]:
jobs=-1

gs_rf = GridSearchCV(estimator=model,
            param_grid=grid_params_rf,
            scoring='accuracy',
            cv=10, 
            n_jobs=jobs)

In [204]:
gs_rf.fit(X_train,Y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                   

In [205]:
gs_rf.best_estimator_.steps[1][1]

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=150, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [206]:
Y_train_pred = gs_rf.predict(X_train)
Y_val_pred = gs_rf.predict(X_val)

In [207]:
print(accuracy_score(Y_train,Y_train_pred))
print(accuracy_score(Y_val,Y_val_pred))
print(f1_score(Y_train,Y_train_pred))
print(f1_score(Y_val,Y_val_pred))

0.8298555377207063
0.8097014925373134
0.7363184079601991
0.7272727272727272


In [208]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,34.5,0,0,330911,7.8292,,Q
1,3,female,47.0,1,0,363272,7.0,,S
2,2,male,62.0,0,0,240276,9.6875,,Q
3,3,male,27.0,0,0,315154,8.6625,,S
4,3,female,22.0,1,1,3101298,12.2875,,S


In [209]:
sel_cols = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
X_test = test[sel_cols]

Y_test_pred = model.predict(X_test)

In [210]:
output = pd.DataFrame({'PassengerId': test_passengerid, 'Survived': Y_test_pred})
output.to_csv('my_submission_RF_2.csv', index=False)
print("Predictions are saved..!")

Predictions are saved..!
