In [1]:
#Assignment 1 - Titanic Prediction Problem 
#Name : Shubham Deshmukh
#Email : x2020gfv@stfx.ca
#Student No. : 202006307

In [63]:

#------------------------Run this import section first------------------------
import pandas as pd
import numpy as np
import re

#Algorithm
from sklearn.ensemble import RandomForestClassifier
#-------------------------------------------------------------------------------

#-------------------------Specify the path of Data files------------------------

url_train_datafile = 'train.csv'
url_test_datafile = 'test.csv'
#-------------------------------------------------------------------------------



#Importing dataset CSV files

def importData(url_train_datafile, url_test_datafile):
    """Read the train and test CSV files and stack them into a single DataFrame.

    Parameters
    ----------
    url_train_datafile : str or path-like
        Location of the training CSV (passed straight to ``pd.read_csv``).
    url_test_datafile : str or path-like
        Location of the test CSV (passed straight to ``pd.read_csv``).

    Returns
    -------
    pandas.DataFrame
        Training rows followed by test rows, with two bookkeeping columns:
        ``Type`` (1 = train row, 0 = test row) and ``Survived`` (set to the
        empty string for every test row). Row labels are NOT reset, so the
        test rows restart at 0 and the index contains duplicates.
    """
    train_rawdata = pd.read_csv(url_train_datafile)
    test_rawdata = pd.read_csv(url_test_datafile)

    # Tag every row with its origin so the two sets can be separated again later.
    train_rawdata['Type'] = 1
    test_rawdata['Type'] = 0
    # Placeholder for the not-yet-predicted test labels.
    test_rawdata['Survived'] = ''

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. sort=False keeps the training file's column order (modern
    # pandas rejects the former sort=None).
    merged_train_test_data = pd.concat(
        [train_rawdata, test_rawdata],
        ignore_index=False,
        verify_integrity=False,
        sort=False,
    )

    return merged_train_test_data



#Data Processing
def processData (merged_train_test_data):
    """Clean the merged Titanic frame and derive the model features.

    The input frame is copied first, so the caller's DataFrame is left
    untouched and the function can be re-run on the same raw data.

    Steps performed on the copy:
      * Embarked: S/C/Q -> 0/1/2; missing values get the most frequent code
        (the smallest code wins when several are tied).
      * Fare: missing values get the column median, then truncated to int.
      * Cabin: reduced to its first run of letters and encoded A..G -> 1..7,
        missing ("U") -> 8, anything else -> 0.
      * Age: missing values get the median age of the passenger's Sex, then
        the age is truncated to int and bucketed into codes 0..6.
      * Sex: male -> 1, female -> 0.
      * Title: extracted from Name, normalised and encoded 1..6; titles that
        are not in the code table become 0.
      * Name and Ticket are dropped.
      * FamilySize = SibSp + Parch + 1, FarePerPassanger = Fare // FamilySize.

    Parameters
    ----------
    merged_train_test_data : pandas.DataFrame
        Frame containing at least the columns Embarked, Fare, Cabin, Age,
        Sex, Name, Ticket, SibSp and Parch.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns and the added Title,
        FamilySize and FarePerPassanger columns.
    """
    # Never mutate the caller's frame: a second call on the same raw data
    # must behave exactly like the first one.
    data = merged_train_test_data.copy()

    # ------------------------------ Embarked ------------------------------
    embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
    embarked = data['Embarked'].map(embarked_codes)
    # mode() returns a Series (several entries when values are tied), so pick
    # the first entry explicitly instead of calling int() on the Series.
    embarked_modes = embarked.mode()
    most_common_port = embarked_modes.iloc[0] if len(embarked_modes) else 0
    data['Embarked'] = embarked.fillna(most_common_port).astype(int)

    # -------------------------------- Fare --------------------------------
    # Missing fares take the median fare; the value is then truncated to int.
    data['Fare'] = data['Fare'].fillna(data['Fare'].median()).astype(int)

    # -------------------------------- Cabin -------------------------------
    # Keep only the leading letter group of the cabin; "U0" marks an unknown cabin.
    cabin_codes = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
    cabin_letters = data['Cabin'].fillna("U0").str.extract(r'([a-zA-Z]+)', expand=False)
    # Letters outside the table (and cabins without any letter) are coded 0.
    data['Cabin'] = cabin_letters.map(cabin_codes).fillna(0).astype(int)

    # --------------------------------- Age --------------------------------
    # Sex is still 'male'/'female' here; it is encoded further below.
    median_age_by_sex = data.groupby('Sex')['Age'].transform("median")
    # Assign the result back instead of fillna(inplace=True) on a column
    # selection, which is not guaranteed to write through to the frame.
    age_years = data['Age'].fillna(median_age_by_sex).astype(int)
    # Right-closed buckets: <=11 -> 0, 12-18 -> 1, 19-22 -> 2, 23-27 -> 3,
    # 28-33 -> 4, 34-40 -> 5, >40 -> 6. Ages above 66 share code 6 with the
    # 41-66 group, exactly as in the original rule set.
    age_bin_edges = [-np.inf, 11, 18, 22, 27, 33, 40, np.inf]
    data['Age'] = pd.cut(age_years, bins=age_bin_edges, labels=False).astype(int)

    # --------------------------------- Sex --------------------------------
    # Male = 1, Female = 0.
    data['Sex'] = data['Sex'].map({'male': 1, 'female': 0})

    # -------------------------------- Title -------------------------------
    title_codes = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "RareFemale": 5, "RareMale": 6}
    # The title is the word that directly precedes the first '.' in the name.
    title = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    title = title.replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr','Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    title = title.replace(['Mlle','Ms'], 'Miss')
    title = title.replace('Mme', 'Mrs')
    # Split rare titles by sex: RareFemale for Sex == 0, RareMale for Sex == 1.
    is_rare_title = title == 'Rare'
    title = title.mask(is_rare_title & (data['Sex'] == 0), "RareFemale")
    title = title.mask(is_rare_title & (data['Sex'] == 1), "RareMale")
    # Titles missing from the code table would become NaN; code them 0 so the
    # column stays a clean integer feature.
    data['Title'] = title.map(title_codes).fillna(0).astype(int)

    # Name has served its purpose, and Ticket is dropped because its values
    # are treated as having no usable pattern.
    data = data.drop(columns=['Name', 'Ticket'])

    # ---------------------------- New features ----------------------------
    # The passenger plus siblings/spouses plus parents/children.
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    # Fare divided by family size, truncated to int.
    data['FarePerPassanger'] = (data['Fare'] / data['FamilySize']).astype(int)

    return data



def divideData (merged_train_test_data):
    """Split the processed frame back into model inputs.

    Rows with Type == 1 form the training set and rows with Type == 0 form
    the test set. The columns PassengerId, Survived, Type and Fare are
    removed from both feature frames.

    Returns
    -------
    tuple
        (X_train, Y_train, X_test) where Y_train is the Survived column of
        the training rows cast to int.
    """
    # Columns that must not be fed to the model as features.
    non_feature_columns = ['PassengerId', 'Survived', 'Type', 'Fare']

    row_type = merged_train_test_data['Type']
    labelled_rows = merged_train_test_data[row_type == 1]
    unlabelled_rows = merged_train_test_data[row_type == 0]

    # The target is taken before the Survived column is dropped from the features.
    Y_train = labelled_rows['Survived'].astype(int)
    X_train = labelled_rows.drop(columns=non_feature_columns)
    X_test = unlabelled_rows.drop(columns=non_feature_columns)

    return X_train, Y_train, X_test


def applyModelRF(X_train,Y_train,X_test):
    """Fit a RandomForestClassifier on the training data and predict the test data.

    The forest uses fixed hyper-parameters (1000 entropy trees, no bootstrap,
    one candidate feature per split, random_state=42).

    Returns
    -------
    The result of calling ``predict`` on ``X_test`` with the fitted forest.
    """
    forest_settings = {
        'bootstrap': False,
        'criterion': 'entropy',
        'max_features': 1,
        'min_samples_leaf': 3,
        'min_samples_split': 10,
        'random_state': 42,
        'n_estimators': 1000,
    }

    forest = RandomForestClassifier(**forest_settings)
    forest.fit(X_train, Y_train)

    # For diagnostics, forest.score(X_train, Y_train) and
    # forest.feature_importances_ can be inspected at this point.
    Y_prediction = forest.predict(X_test)
    return Y_prediction


def exportResultData(merged_train_test_data , Y_prediction,fileName):
    """Write the predictions for the test rows to a CSV file.

    Parameters
    ----------
    merged_train_test_data : pandas.DataFrame
        Frame holding the ``Type`` and ``PassengerId`` columns; rows with
        ``Type == 0`` are the test rows being exported.
    Y_prediction : sequence
        One predicted label per test row, in the same order as the test rows
        appear in ``merged_train_test_data``.
    fileName : str or path-like
        Destination of the CSV file (columns ``PassengerId`` and ``Survived``,
        no index column).

    Returns
    -------
    str
        The message ``'File Successfully Exported'``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.
    """
    is_test_row = merged_train_test_data['Type'] == 0
    # Use the raw values so that ids and predictions are paired by position.
    # Aligning on index labels only works when the test rows happen to be
    # labelled 0..n-1 and otherwise produces NaN-padded, misaligned rows.
    passenger_ids = merged_train_test_data.loc[is_test_row, 'PassengerId'].to_numpy()
    predictions = list(Y_prediction)

    if len(passenger_ids) != len(predictions):
        raise ValueError(
            'Number of predictions (%d) does not match number of test rows (%d)'
            % (len(predictions), len(passenger_ids))
        )

    finalresult = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})

    finalresult.to_csv(fileName,index=False)
    return 'File Successfully Exported'


# --------- Main driver: call every pipeline step in order, feeding each result to the next step. ---------
# Step 1: read both CSV files into one frame (train rows tagged Type = 1, test rows Type = 0).
importedData =  importData(url_train_datafile, url_test_datafile)

# Step 2: fill missing values, encode the categorical columns and add the derived features.
filteredData =  processData(importedData)

# Step 3: split the processed frame into training features, training labels and test features.
X_train,Y_train,X_test = divideData(filteredData)

# Step 4: fit the random forest on the training split and predict the test split.
predictedData =  applyModelRF(X_train,Y_train,X_test)

# Step 5: write PassengerId + prediction to CSV. importedData is passed here because the
# export only reads its Type and PassengerId columns. As the last expression of the cell,
# the returned status message is what the notebook displays.
exportResultData(importedData, predictedData, 'submission-2.csv')


# ------------------------------------------------------------------------------------------------------------------------------


'File Successfully Exported'