# Library Import

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Datasets Input

In [24]:
train = pd.read_csv('../titanic/train.csv')
test = pd.read_csv('../titanic/test.csv')
x_train = train.drop('Survived', axis=1)
y_train = train['Survived']
x_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Cleaning the Dataset

In [25]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
encoder = LabelEncoder()

def clean_data(df):
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    for col in ['Sex', 'Embarked']:
        df[col] = encoder.fit_transform(df[col])
    df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    return df

In [26]:
x_train = clean_data(x_train)
x_test = clean_data(test)
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3.0,1.0,22.0,1.0,0.0,7.25,2.0
1,1.0,0.0,38.0,1.0,0.0,71.2833,0.0
2,3.0,0.0,26.0,0.0,0.0,7.925,2.0
3,1.0,0.0,35.0,1.0,0.0,53.1,2.0
4,3.0,1.0,35.0,0.0,0.0,8.05,2.0


# Training The Model

In [27]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=0)
classifier.fit(x_train, y_train)

# Checking Model Accuracy

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)
Y_pred = classifier.predict(X_test)
results = np.column_stack((Y_pred, Y_test.values))
results = pd.DataFrame(results, columns=['Predicted', 'Actual'])
print(results.head())
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y_test, Y_pred)
accuracy

   Predicted  Actual
0          0       0
1          0       0
2          0       0
3          1       1
4          1       1


0.9832402234636871

# Predicting The Results

In [29]:
y_pred = classifier.predict(x_test)
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_pred})
submission.to_csv('../titanic/submission.csv', index=False)