In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import csv as csv

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [2]:
# Seed NumPy's global random state with a fixed value so that repeated full
# runs of the notebook draw the same random numbers.
np.random.seed(0)

In [3]:
# Load the training and test sets from CSV files in the working directory.
train_path, test_path = "train.csv", "test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [4]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Column dtypes and non-null counts for the training data (shows which
# columns contain missing values).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
# Column dtypes and non-null counts for the test data (shows which columns
# contain missing values).
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [7]:
# Drop columns that are assumed to be of no use for prediction.
# PassengerId is kept in test_df because the submission file needs it.
unused_cols = ['Name', 'Ticket', 'Cabin']

train_df = train_df.drop(['PassengerId'] + unused_cols, axis='columns')
test_df = test_df.drop(unused_cols, axis='columns')

In [8]:
# Encode Sex as an integer: male -> 1, female -> 0.
sex_codes = {'male': 1, 'female': 0}

train_df['Sex'] = train_df['Sex'].map(sex_codes).astype(int)
test_df['Sex'] = test_df['Sex'].map(sex_codes).astype(int)

In [9]:
# Fill the missing Embarked values in the training set, then encode the port
# of embarkation as an integer in both sets.
print(train_df.describe(include=['O']))

# "S" is the most frequent value reported by the summary printed above.
train_df['Embarked'] = train_df['Embarked'].fillna("S")

# Separator, then the same summary again to confirm the fill.
print('-' * 20)
print(train_df.describe(include=['O']))

# Integer codes: C -> 0, Q -> 1, S -> 2.
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
train_df['Embarked'] = train_df['Embarked'].map(embarked_codes).astype(int)
test_df['Embarked'] = test_df['Embarked'].map(embarked_codes).astype(int)

       Embarked
count       889
unique        3
top           S
freq        644
--------------------
       Embarked
count       891
unique        3
top           S
freq        646


In [10]:
# Fill missing ages in both sets with the median age of the training set.
# Series.median() skips NaN by default, so no explicit dropna() is needed.
median_age = train_df['Age'].median()

train_df['Age'] = train_df['Age'].fillna(median_age)
test_df['Age'] = test_df['Age'].fillna(median_age)

In [11]:
# Fare is missing only in test_df. Fill it with the median fare of the
# *training* set, as is done for Age: values used to impute the test data
# should come from the training data, not from the test data itself, so that
# no test-set statistics leak into preprocessing.
median_fare = train_df['Fare'].median()
test_df['Fare'] = test_df['Fare'].fillna(median_fare)

# Models  

In [12]:
# Columns left in the training frame after the preprocessing cells above.
train_df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [13]:
# Columns left in the test frame after the preprocessing cells above.
test_df.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [14]:
# Feature columns, listed once so the training and test matrices always agree.
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

X_train = train_df[feature_cols]
Y_train = train_df['Survived']
X_test = test_df[feature_cols]

# PassengerId is not a feature; it is kept aside to label the submission rows.
idx = test_df['PassengerId']

# Sanity check: shapes of the training features, labels, test features and ids.
X_train.shape, Y_train.shape, X_test.shape, idx.shape

((891, 7), (891,), (418, 7), (418,))

# Logistic Regression

In [15]:
# Fit a logistic regression with default settings; fit() returns the estimator.
logreg = LogisticRegression().fit(X_train, Y_train)

# Mean accuracy measured on the same rows the model was trained on.
score_logreg = logreg.score(X_train, Y_train)
print("Training Score of Logistic Regression:", score_logreg)

# Predicted labels for the test set.
predict_logreg = logreg.predict(X_test)

Training Score of Logistic Regression: 0.803591470258


# Random Forest Classifier

In [16]:
# Random forest with 100 trees. random_state is pinned so that this cell builds
# the same forest every time it is run, instead of depending on how much of
# NumPy's global random stream earlier cells have already consumed.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)

rfc.fit(X_train, Y_train)

# Mean accuracy on the training data itself -- not a held-out estimate.
score_rfc = rfc.score(X_train, Y_train)

print("Training Score of  RandomForestClassifier:",score_rfc)

# Predicted labels for the test set; these are what the submission cell writes.
out_rfc = rfc.predict(X_test)

Training Score of  RandomForestClassifier: 0.979797979798


# KNeighborsClassifier

In [17]:
# Fit a 5-nearest-neighbours classifier; fit() returns the estimator.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)

# Mean accuracy measured on the same rows the model was trained on.
score_knn = knn.score(X_train, Y_train)
print("Training Score of  KNeighborsClassifier:", score_knn)

# Predicted labels for the test set.
out_knn = knn.predict(X_test)

Training Score of  KNeighborsClassifier: 0.809203142536


# SVM

In [18]:
# Fit a support vector classifier with default settings; fit() returns the
# estimator.
svc = SVC().fit(X_train, Y_train)

# Mean accuracy measured on the same rows the model was trained on.
score_svc = svc.score(X_train, Y_train)
print("Training Score of  SVM:", score_svc)

# Predicted labels for the test set.
out_svc = svc.predict(X_test)

Training Score of  SVM: 0.897867564534


# Submission

In [19]:
# Pair each test PassengerId with the random forest's predicted label.
submission = pd.DataFrame({
    "PassengerId": idx,
    "Survived": out_rfc,
})

# index=False keeps the DataFrame index out of the file, leaving only the two
# columns above.
submission.to_csv('newsub.csv', index=False)

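# We submit the Random Forest predictions because it has the highest training accuracy of the four models. Note that this score is measured on the training data, so it favours models that overfit; a cross-validated score would be a fairer basis for choosing between them.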