In [635]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [636]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Group train and test data in a single dataframe
data = pd.concat([train_df, test_df], ignore_index=True)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [637]:
data.drop(['Ticket', 'Cabin'], axis=1, inplace=True)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [638]:
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Mme', 'Lady'], 'Mrs')
data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
data['Title'] = data['Title'].replace(['Jonkheer', 'Sir', 'Capt', 'Col', 'Don', 'Major', 'Rev'], 'Mr')
data['Title'] = data['Title'].replace(['Dr', 'Dona'], 'Rare')
data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.158192
3,Mrs,0.796875
4,Rare,0.428571


In [639]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
data['Title'] = data['Title'].map(title_mapping)
data['Title'] = data['Title'].fillna(0)
data.drop(['Name', 'PassengerId'], axis=1, inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0.0,3,male,22.0,1,0,7.25,S,1
1,1.0,1,female,38.0,1,0,71.2833,C,3
2,1.0,3,female,26.0,0,0,7.925,S,2
3,1.0,1,female,35.0,1,0,53.1,S,3
4,0.0,3,male,35.0,0,0,8.05,S,1


In [640]:
data['Sex'] = data['Sex'].map({'female':1, 'male':0}).astype(int)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0.0,3,0,22.0,1,0,7.25,S,1
1,1.0,1,1,38.0,1,0,71.2833,C,3
2,1.0,3,1,26.0,0,0,7.925,S,2
3,1.0,1,1,35.0,1,0,53.1,S,3
4,0.0,3,0,35.0,0,0,8.05,S,1


In [641]:
guess_ages = np.zeros((2,3))
for i in range(0, 2):
	for j in range(0, 3):
		guess_df = data[(data['Sex'] == i) & (data['Pclass'] == j+1)]['Age'].dropna()
		age_guess = guess_df.median()
		guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5

for i in range(0, 2):
	for j in range(0, 3):
		data.loc[(data['Age'].isnull()) & (data.Sex == i) & (data.Pclass == j+1),'Age'] = guess_ages[i,j]

data['Age'] = data['Age'].astype(int)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0.0,3,0,22,1,0,7.25,S,1
1,1.0,1,1,38,1,0,71.2833,C,3
2,1.0,3,1,26,0,0,7.925,S,2
3,1.0,1,1,35,1,0,53.1,S,3
4,0.0,3,0,35,0,0,8.05,S,1


In [642]:
data['AgeBand'] = pd.cut(data['Age'], 5)
data.loc[ data['Age'] <= 16, 'Age'] = 0
data.loc[(data['Age'] > 16) & (data['Age'] <= 32), 'Age'] = 1
data.loc[(data['Age'] > 32) & (data['Age'] <= 48), 'Age'] = 2
data.loc[(data['Age'] > 48) & (data['Age'] <= 64), 'Age'] = 3
data.loc[ data['Age'] > 64, 'Age']
data.drop(['AgeBand'], axis=1, inplace=True)
data['Age*Class'] = data['Age'] * data['Pclass']
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Age*Class
0,0.0,3,0,1,1,0,7.25,S,1,3
1,1.0,1,1,2,1,0,71.2833,C,3,2
2,1.0,3,1,1,0,0,7.925,S,2,3
3,1.0,1,1,2,1,0,53.1,S,3,2
4,0.0,3,0,2,0,0,8.05,S,1,6


In [643]:
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['isAlone'] = 0
data.loc[data['FamilySize'] == 1, 'isAlone'] = 1
data.drop(['Parch', 'SibSp', 'FamilySize'], axis=1, inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Age*Class,isAlone
0,0.0,3,0,1,7.25,S,1,3,0
1,1.0,1,1,2,71.2833,C,3,2,0
2,1.0,3,1,1,7.925,S,2,3,1
3,1.0,1,1,2,53.1,S,3,2,0
4,0.0,3,0,2,8.05,S,1,6,1


In [644]:
data['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
data['FareBand'] = pd.qcut(train_df['Fare'], 4)

data.loc[data['Fare'] <= 7.91, 'Fare'] = 0
data.loc[(data['Fare'] > 7.91) & (data['Fare'] <= 14.454), 'Fare'] = 1
data.loc[(data['Fare'] > 14.454) & (data['Fare'] <= 31), 'Fare'] = 2
data.loc[ data['Fare'] > 31, 'Fare'] = 3
data['Fare'] = data['Fare'].astype(int)
data.drop(['FareBand'], axis=1, inplace=True)

data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Age*Class,isAlone
0,0.0,3,0,1,0,S,1,3,0
1,1.0,1,1,2,3,C,3,2,0
2,1.0,3,1,1,1,S,2,3,1
3,1.0,1,1,2,3,S,3,2,0
4,0.0,3,0,2,1,S,1,6,1


In [645]:
freq_port = data['Embarked'].dropna().mode()[0]
data['Embarked'].fillna(freq_port, inplace=True)
data['Embarked'] = data['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Age*Class,isAlone
0,0.0,3,0,1,0,0,1,3,0
1,1.0,1,1,2,3,1,3,2,0
2,1.0,3,1,1,1,0,2,3,1
3,1.0,1,1,2,3,0,3,2,0
4,0.0,3,0,2,1,0,1,6,1


In [646]:
# Split data into train and test
train = data[:train_df.shape[0]]
test = data[train_df.shape[0]:]
test.drop(['Survived'], axis=1, inplace=True)

X_train = train.drop(['Survived'], axis=1)
Y_train = train['Survived']
X_test = test.copy()
X_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(['Survived'], axis=1, inplace=True)


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,Age*Class,isAlone
891,3,0,2,0,2,1,6,1
892,3,1,2,0,0,3,6,0
893,2,0,3,1,2,1,6,1
894,3,0,1,1,0,1,3,1
895,3,1,1,1,0,3,3,0


In [647]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

coeff_df = pd.DataFrame(train.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,Sex,1.973782
5,Title,0.672752
4,Embarked,0.280248
7,isAlone,0.275757
2,Age,0.252792
3,Fare,-0.128887
6,Age*Class,-0.271173
0,Pclass,-0.887629


In [648]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
model.score(X_train, Y_train)
acc_model = round(model.score(X_train, Y_train) * 100, 2)
acc_model

86.53

In [649]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
submission['Survived'] = submission['Survived'].astype(int)
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
