# Loading data

In [1]:
import pandas as pd

In [2]:
# Location of the training CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists.
train_csv_path = '/home/denniss/Downloads/train.csv'
df = pd.read_csv(train_csv_path)
df.head(3)  # preview the first three rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


# Cleaning

In [3]:
# Remove three columns that no later cell shown in this notebook refers to.
unused_columns = ['Cabin', 'PassengerId', 'Ticket']
df = df.drop(columns=unused_columns)
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S


In [4]:
# Fill missing Age values with the mean of the observed ages;
# other columns are left as they are.
mean_age = df['Age'].mean()
df = df.fillna({'Age': mean_age})

# Feature Engineering

### One-Hot-Encoding

In [5]:
# One indicator column per Embarked value, named Embarked_<value>.
onehot = pd.get_dummies(data=df['Embarked'], prefix='Embarked')
# Append the indicator columns to the right of the existing columns.
df = pd.concat([df, onehot], axis='columns')
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,0,1


In [6]:
# One indicator column per Pclass value, named Pclass_<value>.
onehot2 = pd.get_dummies(data=df['Pclass'], prefix='Pclass')
# Append the indicator columns to the right of the existing columns.
df = pd.concat([df, onehot2], axis='columns')
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,0,1,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0,0,1,0,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,0,1,0,0,1


### Target Encoding

In [7]:
# Mean of Survived for each value of Sex, i.e. the survival rate per sex.
means = df['Survived'].groupby(df['Sex']).mean()
means.to_dict()

{'female': 0.7420382165605095, 'male': 0.18890814558058924}

In [8]:
# Target-encode Sex: look up each passenger's sex in the per-sex survival
# rates computed above. Series.map performs this lookup directly and returns
# a float column, whereas replace() on a string column depends on implicit
# dtype downcasting that recent pandas versions deprecate.
# Any Sex value missing from `means` would become NaN.
df['sex_target_enc'] = df['Sex'].map(means)
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,sex_target_enc
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,0,1,0,0,1,0.188908
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0,0,1,0,0,0.742038
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,0,1,0,0,1,0.742038


### Binning

Fare

In [9]:
# Split the Fare range into 10 equally wide intervals and label each row
# with the interval its fare falls into.
bins = pd.cut(x=df['Fare'], bins=10)
# One indicator column per interval, named Fare_<interval>.
binning = pd.get_dummies(data=bins, prefix='Fare')

In [10]:
# Append the Fare interval indicator columns to the right of the existing columns.
df = pd.concat([df, binning], axis='columns')

Age

In [11]:
# Split the Age range into 10 equally wide intervals and label each row
# with the interval its age falls into.
bins2 = pd.cut(x=df['Age'], bins=10)
# One indicator column per interval, named Age_<interval>.
binning2 = pd.get_dummies(data=bins2, prefix='Age')

In [12]:
# Append the Age interval indicator columns to the right of the existing columns.
df = pd.concat([df, binning2], axis='columns')
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,...,"Age_(0.34, 8.378]","Age_(8.378, 16.336]","Age_(16.336, 24.294]","Age_(24.294, 32.252]","Age_(32.252, 40.21]","Age_(40.21, 48.168]","Age_(48.168, 56.126]","Age_(56.126, 64.084]","Age_(64.084, 72.042]","Age_(72.042, 80.0]"
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,...,0,0,1,0,0,0,0,0,0,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,...,0,0,0,0,1,0,0,0,0,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,...,0,0,0,1,0,0,0,0,0,0


### Scaling

In [13]:
# Min-max scaling: map Fare linearly so its minimum becomes 0 and its maximum 1.
fare_min, fare_max = df['Fare'].min(), df['Fare'].max()
df['Fare_scaled'] = (df['Fare'] - fare_min) / (fare_max - fare_min)
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,...,"Age_(8.378, 16.336]","Age_(16.336, 24.294]","Age_(24.294, 32.252]","Age_(32.252, 40.21]","Age_(40.21, 48.168]","Age_(48.168, 56.126]","Age_(56.126, 64.084]","Age_(64.084, 72.042]","Age_(72.042, 80.0]",Fare_scaled
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,...,0,1,0,0,0,0,0,0,0,0.014151
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,...,0,0,0,1,0,0,0,0,0,0.139136
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,...,0,0,1,0,0,0,0,0,0,0.015469


In [14]:
# Min-max scaling: map Age linearly so its minimum becomes 0 and its maximum 1.
age_min, age_max = df['Age'].min(), df['Age'].max()
df['Age_scaled'] = (df['Age'] - age_min) / (age_max - age_min)
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,...,"Age_(16.336, 24.294]","Age_(24.294, 32.252]","Age_(32.252, 40.21]","Age_(40.21, 48.168]","Age_(48.168, 56.126]","Age_(56.126, 64.084]","Age_(64.084, 72.042]","Age_(72.042, 80.0]",Fare_scaled,Age_scaled
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,...,1,0,0,0,0,0,0,0,0.014151,0.271174
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,...,0,0,1,0,0,0,0,0,0.139136,0.472229
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,...,0,1,0,0,0,0,0,0,0.015469,0.321438


### Interaction Terms

In [15]:
# Interaction feature: element-wise product of SibSp and Parch.
df['SibSp*Parch'] = df['SibSp'].mul(df['Parch'])

### Checking if the data is ready for the model

In [16]:
# Count the missing values in every column.
df.isnull().sum()

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
SibSp                      0
Parch                      0
Fare                       0
Embarked                   2
Embarked_C                 0
Embarked_Q                 0
Embarked_S                 0
Pclass_1                   0
Pclass_2                   0
Pclass_3                   0
sex_target_enc             0
Fare_(-0.512, 51.233]      0
Fare_(51.233, 102.466]     0
Fare_(102.466, 153.699]    0
Fare_(153.699, 204.932]    0
Fare_(204.932, 256.165]    0
Fare_(256.165, 307.398]    0
Fare_(307.398, 358.63]     0
Fare_(358.63, 409.863]     0
Fare_(409.863, 461.096]    0
Fare_(461.096, 512.329]    0
Age_(0.34, 8.378]          0
Age_(8.378, 16.336]        0
Age_(16.336, 24.294]       0
Age_(24.294, 32.252]       0
Age_(32.252, 40.21]        0
Age_(40.21, 48.168]        0
Age_(48.168, 56.126]       0
Age_(56.126, 64.084]       0
Age_(64.084, 7

### Model

In [17]:
# Model inputs are all engineered columns, i.e. everything except the target
# and the raw columns kept from the CSV. Excluding those by name (instead of
# slicing at position 9) keeps the selection correct if the column order or
# count changes, and Index.drop raises a KeyError if an expected column is
# missing rather than silently selecting the wrong features.
raw_columns = ['Survived', 'Pclass', 'Name', 'Sex', 'Age',
               'SibSp', 'Parch', 'Fare', 'Embarked']
new_features = df.columns.drop(raw_columns).values

In [18]:
# Feature matrix: the engineered columns selected above.
X = df.loc[:, new_features]
# Target vector: the Survived column.
y = df.loc[:, "Survived"]

In [19]:
# Sanity check: X and y must have the same number of rows.
(X.shape, y.shape)

((891, 30), (891,))

In [20]:
from sklearn.linear_model import LogisticRegression

# C is the inverse regularization strength, so C=1e6 leaves the model almost
# unregularized. max_iter must be an integer: recent scikit-learn versions
# validate parameter types and reject a float such as 1e6 when fit() is called.
model = LogisticRegression(C=1e6, max_iter=1000000)
model.fit(X, y)

LogisticRegression(C=1000000.0, max_iter=1000000.0)

In [21]:
# Accuracy (share of correct predictions). It is computed on the same X and y
# the model was fitted on, so this is the training accuracy, not an estimate
# of performance on unseen data.
model.score(X=X, y=y)

0.8226711560044894