In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


In [2]:
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())
print(test_df.shape)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
(418, 11)


In [6]:
print(train_df.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [7]:
def clean_data(df):
    df.drop(columns=['Cabin', 'PassengerId', 'Ticket'], axis=1,  inplace=True)
    

    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    
    return df
    
    

In [8]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [9]:
train_df = clean_data(train_df)
test_df = clean_data(test_df)

In [10]:
import re

def create_features(df):
    df['FamilySize'] = df['SibSp'] + df['Parch'] +  1
    
    #1 for true, 0 for false
    df['IsAlone'] = 1
    
    df['IsAlone'].loc[df['FamilySize'] > 1] = 0
    
    df['Title'] = df['Name'].apply(lambda x: (re.findall(r'(\w+)\.', x) or None,)[0][0])
    
    df['FareBin'] = pd.qcut(df['Fare'], 4)
    
    df['AgeBin'] = pd.cut(df['Age'].astype(int), 5)
    
    return df
    

In [11]:
train_df = create_features(train_df)
test_df = create_features(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
train_df.head()
test_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,1,1,Mr,"(-0.001, 7.896]","(30.4, 45.6]"
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,2,0,Mrs,"(-0.001, 7.896]","(45.6, 60.8]"
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,1,1,Mr,"(7.896, 14.454]","(60.8, 76.0]"
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,1,1,Mr,"(7.896, 14.454]","(15.2, 30.4]"
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,3,0,Mrs,"(7.896, 14.454]","(15.2, 30.4]"


In [13]:
min_num = 5

title_names = (train_df['Title'].value_counts() < min_num)


train_df['Title'] = train_df['Title'].apply(lambda x: "Misc" if title_names.loc[x] == True else x)

In [14]:
train_df['Title'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Misc       14
Dr          7
Rev         6
Name: Title, dtype: int64

In [15]:
LE = preprocessing.LabelEncoder()

def create_categorical(df):
    df['Sex_Code'] = LE.fit_transform(df['Sex'])
    
    df['Title_Code'] = LE.fit_transform(df['Title'])
    
    df['Embarked_Code'] = LE.fit_transform(df['Embarked'])
    
    df['AgeBin_Code'] = LE.fit_transform(df['AgeBin'])
    
    df['FareBin_Code'] = LE.fit_transform(df['FareBin'])
    
    return df

In [16]:
train_df = create_categorical(train_df)
test_df = create_categorical(test_df)

In [17]:
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Fare             891 non-null float64
Embarked         891 non-null object
FamilySize       891 non-null int64
IsAlone          891 non-null int64
Title            891 non-null object
FareBin          891 non-null category
AgeBin           891 non-null category
Sex_Code         891 non-null int32
Title_Code       891 non-null int32
Embarked_Code    891 non-null int32
AgeBin_Code      891 non-null int32
FareBin_Code     891 non-null int32
dtypes: category(2), float64(2), int32(5), int64(6), object(4)
memory usage: 102.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 18 columns):
Pclas

In [18]:
from sklearn import model_selection

X_col = ["Sex_Code", "Pclass", "Title_Code", "Embarked_Code", "Age", "Fare", "SibSp", "Parch"]
Target = ["Survived"]
train_x, val_x, train_y, val_y = model_selection.train_test_split(train_df[X_col], train_df[Target], random_state=0)

In [19]:
print(train_x.shape)
print(val_x.shape)
print(train_y.shape)
print(val_y.shape)

(668, 8)
(223, 8)
(668, 1)
(223, 1)


In [20]:
for x in train_x:
    if train_df[x].dtype != 'float64' :
        print('Survival Correlation by:', x)
        print(train_df[[x, Target[0]]].groupby(x, as_index=False).mean())
        print('-'*10, '\n')
        

Survival Correlation by: Sex_Code
   Sex_Code  Survived
0         0  0.742038
1         1  0.188908
---------- 

Survival Correlation by: Pclass
   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363
---------- 

Survival Correlation by: Title_Code
   Title_Code  Survived
0           0  0.428571
1           1  0.575000
2           2  0.642857
3           3  0.697802
4           4  0.156673
5           5  0.792000
6           6  0.000000
---------- 

Survival Correlation by: Embarked_Code
   Embarked_Code  Survived
0              0  0.553571
1              1  0.389610
2              2  0.339009
---------- 

Survival Correlation by: SibSp
   SibSp  Survived
0      0  0.345395
1      1  0.535885
2      2  0.464286
3      3  0.250000
4      4  0.166667
5      5  0.000000
6      8  0.000000
---------- 

Survival Correlation by: Parch
   Parch  Survived
0      0  0.343658
1      1  0.550847
2      2  0.500000
3      3  0.600000
4      4  0.000000
5      5  0.200000
6

In [21]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process


clf = tree.DecisionTreeClassifier()

clf.fit(train_x, train_y)

clf.score(val_x, val_y)


0.757847533632287

In [22]:
clf = svm.SVC()
clf.fit(train_x, train_y)
clf.score(val_x, val_y)

  y = column_or_1d(y, warn=True)


0.7533632286995515

In [33]:
clf = linear_model.LogisticRegression()

clf.fit(train_x, train_y)

clf.score(val_x, val_y)

  y = column_or_1d(y, warn=True)


0.7847533632286996

In [24]:
clf = naive_bayes.GaussianNB()

clf.fit(train_x, train_y)

clf.score(val_x, val_y)

  y = column_or_1d(y, warn=True)


0.7937219730941704

In [25]:
test_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin,Sex_Code,Title_Code,Embarked_Code,AgeBin_Code,FareBin_Code
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,1,1,Mr,"(-0.001, 7.896]","(30.4, 45.6]",1,5,1,2,0
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,2,0,Mrs,"(-0.001, 7.896]","(45.6, 60.8]",0,6,2,3,0
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,1,1,Mr,"(7.896, 14.454]","(60.8, 76.0]",1,5,1,4,1
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,1,1,Mr,"(7.896, 14.454]","(15.2, 30.4]",1,5,2,1,1
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,3,0,Mrs,"(7.896, 14.454]","(15.2, 30.4]",0,6,2,1,1


In [26]:
test_x = test_df[X_col]
test_x.head()

Unnamed: 0,Sex_Code,Pclass,Title_Code,Embarked_Code,Age,Fare,SibSp,Parch
0,1,3,5,1,34.5,7.8292,0,0
1,0,3,6,2,47.0,7.0,1,0
2,1,2,5,1,62.0,9.6875,0,0
3,1,3,5,2,27.0,8.6625,0,0
4,0,3,6,2,22.0,12.2875,1,1


In [34]:
test_pred = clf.predict(test_x)

In [35]:
submission = pd.read_csv("./data/gender_submission.csv")
submission.head(10)
old = submission["Survived"]

In [36]:
submission["Survived"] = test_pred

In [37]:
submission.head(10)
print(submission.columns)

Index(['PassengerId', 'Survived'], dtype='object')


In [38]:
submission.to_csv("./data/submissionLOG.csv", index=False)