In [1]:
import pandas as pd
import numpy as np
import csv as csv

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC , LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.utils import shuffle

import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

In [2]:
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')

train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_dataset.shape

(891, 12)

In [4]:
print('id is unique.') if train_dataset.PassengerId.nunique() == train_dataset.shape[0] else print('oops')
print('train and test sets are distinct') if len(np.intersect1d(train_dataset.PassengerId.values , test_dataset.PassengerId.values)) ==0 else print('oops')

datasetHasNan = False

if train_dataset.count().min() == train_dataset.shape[0] and test_dataset.count().min() == test_dataset.shape[0]:
    print('everyhting good, we dont need to worry')
else:
    datasetHasNan = True
    print('oops we have nan .')

id is unique.
train and test sets are distinct
oops we have nan .


In [5]:
dtype_df = train_dataset.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
dtype_df.groupby("Column Type").aggregate('count').reset_index()

Unnamed: 0,Column Type,Count
0,int64,5
1,float64,2
2,object,5


In [6]:
dtype_df

Unnamed: 0,Count,Column Type
0,PassengerId,int64
1,Survived,int64
2,Pclass,int64
3,Name,object
4,Sex,object
5,Age,float64
6,SibSp,int64
7,Parch,int64
8,Ticket,object
9,Fare,float64


In [7]:
if datasetHasNan == True:
    nas = pd.concat([train_dataset.isnull().sum() , test_dataset.isnull().sum()] , axis = 1, keys=['Train Datasets', 'Test Datasets'])
    print('nan in the datasets')
    print(nas[nas.sum(axis = 1) > 0])

nan in the datasets
          Train Datasets  Test Datasets
Age                  177           86.0
Fare                   0            1.0
Cabin                687          327.0
Embarked               2            0.0


In [8]:
print(train_dataset[['Pclass' , 'Survived']].groupby(['Pclass'], as_index = False).mean(). sort_values(by =  'Survived', ascending = False))

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [9]:
print(train_dataset[['Sex' , 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by = 'Survived', ascending = False))

      Sex  Survived
0  female  0.742038
1    male  0.188908


In [10]:
#on the basis of number of siblings or spouse in the titanic
print(train_dataset[['SibSp', 'Survived']].groupby(['SibSp'], as_index = False).mean().sort_values(by = 'Survived', ascending = False))

   SibSp  Survived
1      1  0.535885
2      2  0.464286
0      0  0.345395
3      3  0.250000
4      4  0.166667
5      5  0.000000
6      8  0.000000


In [11]:
print(train_dataset[['Parch' , 'Survived']].groupby(['Parch'], as_index=False).mean().sort_values(by = 'Survived', ascending = False))

   Parch  Survived
3      3  0.600000
1      1  0.550847
2      2  0.500000
0      0  0.343658
5      5  0.200000
4      4  0.000000
6      6  0.000000


## Data Cleaning

In [12]:
train_random_ages = np.random.randint(train_dataset["Age"].mean() - train_dataset["Age"].std(), train_dataset["Age"].mean() + train_dataset["Age"].std(), size = train_dataset["Age"].isnull().sum())

test_random_ages = np.random.randint(test_dataset["Age"].mean() -test_dataset["Age"].std() ,test_dataset["Age"].mean() + test_dataset["Age"].std() , size = test_dataset['Age'].isnull().sum())

In [13]:
train_dataset["Age"][np.isnan(train_dataset["Age"])] = train_random_ages
test_dataset["Age"][np.isnan(test_dataset["Age"])] = test_random_ages

train_dataset['Age'] = train_dataset['Age'].astype(int)
test_dataset['Age'] = test_dataset['Age'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
train_dataset["Embarked"].fillna('S', inplace = True)
test_dataset["Embarked"].fillna('S', inplace = True)

In [15]:
test_dataset["Port"] = test_dataset["Embarked"].map({'S':0, 'C':1 , 'Q':2}).astype(int)
train_dataset["Port"] = train_dataset["Embarked"].map({'S':0, 'C':1 , 'Q':2}).astype(int)

del  test_dataset["Embarked"]
del  train_dataset["Embarked"]

In [16]:
test_dataset['Fare'].fillna(test_dataset['Fare'].median(), inplace = True)

In [17]:
test_dataset['Cabin'].isnull().sum()

327

In [18]:
test_dataset.shape

(418, 11)

In [19]:
train_dataset['Has_Cabin'] = train_dataset['Cabin'].apply(lambda x:0 if type(x) == float else 1)
test_dataset['Has_Cabin'] = test_dataset['Cabin'].apply(lambda x:0 if type(x) == float else 1)

In [20]:
full_dataset = [train_dataset , test_dataset]

In [21]:
for dataset in full_dataset:
    dataset['familysize'] = dataset['SibSp'] + dataset['Parch'] + 1


for dataset in full_dataset:
    dataset['isAlone'] = 0
    dataset.loc[dataset['familysize'] == 1, 'isAlone']=1

In [22]:
train_dataset['Title'] = train_dataset.Name.str.extract('([A-Za-z]+)\.' , expand = False)
test_dataset['Title'] = test_dataset.Name.str.extract('([A-Za-z]+)\.' , expand = False)

In [23]:
train_dataset.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port,Has_Cabin,familysize,isAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,0,0,2,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,1,1,2,0,Mrs


In [50]:
title_unique = train_dataset.Title.unique()
title_unique

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [25]:
for dataset in full_dataset:
    dataset['Title'] =  dataset['Title'].fillna('X')
    dataset['Title']  = dataset['Title'].replace(['Dona','Don', 'Rev', 'Dr', 'Mme', 'Major', 'Lady', 'Col', 'Capt', 'Countess', 'Jonkheer'] , 'Rare')
    dataset['Title']  = dataset['Title'].replace(['Mlle', 'Ms'] , 'Miss')
    dataset['Title']  = dataset['Title'].replace('Mme' , 'Mrs')

In [26]:
#  test_dataset['Title']  = test_dataset['Title'].replace('nan' , 'Rare')


In [27]:
test_dataset.Title.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Rare'], dtype=object)

In [28]:
for dataset in full_dataset:
    dataset['FamilySizeGroup'] = 'Small'
    dataset.loc[dataset['familysize'] == 1 ,  'FamilySizeGroup']= 'Alone'
    dataset.loc[dataset['familysize'] >= 5 ,  'FamilySizeGroup']= 'Big'
    

In [29]:
test_dataset.tail(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port,Has_Cabin,familysize,isAlone,Title,FamilySizeGroup
408,1300,3,"Riordan, Miss. Johanna Hannah""""",female,18,0,0,334915,7.7208,,2,0,1,1,Miss,Alone
409,1301,3,"Peacock, Miss. Treasteall",female,3,1,1,SOTON/O.Q. 3101315,13.775,,0,0,3,0,Miss,Small
410,1302,3,"Naughton, Miss. Hannah",female,38,0,0,365237,7.75,,2,0,1,1,Miss,Alone
411,1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37,1,0,19928,90.0,C78,2,1,2,0,Mrs,Small
412,1304,3,"Henriksson, Miss. Jenny Lovisa",female,28,0,0,347086,7.775,,0,0,1,1,Miss,Alone
413,1305,3,"Spector, Mr. Woolf",male,26,0,0,A.5. 3236,8.05,,0,0,1,1,Mr,Alone
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9,C105,1,1,1,1,Rare,Alone
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38,0,0,SOTON/O.Q. 3101262,7.25,,0,0,1,1,Mr,Alone
416,1308,3,"Ware, Mr. Frederick",male,27,0,0,359309,8.05,,0,0,1,1,Mr,Alone
417,1309,3,"Peter, Master. Michael J",male,24,1,1,2668,22.3583,,1,0,3,0,Master,Small


In [30]:
train_dataset[['familysize' , 'Survived']].groupby(['familysize'] , as_index = False).mean()

Unnamed: 0,familysize,Survived
0,1,0.303538
1,2,0.552795
2,3,0.578431
3,4,0.724138
4,5,0.2
5,6,0.136364
6,7,0.333333
7,8,0.0
8,11,0.0


In [31]:
for dataset in full_dataset:
    dataset['Sex'] = dataset['Sex'].map({'female':1 , 'male':0}).astype(int)

In [32]:
for dataset in full_dataset:
    dataset.loc[dataset['Age'] <=14 , 'Age']=0
    dataset.loc[(dataset['Age'] > 14) & (dataset['Age'] <= 32)  , 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48)  , 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64)  , 'Age'] = 3
    dataset.loc[dataset['Age']  > 64 , 'Age']=4

In [33]:
for dataset in full_dataset:
    dataset.loc[dataset['Fare'] <= 7.71 , 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454)  , 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31)  , 'Fare'] = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

In [34]:
train_dataset.FamilySizeGroup.unique()

array(['Small', 'Alone', 'Big'], dtype=object)

In [35]:
title_mapping = {'Mr':1 , 'Mrs': 2, 'Miss':3, 'Master':4, 'Rare':5, 'Sir':6 }
family_mapping = {'Small':0, 'Alone':1, 'Big':2}

for dataset in full_dataset:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['FamilySizeGroup'] = dataset['FamilySizeGroup'].map(family_mapping)

In [36]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Port,Has_Cabin,familysize,isAlone,Title,FamilySizeGroup
0,1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171,0,,0,0,2,0,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,2,1,0,PC 17599,3,C85,1,1,2,0,2,0
2,3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282,1,,0,0,1,1,3,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,2,1,0,113803,3,C123,0,1,2,0,2,0
4,5,0,3,"Allen, Mr. William Henry",0,2,0,0,373450,1,,0,0,1,1,1,1


In [37]:
for dataset in full_dataset:
    dataset['IsRichChild']= 0
    dataset.loc[(dataset['Age'] <= 0) & (dataset['Pclass']==1),  'IsRichChild'] = 1
    dataset.loc[(dataset['Age'] <= 0) & (dataset['Pclass']==2),  'IsRichChild'] = 1

In [38]:
train_dataset.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [39]:
str('B23')[1]

'2'

In [40]:
for data in full_dataset:
    data['Cabin'] =  data['Cabin'].fillna('X')
    data['Cabin'] = data['Cabin'].apply(lambda x: str(x)[0])
    data['Cabin'] = data['Cabin'].replace(['A', 'D', 'E', 'T'], 'M')
    data['Cabin'] = data['Cabin'].replace(['B', 'C'], 'H')
    data['Cabin'] = data['Cabin'].replace(['F', 'G'], 'L')
    data['Cabin'] = data['Cabin'].map({'X': 0, 'L': 1, 'M': 2, 'H': 3}).astype(int) 

In [41]:
del train_dataset['Name']
del test_dataset['Name']

del train_dataset['SibSp']
del test_dataset['SibSp']

del train_dataset['Parch']
del test_dataset['Parch']

del train_dataset['familysize']
del test_dataset['familysize']

del train_dataset['Cabin']
del test_dataset['Cabin']

del train_dataset['Ticket']
del test_dataset['Ticket']

del train_dataset['Port']
del test_dataset['Port']


In [42]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Has_Cabin,isAlone,Title,FamilySizeGroup,IsRichChild
0,1,0,3,0,1,0,0,0,1,0,0
1,2,1,1,1,2,3,1,0,2,0,0
2,3,1,3,1,1,1,0,1,3,1,0
3,4,1,1,1,2,3,1,0,2,0,0
4,5,0,3,0,2,1,0,1,1,1,0


In [43]:
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Has_Cabin,isAlone,Title,FamilySizeGroup,IsRichChild
0,892,3,0,2,7,0,1,1,1,0
1,893,3,1,2,0,0,0,2,0,0
2,894,2,0,3,1,0,1,1,1,0
3,895,3,0,1,1,0,1,1,1,0
4,896,3,1,1,1,0,0,2,0,0


In [44]:
test_dataset.isnull().any()

PassengerId        False
Pclass             False
Sex                False
Age                False
Fare               False
Has_Cabin          False
isAlone            False
Title              False
FamilySizeGroup    False
IsRichChild        False
dtype: bool

In [45]:
del train_dataset['PassengerId']

X_train = train_dataset.drop("Survived",axis=1)
Y_train = train_dataset["Survived"]
X_test  = test_dataset.drop("PassengerId",axis=1).copy()


In [46]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)

(891, 9)
(891,)
(418, 9)


In [47]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
addpoly = True

In [48]:
if addpoly:
    all_data = pd.concat((X_train , X_test) , ignore_index = True)
    scaler = MinMaxScaler()
    scaler.fit(all_data)
    all_data = scaler.transform(all_data)
    poly = PolynomialFeatures(2)
    all_data=poly.fit_transform(all_data)
    
    X_train =all_data[:train_dataset.shape[0]]
    X_test =all_data[:test_dataset.shape[0]]
    

In [51]:
X_train =all_data[:train_dataset.shape[0]]
X_test =all_data[:test_dataset.shape[0]]

In [52]:
X_train

array([[1.  , 1.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [1.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [1.  , 1.  , 1.  , ..., 0.25, 0.  , 0.  ],
       ...,
       [1.  , 1.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  , ..., 0.25, 0.  , 0.  ],
       [1.  , 1.  , 0.  , ..., 0.25, 0.  , 0.  ]])

In [53]:
all_data

array([[1.  , 1.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [1.  , 0.  , 1.  , ..., 0.  , 0.  , 0.  ],
       [1.  , 1.  , 1.  , ..., 0.25, 0.  , 0.  ],
       ...,
       [1.  , 1.  , 0.  , ..., 0.25, 0.  , 0.  ],
       [1.  , 1.  , 0.  , ..., 0.25, 0.  , 0.  ],
       [1.  , 1.  , 0.  , ..., 0.  , 0.  , 0.  ]])

In [63]:
all_data.shape

(1309, 55)

In [59]:
train_dataset.shape

(891, 10)

In [58]:
test_dataset.shape

(418, 10)

In [60]:
X_train.shape

(891, 55)

In [65]:
Y_train.shape

(891,)