In [18]:
%matplotlib inline
import numpy as np
import pandas as pd
import re as re

train = pd.read_csv('train.csv', header = 0, dtype={'Age': np.float64})
test  = pd.read_csv('test.csv' , header = 0, dtype={'Age': np.float64})
full_data = [train, test]

print (train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB
None


# Feature Engineering


## Pclass


In [19]:
train[["Pclass", "Survived"]].groupby("Pclass").mean()


Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


## Sex

In [20]:
train[["Sex", "Survived"]].groupby("Sex").mean()


Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


## Parch and SibSp

In [21]:
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
print (train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())


   FamilySize  Survived
0           1  0.303538
1           2  0.552795
2           3  0.578431
3           4  0.724138
4           5  0.200000
5           6  0.136364
6           7  0.333333
7           8  0.000000
8          11  0.000000


In [22]:


for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
print (train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean())



   IsAlone  Survived
0        0  0.505650
1        1  0.303538


## Embarked

In [23]:
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
print (train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())


  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


## Fare
Fill the missing values with median

In [24]:
for dataset in full_data:
#     dataset['Fare'] = dataset['Fare'][dataset['Fare']].astype(int)
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'][train['Fare'].notnull()].median())
    
#     dataset['Fare'] = dataset['Fare'].astype('int64')
# train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
# print (train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean())

train.columns
train.info()
# train

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
FamilySize     891 non-null int64
IsAlone        891 non-null int64
dtypes: float64(2), int64(7), object(5)
memory usage: 104.4+ KB


## Age


In [25]:
train["Age"][train["Age"].isnull()].size

177

There are many missing values for age so we can fill it with random values

In [26]:
for dataset in full_data:
    age_avg = dataset["Age"][dataset["Age"].notnull()].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()

    age_rand_list = np.random.randint(age_avg-age_std, age_avg+age_std, size=age_null_count)
    dataset["Age"][dataset["Age"].isnull()] = age_rand_list
    dataset['Age'] = dataset['Age'].astype(int)
#     dataset["CategoricalAge"] = pd.qcut(dataset["Age"], 5)
# #     print(age_null_count)
# print (train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
FamilySize     891 non-null int64
IsAlone        891 non-null int64
dtypes: float64(1), int64(8), object(5)
memory usage: 104.4+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


## Name


In [27]:
import re as re
def get_title(name):
	title_search = re.search(' ([A-Za-z]+)\.', name)
	# If the title exists, extract and return it.
	if title_search:
		return title_search.group(1)
	return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

print(pd.crosstab(train['Title'], train['Sex']))
# train

Sex       female  male
Title                 
Capt           0     1
Col            0     2
Countess       1     0
Don            0     1
Dr             1     6
Jonkheer       0     1
Lady           1     0
Major          0     2
Master         0    40
Miss         182     0
Mlle           2     0
Mme            1     0
Mr             0   517
Mrs          125     0
Ms             1     0
Rev            0     6
Sir            0     1


In [28]:
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

print (train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())
train.info()

    Title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
FamilySize     891 non-null int64
IsAlone        891 non-null int64
Title          891 non-null object
dtypes: float64(1), int64(8), object(6)
memory usage: 111.4+ KB


# Data cleaning


In [29]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize', 'IsAlone', 'Title'], dtype='object')

In [30]:

for dataset in full_data:
#     # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} )

    #     # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    #     # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} )
    
#     # Mapping Fare
#     dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
#     dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
#     dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
#     dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
#     dataset['Fare'] = dataset['Fare']
    
#     # Mapping Age
#     dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
#     dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
#     dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
#     dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
#     dataset.loc[ dataset['Age'] > 64, 'Age']                           = 4

# Feature Selection
train.info()
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp',\
                 'Parch', 'FamilySize']
traindr = train.drop(drop_elements, axis = 1)
# train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)

testdr  = test.drop(drop_elements, axis = 1)

print (traindr.head(10))

traind = traindr.values
testd  = testdr.values
traindr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null int64
FamilySize     891 non-null int64
IsAlone        891 non-null int64
Title          891 non-null int64
dtypes: float64(1), int64(11), object(3)
memory usage: 111.4+ KB
   Survived  Pclass  Sex  Age     Fare  Embarked  IsAlone  Title
0         0       3    1   22   7.2500         0        0      1
1         1       1    0   38  71.2833         1        0      3
2         1       3    0   26   7.9250         0        1      2
3         1       1    0   35  53.1000         0       

In [31]:
# train
X = traindr[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'IsAlone', 'Title']].values
y = traindr['Survived'].values

from sklearn.preprocessing import StandardScaler
ss = StandardScaler(copy=False)
ss.fit_transform(X)


array([[ 0.82737724,  0.73769513, -0.56091307, ..., -0.56883712,
        -1.2316449 , -0.7075504 ],
       [-1.56610693, -1.35557354,  0.62391086, ...,  1.00518113,
        -1.2316449 ,  1.2352151 ],
       [ 0.82737724, -1.35557354, -0.26470709, ..., -0.56883712,
         0.81192233,  0.26383235],
       ..., 
       [ 0.82737724, -1.35557354,  0.47580787, ..., -0.56883712,
        -1.2316449 ,  0.26383235],
       [-1.56610693,  0.73769513, -0.26470709, ...,  1.00518113,
         0.81192233, -0.7075504 ],
       [ 0.82737724,  0.73769513,  0.17960189, ...,  2.57919938,
         0.81192233, -0.7075504 ]])

In [32]:

from sklearn.cross_validation import train_test_split
[X_train, X_cv, y_train, y_cv] = train_test_split(X, y, test_size=0.0, random_state=23)
X_test = testdr[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'IsAlone', 'Title']].values

ss.transform(X_test)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
out = pd.DataFrame(data=X_test)
out.to_csv(path_or_buf='testout.csv', header=None, index=False)

(891, 7)
(891,)
(418, 7)


In [33]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# svc = SVC(C=5, kernel='rbf',  coef0=1, shrinking=True, probability=True, tol=0.0001, cache_size=200, class_weight=None, verbose=True, max_iter=-1, random_state=None)
svc = SVC(probability=True, C=1, kernel = 'poly', degree=4)
svc.fit(X_train, y_train)
y_pred_train = svc.predict(X_train)
y_pred_cv = svc.predict(X_cv)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_cv, y_pred_cv))

ValueError: Found array with 0 sample(s) (shape=(0, 7)) while a minimum of 1 is required.

In [None]:
print(X_test)
# y_pred_teat = svc.predict(X_test)
# stest  = pd.read_csv('test.csv' , header = 0, dtype={'Age': np.float64})
# s = np.vstack((stest['PassengerId'].values, y_pred_teat))
# s

# submit = pd.DataFrame(data =s.transpose(), columns=("PassengerId", "Survived"))
# submit
# submit.to_csv(path_or_buf='svcstdpoly.csv', index=False)