In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost

In [83]:
testfile = 'test.csv'
trainfile ='train.csv'
resultfile = 'gender_submission.csv'

In [84]:
test = pd.read_csv(testfile)
train = pd.read_csv(trainfile)
result = pd.read_csv(resultfile)

In [85]:
print(test.shape)
print(train.shape)
print(result.shape)

(418, 11)
(891, 12)
(418, 2)


In [86]:
i = train.columns
print(i)

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')


In [87]:
#remove useless columns
train = train.drop(['PassengerId','Name', 'Ticket', 'Cabin', 'Age'],axis=1)
col = train.columns
print(col)

Index([u'Survived', u'Pclass', u'Sex', u'SibSp', u'Parch', u'Fare',
       u'Embarked'],
      dtype='object')


In [88]:
#find which columns contain nan values
def nan_finder(df):
    col = df.columns
    nan_list = []
    for i in col:
        k = df[i].isnull().sum()
        nan_list.append([i,k])
        
    #print(nan_cols)
    for i in nan_list:
        print(i)
nan_finder(train)

['Survived', 0]
['Pclass', 0]
['Sex', 0]
['SibSp', 0]
['Parch', 0]
['Fare', 0]
['Embarked', 2]


In [89]:
#remove those rows where 'Embarked' is 'nan'
train = train[pd.notnull(train['Embarked'])]
print(train.shape)

(889, 7)


In [90]:
#convert gender into one hot
def gender_convert(x):
    if x=='male':
        return 1
    else:
        if x=='female':
            return 0
train['Sex'] = train['Sex'].apply(gender_convert)
train.head()


Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,0,3,1,1,0,7.25,S
1,1,1,0,1,0,71.2833,C
2,1,3,0,0,0,7.925,S
3,1,1,0,1,0,53.1,S
4,0,3,1,0,0,8.05,S


In [91]:
#convert embarked into one-hot vector
embarked = pd.get_dummies(train['Embarked'])
#print(embarked)
train = train.join(embarked)
train = train.drop('Embarked', axis=1)
train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,C,Q,S
0,0,3,1,1,0,7.25,0,0,1
1,1,1,0,1,0,71.2833,1,0,0
2,1,3,0,0,0,7.925,0,0,1
3,1,1,0,1,0,53.1,0,0,1
4,0,3,1,0,0,8.05,0,0,1


In [92]:
#replace nan in 'Age' by mean value
#train['Age'].fillna((train['Age'].mean()), inplace=True)

#check for 'nan' in training data
nan_finder(train)

['Survived', 0]
['Pclass', 0]
['Sex', 0]
['SibSp', 0]
['Parch', 0]
['Fare', 0]
['C', 0]
['Q', 0]
['S', 0]


In [93]:
def normalizer(df, key):
    xmean = df[key].mean()
    xmin = df[key].min()
    xmax = df[key].max()
    
    df[key] = (df[key] - xmean)/(xmax - xmin)
    return df[key]

#train['Age'] = normalizer(train, 'Age')
train['Fare'] = normalizer(train, 'Fare')
train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,C,Q,S
0,0,3,1,1,0,-0.048497,0,0,1
1,1,1,0,1,0,0.076487,1,0,0
2,1,3,0,0,0,-0.04718,0,0,1
3,1,1,0,1,0,0.040996,0,0,1
4,0,3,1,0,0,-0.046936,0,0,1


In [94]:
#join result with test for combined preprocessing
#test = pd.merge(test, result, how='outer')
#test.head()

In [95]:
test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin','Age'], axis=1)
test.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,3,male,0,0,7.8292,Q
1,3,female,1,0,7.0,S
2,2,male,0,0,9.6875,Q
3,3,male,0,0,8.6625,S
4,3,female,1,1,12.2875,S


In [96]:
nan_finder(test)

['Pclass', 0]
['Sex', 0]
['SibSp', 0]
['Parch', 0]
['Fare', 1]
['Embarked', 0]


In [97]:
#test['Age'].fillna((test['Age'].mean()), inplace=True)
test['Fare'].fillna((test['Fare'].mean()), inplace=True)
#test = test[pd.notnull(test['Fare'])]
nan_finder(test)

['Pclass', 0]
['Sex', 0]
['SibSp', 0]
['Parch', 0]
['Fare', 0]
['Embarked', 0]


In [98]:
test['Sex'] = test['Sex'].apply(gender_convert)
test.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,3,1,0,0,7.8292,Q
1,3,0,1,0,7.0,S
2,2,1,0,0,9.6875,Q
3,3,1,0,0,8.6625,S
4,3,0,1,1,12.2875,S


In [99]:
#convert embarked into one-hot vector
embarked = pd.get_dummies(test['Embarked'])
#print(embarked)

test = test.join(embarked)
test = test.drop('Embarked', axis=1)
test.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S
0,3,1,0,0,7.8292,0,1,0
1,3,0,1,0,7.0,0,0,1
2,2,1,0,0,9.6875,0,1,0
3,3,1,0,0,8.6625,0,0,1
4,3,0,1,1,12.2875,0,0,1


In [100]:
#test['Age'] = normalizer(test,'Age')
test['Fare'] = normalizer(test,'Fare')
test.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S
0,3,1,0,0,-0.054258,0,1,0
1,3,0,1,0,-0.055877,0,0,1
2,2,1,0,0,-0.050631,0,1,0
3,3,1,0,0,-0.052632,0,0,1
4,3,0,1,1,-0.045556,0,0,1


In [101]:
y_train = train['Survived'].values
y_train = y_train.reshape(y_train.shape[0],1)
y_train.shape

(889, 1)

In [102]:
#y_test = test['Survived'].values
#y_test = y_test.reshape(y_test.shape[0],1)
#y_test.shape

In [103]:
test.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S
0,3,1,0,0,-0.054258,0,1,0
1,3,0,1,0,-0.055877,0,0,1
2,2,1,0,0,-0.050631,0,1,0
3,3,1,0,0,-0.052632,0,0,1
4,3,0,1,1,-0.045556,0,0,1


In [104]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,C,Q,S
0,0,3,1,1,0,-0.048497,0,0,1
1,1,1,0,1,0,0.076487,1,0,0
2,1,3,0,0,0,-0.04718,0,0,1
3,1,1,0,1,0,0.040996,0,0,1
4,0,3,1,0,0,-0.046936,0,0,1


In [105]:
#test = test.drop('Survived', axis=1)
train = train.drop('Survived', axis=1)

In [106]:
test.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S
0,3,1,0,0,-0.054258,0,1,0
1,3,0,1,0,-0.055877,0,0,1
2,2,1,0,0,-0.050631,0,1,0
3,3,1,0,0,-0.052632,0,0,1
4,3,0,1,1,-0.045556,0,0,1


In [107]:
train.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,C,Q,S
0,3,1,1,0,-0.048497,0,0,1
1,1,0,1,0,0.076487,1,0,0
2,3,0,0,0,-0.04718,0,0,1
3,1,0,1,0,0.040996,0,0,1
4,3,1,0,0,-0.046936,0,0,1


In [108]:
x_train = train.values
x_test = test.values

In [109]:
print("Train Input Size: ",train.shape)
print("Test Input Size: ",test.shape)

('Train Input Size: ', (889, 8))
('Test Input Size: ', (418, 8))


In [110]:
print("Train Output Size: ",y_train.shape)
#print("Test Output Size: ",y_test.shape)


('Train Output Size: ', (889, 1))


In [111]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [112]:
model = XGBClassifier()
y_train = y_train.ravel()
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [113]:
#y_test = y_test.ravel()
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]

In [114]:
#accuracy = accuracy_score(y_test, predictions)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [115]:
len(predictions)
pred = np.array(predictions)
pred.shape

(418,)

In [116]:
ans = pd.read_csv('gender_submission.csv')
ans['Survived'] = pred

In [117]:
ans.to_csv('ans4.csv', sep=',')

In [118]:
!ls


ans1.csv  ans4.csv  gender_submission.csv  titanic2.ipynb  titanic.ipynb
ans2.csv  ans5.csv  Pandas_Tut.ipynb	   titanic3.ipynb  train.csv
ans3.csv  ans6.csv  test.csv		   titanic4.ipynb


In [119]:
pred

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  1.,  1.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,
        1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,
        1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  0

In [120]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [121]:
model2 = SVC()
model2.fit(x_train, y_train)
y_pred = model2.predict(x_test)
predictions = [round(value) for value in y_pred]
pred = np.array(predictions)
ans = pd.read_csv('gender_submission.csv')
ans['Survived'] = pred
ans.to_csv('ans5.csv', sep=',')

In [122]:
model3 = LogisticRegression()
model3.fit(x_train, y_train)
y_pred = model3.predict(x_test)
predictions = [round(value) for value in y_pred]
pred = np.array(predictions)
ans = pd.read_csv('gender_submission.csv')
ans['Survived'] = pred
ans.to_csv('ans6.csv', sep=',')

In [123]:
from sklearn.ensemble import RandomForestClassifier
model4 = RandomForestClassifier(n_estimators=10)
model4.fit(x_train, y_train)
y_pred = model4.predict(x_test)
predictions = [round(value) for value in y_pred]
pred = np.array(predictions)
ans = pd.read_csv('gender_submission.csv')
ans['Survived'] = pred
ans.to_csv('ans7.csv', sep=',')