In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost



In [2]:
testfile = 'test.csv'
trainfile ='train.csv'
resultfile = 'gender_submission.csv'

In [3]:
test = pd.read_csv(testfile)
train = pd.read_csv(trainfile)
result = pd.read_csv(resultfile)

In [4]:
print(test.shape)
print(train.shape)
print(result.shape)

(418, 11)
(891, 12)
(418, 2)


In [5]:
i = train.columns
print(i)

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')


In [6]:
#remove useless columns
train = train.drop(['PassengerId','Name', 'Ticket', 'Cabin'],axis=1)
col = train.columns
print(col)

Index([u'Survived', u'Pclass', u'Sex', u'Age', u'SibSp', u'Parch', u'Fare',
       u'Embarked'],
      dtype='object')


In [7]:
#find which columns contain nan values
def nan_finder(df):
    col = df.columns
    nan_list = []
    for i in col:
        k = df[i].isnull().sum()
        nan_list.append([i,k])
        
    #print(nan_cols)
    for i in nan_list:
        print(i)
nan_finder(train)

['Survived', 0]
['Pclass', 0]
['Sex', 0]
['Age', 177]
['SibSp', 0]
['Parch', 0]
['Fare', 0]
['Embarked', 2]


In [8]:
#remove those rows where 'Embarked' is 'nan'
train = train[pd.notnull(train['Embarked'])]
print(train.shape)

(889, 8)


In [9]:
#convert gender into one hot
def gender_convert(x):
    if x=='male':
        return 1
    else:
        if x=='female':
            return 0
train['Sex'] = train['Sex'].apply(gender_convert)
train.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.925,S
3,1,1,0,35.0,1,0,53.1,S
4,0,3,1,35.0,0,0,8.05,S


In [10]:
#convert embarked into one-hot vector
embarked = pd.get_dummies(train['Embarked'])
#print(embarked)
train = train.join(embarked)
train = train.drop('Embarked', axis=1)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [11]:
#replace nan in 'Age' by mean value
train['Age'].fillna((train['Age'].mean()), inplace=True)

#check for 'nan' in training data
nan_finder(train)

['Survived', 0]
['Pclass', 0]
['Sex', 0]
['Age', 0]
['SibSp', 0]
['Parch', 0]
['Fare', 0]
['C', 0]
['Q', 0]
['S', 0]


In [12]:
def normalizer(df, key):
    xmean = df[key].mean()
    xmin = df[key].min()
    xmax = df[key].max()
    
    df[key] = (df[key] - xmean)/(xmax - xmin)
    return df[key]

train['Age'] = normalizer(train, 'Age')
train['Fare'] = normalizer(train, 'Fare')
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,3,1,-0.09603,1,0,-0.048497,0,0,1
1,1,1,0,0.105025,1,0,0.076487,1,0,0
2,1,3,0,-0.045766,0,0,-0.04718,0,0,1
3,1,1,0,0.067327,1,0,0.040996,0,0,1
4,0,3,1,0.067327,0,0,-0.046936,0,0,1


In [13]:
#join result with test for combined preprocessing
#test = pd.merge(test, result, how='outer')
#test.head()

In [14]:
test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [15]:
nan_finder(test)

['Pclass', 0]
['Sex', 0]
['Age', 86]
['SibSp', 0]
['Parch', 0]
['Fare', 1]
['Embarked', 0]


In [16]:
test['Age'].fillna((test['Age'].mean()), inplace=True)
test['Fare'].fillna((test['Fare'].mean()), inplace=True)
#test = test[pd.notnull(test['Fare'])]
nan_finder(test)

['Pclass', 0]
['Sex', 0]
['Age', 0]
['SibSp', 0]
['Parch', 0]
['Fare', 0]
['Embarked', 0]


In [17]:
test['Sex'] = test['Sex'].apply(gender_convert)
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,Q
1,3,0,47.0,1,0,7.0,S
2,2,1,62.0,0,0,9.6875,Q
3,3,1,27.0,0,0,8.6625,S
4,3,0,22.0,1,1,12.2875,S


In [18]:
#convert embarked into one-hot vector
embarked = pd.get_dummies(test['Embarked'])
#print(embarked)

test = test.join(embarked)
test = test.drop('Embarked', axis=1)
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1


In [19]:
test['Age'] = normalizer(test,'Age')
test['Fare'] = normalizer(test,'Fare')
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,3,1,0.055749,0,0,-0.054258,0,1,0
1,3,0,0.220591,1,0,-0.055877,0,0,1
2,2,1,0.418402,0,0,-0.050631,0,1,0
3,3,1,-0.043157,0,0,-0.052632,0,0,1
4,3,0,-0.109094,1,1,-0.045556,0,0,1


In [20]:
y_train = train['Survived'].values
y_train = y_train.reshape(y_train.shape[0],1)
y_train.shape

(889, 1)

In [21]:
#y_test = test['Survived'].values
#y_test = y_test.reshape(y_test.shape[0],1)
#y_test.shape

In [22]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,3,1,0.055749,0,0,-0.054258,0,1,0
1,3,0,0.220591,1,0,-0.055877,0,0,1
2,2,1,0.418402,0,0,-0.050631,0,1,0
3,3,1,-0.043157,0,0,-0.052632,0,0,1
4,3,0,-0.109094,1,1,-0.045556,0,0,1


In [23]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,3,1,-0.09603,1,0,-0.048497,0,0,1
1,1,1,0,0.105025,1,0,0.076487,1,0,0
2,1,3,0,-0.045766,0,0,-0.04718,0,0,1
3,1,1,0,0.067327,1,0,0.040996,0,0,1
4,0,3,1,0.067327,0,0,-0.046936,0,0,1


In [24]:
#test = test.drop('Survived', axis=1)
train = train.drop('Survived', axis=1)

In [25]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,3,1,0.055749,0,0,-0.054258,0,1,0
1,3,0,0.220591,1,0,-0.055877,0,0,1
2,2,1,0.418402,0,0,-0.050631,0,1,0
3,3,1,-0.043157,0,0,-0.052632,0,0,1
4,3,0,-0.109094,1,1,-0.045556,0,0,1


In [26]:
train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,3,1,-0.09603,1,0,-0.048497,0,0,1
1,1,0,0.105025,1,0,0.076487,1,0,0
2,3,0,-0.045766,0,0,-0.04718,0,0,1
3,1,0,0.067327,1,0,0.040996,0,0,1
4,3,1,0.067327,0,0,-0.046936,0,0,1


In [27]:
x_train = train.values
x_test = test.values

In [28]:
print("Train Input Size: ",train.shape)
print("Test Input Size: ",test.shape)

('Train Input Size: ', (889, 9))
('Test Input Size: ', (418, 9))


In [29]:
print("Train Output Size: ",y_train.shape)
#print("Test Output Size: ",y_test.shape)


('Train Output Size: ', (889, 1))


In [30]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [31]:
model = SVC()
y_train = y_train.ravel()
model.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
#y_test = y_test.ravel()
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]

In [33]:
#accuracy = accuracy_score(y_test, predictions)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [34]:
len(predictions)
pred = np.array(predictions)
pred.shape

(418,)

In [35]:
ans = pd.read_csv('gender_submission.csv')
ans['Survived'] = pred

In [36]:
ans.to_csv('ans2.csv', sep=',')

In [37]:
!ls


ans1.csv	       Pandas_Tut.ipynb  titanic-Copy1.ipynb  titanic_v3.ipynb
ans2.csv	       test.csv		 titanic.ipynb	      train.csv
gender_submission.csv  titanic2.ipynb	 titanic_v2.ipynb


In [38]:
pred

array([ 0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,
        1.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  1.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,
        1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  0