In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


df_train = pd.read_csv('all/train.csv')
df_test = pd.read_csv('all/test.csv')

#scaling
from sklearn.preprocessing import StandardScaler
def scaling(data):
    scaler = StandardScaler()
    scaler.fit(data)
    features = scaler.transform(data)
    return features

def manipulate_data(data):
    
    data = data.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
    data = pd.get_dummies(data, columns = ['Pclass','Embarked','Sex'],drop_first=True)

    # fill missingdata
    mean_age = data.Age.median()
    data.Age = data.Age.fillna(mean_age)

    mean_fare = data.Fare.median()
    data.Fare = data.Fare.fillna(mean_fare)

    # feature combining
    data['Family_size'] = data.Parch + data.SibSp
    data = data.drop(['Parch', 'SibSp'], axis=1)
    
    # drop useless and distracting features
    data = data.drop(['Embarked_Q', 'Pclass_2'], axis=1)
    return data


df_train = manipulate_data(df_train)
df_train.info()
df_train.head()

print('\n')
df_test = manipulate_data(df_test)
df_test.info()
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived       891 non-null int64
Age            891 non-null float64
Fare           891 non-null float64
Pclass_3       891 non-null uint8
Embarked_S     891 non-null uint8
Sex_male       891 non-null uint8
Family_size    891 non-null int64
dtypes: float64(2), int64(2), uint8(3)
memory usage: 30.5 KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
Age            418 non-null float64
Fare           418 non-null float64
Pclass_3       418 non-null uint8
Embarked_S     418 non-null uint8
Sex_male       418 non-null uint8
Family_size    418 non-null int64
dtypes: float64(2), int64(1), uint8(3)
memory usage: 11.1 KB


Unnamed: 0,Survived,Age,Fare,Pclass_3,Embarked_S,Sex_male,Family_size
0,0,22.0,7.25,1,1,1,1
1,1,38.0,71.2833,0,0,0,1
2,1,26.0,7.925,1,1,0,0
3,1,35.0,53.1,0,1,0,1
4,0,35.0,8.05,1,1,1,0


In [5]:
value = df_train.values
np.random.shuffle(value)
X = value[:, 1:]
X_sc = scaling(X)
y = value[:, 0]
y_train = value[:800, 0]
y_val = value[800:, 0]

X_train = X[:800]
X_train_sc = scaling(X_train)
X_val = X[800:]
X_val_sc = scaling(X_val)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
print('\n',X)

X_test = df_test.values
X_test_sc = scaling(X_test)
print('\n', X_test.shape)
print('\n',X_test)

(800, 6) (800,) (91, 6) (91,)

 [[44.      7.925   1.      1.      1.      0.    ]
 [28.      7.75    1.      0.      1.      0.    ]
 [28.      7.75    1.      0.      1.      0.    ]
 ...
 [ 2.     21.075   1.      1.      1.      4.    ]
 [38.      7.8958  1.      1.      1.      0.    ]
 [25.      7.25    1.      1.      1.      0.    ]]

 (418, 6)

 [[34.5     7.8292  1.      0.      1.      0.    ]
 [47.      7.      1.      1.      0.      1.    ]
 [62.      9.6875  0.      0.      1.      0.    ]
 ...
 [38.5     7.25    1.      1.      1.      0.    ]
 [27.      8.05    1.      1.      1.      0.    ]
 [27.     22.3583  1.      0.      1.      2.    ]]


In [6]:
# naive bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred_gnb = gnb.predict(X_val)
print(confusion_matrix(y_val, pred_gnb))
print(classification_report(y_val, pred_gnb))
print('validation score: ', accuracy_score(y_val, pred_gnb))


[[49  3]
 [15 24]]
             precision    recall  f1-score   support

        0.0       0.77      0.94      0.84        52
        1.0       0.89      0.62      0.73        39

avg / total       0.82      0.80      0.79        91

validation score:  0.8021978021978022


In [7]:
#logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
pred_logreg = logreg.predict(X_val)
print(confusion_matrix(y_val, pred_logreg))
print(classification_report(y_val, pred_logreg))
print('validation score: ', accuracy_score(y_val, pred_logreg))

# retrain with all data and output
logreg.fit(X, y)
predict_y = logreg.predict(X_test)

result = pd.Series(predict_y, np.arange(418)+ 892, dtype=int, name='Survived')
result.to_csv('log_result.csv', index_label='PassengerId',header=True)


[[50  2]
 [16 23]]
             precision    recall  f1-score   support

        0.0       0.76      0.96      0.85        52
        1.0       0.92      0.59      0.72        39

avg / total       0.83      0.80      0.79        91

validation score:  0.8021978021978022


In [8]:
# svm
from sklearn.svm import SVC
svc = SVC(C = 10)#, gamma = 0.01, probability=True)
svc.fit(X_train_sc, y_train)
pred_svc = svc.predict(X_val_sc)
print(confusion_matrix(y_val, pred_svc))
print(classification_report(y_val, pred_svc))
print('validation score: ', accuracy_score(y_val, pred_svc))

# retrain with all data and output
svc.fit(X_sc, y)
predict_y = svc.predict(X_test_sc)
print(svc.n_support_)
result = pd.Series(predict_y, np.arange(418)+ 892, dtype=int, name='Survived')
result.to_csv('svc_result.csv', index_label='PassengerId',header=True)


[[49  3]
 [17 22]]
             precision    recall  f1-score   support

        0.0       0.74      0.94      0.83        52
        1.0       0.88      0.56      0.69        39

avg / total       0.80      0.78      0.77        91

validation score:  0.7802197802197802
[186 178]


In [11]:
# RF
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, max_depth=5)

rfc.fit(X_train, y_train)
print(df_train.drop('Survived', axis=1).columns)
print(rfc.feature_importances_)

score_val = rfc.score(X_val, y_val)
print(confusion_matrix(y_val, pred_svc))
print(classification_report(y_val, pred_svc))
print('validation score: ', score_val)

# retrain with all data and output
rfc.fit(X, y)
predict_y = rfc.predict(X_test)

result = pd.Series(predict_y, np.arange(418)+ 892, dtype=int, name='Survived')
result.to_csv('RF_result.csv', index_label='PassengerId',header=True)


Index(['Age', 'Fare', 'Pclass_3', 'Embarked_S', 'Sex_male', 'Family_size'], dtype='object')
[0.11288848 0.18903907 0.131399   0.02521377 0.46223414 0.07922554]
[[49  3]
 [17 22]]
             precision    recall  f1-score   support

        0.0       0.74      0.94      0.83        52
        1.0       0.88      0.56      0.69        39

avg / total       0.80      0.78      0.77        91

validation score:  0.7912087912087912


In [10]:
from sklearn.model_selection import cross_val_score
scores_svc = cross_val_score(svc, X_sc, y, cv=10, scoring='accuracy')
print('\nsfc:\n', scores_svc)
print(scores_svc.mean())
scores_rfc = cross_val_score(rfc, X, y, cv=10, scoring='accuracy')
print('\nrfc:\n', scores_rfc)
print(scores_rfc.mean())


sfc:
 [0.81111111 0.83333333 0.79775281 0.8988764  0.85393258 0.84269663
 0.87640449 0.75280899 0.84269663 0.80681818]
0.8316431165588469

rfc:
 [0.85555556 0.81111111 0.80898876 0.91011236 0.80898876 0.84269663
 0.86516854 0.78651685 0.84269663 0.80681818]
0.833865338781069
