In [665]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


# Read the training and test tables from the local `all/` directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('all/train.csv', 'all/test.csv')
)

#scaling
from sklearn.preprocessing import StandardScaler
def scaling(data):
    """Standardize `data` with a StandardScaler fitted on `data` itself.

    A brand-new scaler is created on every call, so the statistics used for
    the transformation come only from the array passed in; nothing is shared
    between calls.

    Parameters
    ----------
    data : array-like accepted by ``StandardScaler.fit``.

    Returns
    -------
    The transformed feature array produced by the scaler.
    """
    return StandardScaler().fit_transform(data)

def manipulate_data(data):
    """Turn a raw passenger table into a purely numeric feature table.

    Steps, in order:
      1. drop the PassengerId, Name, Cabin and Ticket columns;
      2. one-hot encode Pclass, Embarked and Sex, dropping the first level of each;
      3. fill missing Age and Fare values with the respective column median;
      4. add Family_size = Parch + SibSp and drop the two source columns.

    The input frame is not modified; a new frame is returned. Any other
    column present in `data` is passed through unchanged.
    """
    encoded = pd.get_dummies(
        data.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket']),
        columns=['Pclass', 'Embarked', 'Sex'],
        drop_first=True,
    )

    # Medians are taken from the frame being processed, so each table that
    # goes through this function is imputed with its own statistics.
    prepared = encoded.assign(
        Age=encoded['Age'].fillna(encoded['Age'].median()),
        Fare=encoded['Fare'].fillna(encoded['Fare'].median()),
        Family_size=encoded['Parch'] + encoded['SibSp'],
    )

    # Parch and SibSp are now represented by Family_size.
    return prepared.drop(columns=['Parch', 'SibSp'])


# Preprocess both tables. NOTE: df_train / df_test are rebound to the
# processed frames, so this cell cannot be re-run without reloading the CSVs
# first (the columns manipulate_data drops would no longer exist).
df_train = manipulate_data(df_train)
df_train.info()
# A bare `df.head()` is only rendered when it is the last expression of a
# cell, so each preview is displayed explicitly.
display(df_train.head())

print('\n')
df_test = manipulate_data(df_test)
df_test.info()
# Preview the processed test frame (previously the training frame was shown
# a second time here).
display(df_test.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived       891 non-null int64
Age            891 non-null float64
Fare           891 non-null float64
Pclass_2       891 non-null uint8
Pclass_3       891 non-null uint8
Embarked_Q     891 non-null uint8
Embarked_S     891 non-null uint8
Sex_male       891 non-null uint8
Family_size    891 non-null int64
dtypes: float64(2), int64(2), uint8(5)
memory usage: 32.3 KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
Age            418 non-null float64
Fare           418 non-null float64
Pclass_2       418 non-null uint8
Pclass_3       418 non-null uint8
Embarked_Q     418 non-null uint8
Embarked_S     418 non-null uint8
Sex_male       418 non-null uint8
Family_size    418 non-null int64
dtypes: float64(2), int64(1), uint8(5)
memory usage: 11.9 KB


Unnamed: 0,Survived,Age,Fare,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Sex_male,Family_size
0,0,22.0,7.25,0,1,0,1,1,1
1,1,38.0,71.2833,0,0,0,0,0,1
2,1,26.0,7.925,0,1,0,1,0,0
3,1,35.0,53.1,0,0,0,1,0,1
4,0,35.0,8.05,0,1,0,1,1,0


In [666]:
# single-layer stacking
from sklearn.model_selection import KFold
class SingleLayerStacking(object):
    """Majority-vote ensemble whose members are trained on different K-fold splits.

    `fit` shuffles the data into `n_folds` folds and trains each base model on
    the training part of one fold; `predict` lets every base model vote and
    returns the most frequent label per sample.

    Parameters
    ----------
    n_folds : int
        Number of K-fold splits used to derive the training subsets.
    base_models : list
        Estimators exposing ``fit(X, y)`` and ``predict(X)``.
    random_state : int or None, optional
        Forwarded to ``KFold`` so the fold assignment can be reproduced.
        ``None`` (default) keeps the unseeded shuffle.
    """

    def __init__(self, n_folds, base_models, random_state=None):
        self.n_folds = n_folds
        self.base_models = base_models
        self.random_state = random_state

    def fit(self, X, y, sample_weight=None):
        """Train every base model on the training indices of one fold.

        Model ``i`` uses fold ``i % n_folds``: when there are as many folds as
        models each model gets its own fold; with fewer folds the folds are
        reused so no model is left unfitted; with more folds the surplus folds
        are simply not used.

        Parameters
        ----------
        X : array-like, indexable by row after ``np.asarray``.
        y : array-like of labels aligned with the rows of `X`.
        sample_weight : array-like or None
            Per-row weights. When given, the rows selected for a model are
            forwarded to that model's ``fit`` as ``sample_weight``.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)

        kf = KFold(n_splits=self.n_folds, shuffle=True,
                   random_state=self.random_state)
        # Only the training half of each split is needed; the held-out half
        # is never used by this class.
        train_splits = [train_index for train_index, _ in kf.split(X)]

        for i, model in enumerate(self.base_models):
            train_index = train_splits[i % len(train_splits)]
            if sample_weight is None:
                model.fit(X[train_index], y[train_index])
            else:
                model.fit(X[train_index], y[train_index],
                          sample_weight=sample_weight[train_index])
        return self

    def predict(self, X):
        """Return the majority vote of the base models for each row of `X`.

        Predictions are cast to ``int`` and counted with ``np.bincount``, so
        labels must be non-negative integers. On a tie the smallest label wins
        (``np.argmax`` returns the first maximum). The result is a float array
        with one entry per row of `X`.
        """
        S_pred = np.zeros((len(self.base_models), X.shape[0]))
        for i, model in enumerate(self.base_models):
            S_pred[i] = model.predict(X)
        # One row per sample, one column per model.
        S_pred = S_pred.T.astype(int)
        y_predict = np.zeros(S_pred.shape[0])
        for example in range(S_pred.shape[0]):
            y_predict[example] = np.argmax(np.bincount(S_pred[example]))
        return y_predict

In [667]:
# Work on a float copy: the in-place shuffle then cannot touch df_train, and
# the array is numeric whatever dtype the dummy columns carry.
value = df_train.values.astype(float)
# Seeded shuffle so the train/validation split is the same on every run.
np.random.RandomState(0).shuffle(value)
X = value[:, 1:]  # column 0 is Survived; the remaining columns are features
y = value[:, 0]
y_train = value[:800, 0]
y_val = value[800:, 0]

X_train = X[:800]
X_val = X[800:]

# Hold-out experiment: scaling statistics come from the training rows only
# and are reused for the validation rows, so both live in the same space.
holdout_scaler = StandardScaler().fit(X_train)
X_train_sc = holdout_scaler.transform(X_train)
X_val_sc = holdout_scaler.transform(X_val)

# Final model: scaler fitted on every labelled row; the same scaler is
# applied to the test set below instead of fitting a separate one on it.
full_scaler = StandardScaler().fit(X)
X_sc = full_scaler.transform(X)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
print('\n',X)

X_test = df_test.values.astype(float)
X_test_sc = full_scaler.transform(X_test)
print('\n', X_test.shape)
print('\n',X_test)

(800, 8) (800,) (91, 8) (91,)

 [[ 48.      34.375    0.     ...   1.       0.       4.    ]
 [ 28.       7.75     0.     ...   0.       1.       0.    ]
 [ 28.       7.8292   0.     ...   0.       1.       0.    ]
 ...
 [ 30.      12.35     1.     ...   0.       0.       0.    ]
 [ 28.       7.8958   0.     ...   1.       1.       0.    ]
 [ 17.     110.8833   0.     ...   0.       1.       2.    ]]

 (418, 8)

 [[34.5     7.8292  0.     ...  0.      1.      0.    ]
 [47.      7.      0.     ...  1.      0.      1.    ]
 [62.      9.6875  1.     ...  0.      1.      0.    ]
 ...
 [38.5     7.25    0.     ...  1.      1.      0.    ]
 [27.      8.05    0.     ...  1.      1.      0.    ]
 [27.     22.3583  0.     ...  0.      1.      2.    ]]


In [668]:
# Base learners for the single-layer (voting) ensemble.
# C is passed by keyword: current scikit-learn releases only accept
# estimator parameters as keyword arguments.
# NOTE(review): svc_1 and svc_2 are configured identically; they only differ
# through the fold each one is trained on inside SingleLayerStacking.fit.
svc_1 = SVC(C=20)
svc_2 = SVC(C=20)
rfc_1 = RandomForestClassifier(n_estimators=100, max_depth=6)
rfc_2 = RandomForestClassifier(n_estimators=100, max_depth=7)
rfc_3 = RandomForestClassifier(n_estimators=100, max_depth=8)
# One fold per base model (5 folds, 5 models).
stacking = SingleLayerStacking(n_folds=5, base_models=[svc_1, svc_2, rfc_1, rfc_2, rfc_3])



In [669]:
from sklearn.metrics import accuracy_score

def _report_scores(label, scores):
    """Print the per-fold `scores` under a `label:` heading, then their mean."""
    print('\n' + label + ':\n', scores)
    print(scores.mean())


# 5-fold accuracy of the voting ensemble, evaluated by hand because
# SingleLayerStacking is not a scikit-learn estimator.
kf = KFold(n_splits=5, shuffle=True)
scores_stacking = np.zeros(5)
for i, (train_index, test_index) in enumerate(kf.split(X_sc)):
    X_train_i, y_train_i = X_sc[train_index], y[train_index]
    X_test_i, y_test_i = X_sc[test_index], y[test_index]
    stacking.fit(X_train_i, y_train_i)
    scores_stacking[i] = accuracy_score(y_test_i, stacking.predict(X_test_i))
_report_scores('stacking', scores_stacking)

# Individual base models for comparison: the SVCs are scored on the scaled
# features, the random forests on the raw ones.
scores_svc_1 = cross_val_score(svc_1, X_sc, y, cv=5, scoring='accuracy')
_report_scores('svc_1', scores_svc_1)

scores_svc_2 = cross_val_score(svc_2, X_sc, y, cv=5, scoring='accuracy')
_report_scores('svc_2', scores_svc_2)

scores_rfc_1 = cross_val_score(rfc_1, X, y, cv=5, scoring='accuracy')
_report_scores('rfc_1', scores_rfc_1)

scores_rfc_2 = cross_val_score(rfc_2, X, y, cv=5, scoring='accuracy')
_report_scores('rfc_2', scores_rfc_2)

scores_rfc_3 = cross_val_score(rfc_3, X, y, cv=5, scoring='accuracy')
_report_scores('rfc_3', scores_rfc_3)


stacking:
 [0.83798883 0.84831461 0.82022472 0.76966292 0.84831461]
0.8249011361496453

svc_1:
 [0.82122905 0.83240223 0.80337079 0.80898876 0.80225989]
0.8136501444967298

svc_2:
 [0.82122905 0.83240223 0.80337079 0.80898876 0.80225989]
0.8136501444967298

rfc_1:
 [0.83240223 0.82681564 0.8258427  0.82022472 0.81355932]
0.8237689229718417

rfc_2:
 [0.8547486  0.83798883 0.83146067 0.83146067 0.83050847]
0.837233450611695

rfc_3:
 [0.84357542 0.82122905 0.8258427  0.81460674 0.79661017]
0.8203728153935032


In [670]:
# Train the chosen model on every labelled row and write a submission file.
selective_model = rfc_1
selective_model.fit(X_sc, y)
predict_y = selective_model.predict(X_test_sc)
# Index length follows the number of predictions instead of a hard-coded 418.
# NOTE(review): 892 is assumed to be the first PassengerId of the test file
# (the original column is dropped during preprocessing) - confirm.
passenger_ids = np.arange(len(predict_y)) + 892
result = pd.Series(predict_y, passenger_ids, dtype=int, name='Survived')
result.to_csv('result.csv', index_label='PassengerId',header=True)

In [671]:
# multi-layer stacking
from sklearn.model_selection import KFold


class MultiLayerStacking(object):
    """Two-level stacking: base models feed a second-level `stacker`.

    Parameters
    ----------
    n_folds : int
        Number of K-fold splits used to build out-of-fold predictions.
    stacker : estimator
        Second-level model with ``fit(X, y)`` and ``predict(X)``; it is
        trained on the base models' predictions.
    base_models : list
        First-level estimators with ``fit(X, y)`` and ``predict(X)``.
    """

    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        """Fit the whole stack on (X, y) and return predictions for T.

        Parameters
        ----------
        X : array-like, training features.
        y : array-like, training ground truth aligned with `X`.
        T : array-like, features of the samples to predict.

        Returns
        -------
        The stacker's predictions for the rows of `T`.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        # Fixed seed: every base model sees exactly the same fold assignment.
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=2016)
        folds = list(splitter.split(X))

        # Column i holds base model i's predictions.
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))

        # for each base model
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], len(folds)))

            for j, (train_idx, test_idx) in enumerate(folds):
                clf.fit(X[train_idx], y[train_idx])
                # Out-of-fold prediction: each training row is predicted by a
                # model that was not fitted on it.
                S_train[test_idx, i] = clf.predict(X[test_idx])
                S_test_i[:, j] = clf.predict(T)

            # Average the per-fold predictions for T into a single feature.
            S_test[:, i] = S_test_i.mean(1)

        self.stacker.fit(S_train, y)
        return self.stacker.predict(S_test)

In [676]:
from sklearn.linear_model import LogisticRegression
# Base learners for the two-level stack. C is passed by keyword: current
# scikit-learn releases only accept estimator parameters as keyword arguments.
svc_3 = SVC(C=20)
svc_4 = SVC(C=20)
rfc_4 = RandomForestClassifier(n_estimators=100, max_depth=6)
rfc_5 = RandomForestClassifier(n_estimators=100, max_depth=7)
rfc_6 = RandomForestClassifier(n_estimators=100, max_depth=8)
# NOTE: despite its name, the second-level model is a shallow random forest,
# not a logistic regression.
log_stacker = RandomForestClassifier(n_estimators=100, max_depth=3)
multi_stacking = MultiLayerStacking(n_folds=5, stacker=log_stacker ,base_models=[svc_3, svc_4, rfc_4, rfc_5, rfc_6] )
y_pred = multi_stacking.fit_predict(X_sc, y, X_test_sc)
# Index length follows the number of predictions instead of a hard-coded 418.
# NOTE(review): 892 is assumed to be the first PassengerId of the test file - confirm.
passenger_ids = np.arange(len(y_pred)) + 892
result = pd.Series(y_pred, passenger_ids, dtype=int, name='Survived')
result.to_csv('multi_result.csv', index_label='PassengerId',header=True)
print(y_pred)

[0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0.
 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0.
 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1.
 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1.
 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0.
 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.

In [677]:
from sklearn.model_selection import KFold

# 5-fold accuracy of the two-level stack: the whole stack is refitted on the
# training part of every fold and scored on the held-out part.
kf = KFold(n_splits=5, shuffle=True)
fold_accuracies = []
for train_index, test_index in kf.split(X_sc):
    X_train_i, y_train_i = X_sc[train_index], y[train_index]
    X_test_i, y_test_i = X_sc[test_index], y[test_index]
    y_pred_i = multi_stacking.fit_predict(X_train_i, y_train_i, X_test_i)
    fold_accuracies.append(accuracy_score(y_test_i, y_pred_i))
scores_multi_stacking = np.array(fold_accuracies)
print('\nmulti_stacking:\n', scores_multi_stacking)
print(scores_multi_stacking.mean())


multi_stacking:
 [0.81564246 0.82022472 0.87078652 0.78651685 0.83707865]
0.8260498399347185
