In [99]:
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")

from sklearn.base import clone as skclone
import sklearn.cross_validation as skcv
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

import common

## Index

### prepare the data

In [100]:
# ***************************** load train data
feature_names = ["Pclass","Age","SibSp","Parch","Fare","IsMale","Ticket-4digit","Ticket-5digit","Ticket-6digit"]
train_df = pd.read_csv("train_processed.csv",index_col="PassengerId")

# ***************************** split the train data into train-set and validation set
# Fix the seed of the random split so that the train/validation partition
# (and every number derived from it) is the same after a kernel restart.
SPLIT_SEED = 42
Xtrain, Xvalidate, ytrain, yvalidate = skcv.train_test_split(
    train_df[feature_names], train_df["Survived"],
    test_size=0.25, random_state=SPLIT_SEED)

# ***************************** scale train data
# The scaler is fitted on the training part only; the validation part is
# transformed with the statistics learned from the training part.
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xvalidate_scaled = scaler.transform(Xvalidate)

### fit base estimators

In [101]:
# Each entry is (method name, whether the method needs scaled features).
# NOTE(review): the saved outputs further down list only the lr/svc/knn
# columns, so they were presumably produced with just the first three
# entries -- re-run the notebook to refresh them.
methods = [
    ("lr", True),
    ("svc", True),
    ("knn", True),
    ("rf", False),
    ("gbdt", False),
]

# Record that keeps a fitted estimator together with its name and scaling flag.
Estimator = collections.namedtuple("Estimator", ["estimator", "name", "need_scale"])
def fit_estimator(name,need_scale,y,X,scaledX):
    """Load the stored predictor for `name`, clone it and fit the clone.

    The predictor is read with ``common.load_predictor("<name>.pkl")`` and
    copied with ``skclone``; the copy is then fitted against ``y`` using
    ``scaledX`` when ``need_scale`` is true and ``X`` otherwise.

    Returns an ``Estimator(estimator, name, need_scale)`` record holding
    the fitted clone.
    """
    # Pick the feature matrix this method should be trained on.
    features = scaledX if need_scale else X

    model = skclone(common.load_predictor("%s.pkl"%name))
    model.fit(features,y)

    return Estimator(model,name,need_scale)

# Fit one base estimator per entry of `methods` on the training split.
base_estimators = [
    fit_estimator(name, need_scale, y=ytrain, X=Xtrain, scaledX=Xtrain_scaled)
    for (name, need_scale) in methods
]

### generate predictions on validation sets

In [102]:
def predict_features(base_estimators,X,scaledX):
    """Gather the predictions of every base estimator into one DataFrame.

    Parameters
    ----------
    base_estimators : iterable of records exposing ``estimator``, ``name``
        and ``need_scale`` attributes.
    X : DataFrame of unscaled features; its index becomes the result's index.
    scaledX : scaled counterpart of ``X``, passed to estimators whose
        ``need_scale`` flag is true.

    Returns
    -------
    DataFrame indexed like ``X`` with one column per estimator, named after
    the estimator's ``name``.
    """
    column_names = []
    predictions = []
    for member in base_estimators:
        inputs = scaledX if member.need_scale else X
        predictions.append(member.estimator.predict(inputs))
        column_names.append(member.name)

    # The stacked array has one row per estimator; transpose it so that each
    # estimator becomes a column (assumes every predict() returns a 1-D array).
    stacked = np.asarray(predictions).T
    return pd.DataFrame(stacked, index=X.index, columns=column_names)

# ***************************** base-estimator predictions on the validation set
# (nothing is fitted here; each fitted base estimator only predicts)
validate_basepredicts = predict_features(base_estimators, X=Xvalidate, scaledX=Xvalidate_scaled)

In [103]:
# Peek at the first rows: one column per base estimator, indexed by PassengerId.
validate_basepredicts.head(5)

Unnamed: 0_level_0,lr,svc,knn
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
751,1,1,1
796,0,0,0
887,0,0,0
423,0,0,1
690,1,1,1


### train ensemble model

In [104]:
# Ensemble model: logistic regression over the base predictions, with the
# regularisation strength chosen by 10-fold cross-validation among 30 candidates.
lrcv = LogisticRegressionCV(cv=10, Cs=30)
lrcv.fit(X=validate_basepredicts, y=yvalidate)

LogisticRegressionCV(Cs=30, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [105]:
# Scored on the very data lrcv was fitted on, so this is an in-sample
# (optimistic) figure rather than a held-out estimate.
lrcv.score(X=validate_basepredicts, y=yvalidate)

0.78923766816143492

In [106]:
# Coefficient the CV-tuned model assigns to each base estimator's prediction.
common.make_coefs_frame(validate_basepredicts.columns, np.ravel(lrcv.coef_))

Unnamed: 0_level_0,coefs,importance
names,Unnamed: 1_level_1,Unnamed: 2_level_1
knn,0.689688,0.689688
svc,0.531197,0.531197
lr,0.484342,0.484342


In [107]:
# Plain logistic regression that reuses the first C value selected by lrcv,
# fitted on the same base predictions.
basepredict_lr = LogisticRegression(C=lrcv.C_[0])
basepredict_lr.fit(X=validate_basepredicts, y=yvalidate)

LogisticRegression(C=0.057361525104486812, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)

In [108]:
# In-sample score again: same data as used for fitting basepredict_lr.
basepredict_lr.score(X=validate_basepredicts, y=yvalidate)

0.76681614349775784

In [109]:
# Coefficient the fixed-C model assigns to each base estimator's prediction.
common.make_coefs_frame(validate_basepredicts.columns, np.ravel(basepredict_lr.coef_))

Unnamed: 0_level_0,coefs,importance
names,Unnamed: 1_level_1,Unnamed: 2_level_1
knn,0.53962,0.53962
svc,0.422398,0.422398
lr,0.368841,0.368841


### test

In [110]:
# Load the processed test set, keep the same feature columns as in training
# and scale them with the already-fitted scaler (transform only, no refit).
test_df = pd.read_csv("test_processed.csv", index_col="PassengerId")
Xtest = test_df.loc[:, feature_names]
Xtest_scaled = scaler.transform(Xtest)

In [111]:
# Base-estimator predictions on the test set, laid out like validate_basepredicts.
test_basepredict = predict_features(base_estimators, X=Xtest, scaledX=Xtest_scaled)

In [112]:
# Combine the base predictions with the fixed-C logistic regression and
# hand the result, keyed by the test index, to the submission helper.
final_predictions = basepredict_lr.predict(X=test_basepredict)
common.make_submission(
    Xtest.index,
    final_predictions,
    "submit_reweight_learners.csv",
)