In [1]:
import os
import time
import json
import datetime as dt
import numpy as np
import pandas as pd
from scipy import stats
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals.joblib import parallel_backend

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from bayes_opt import BayesianOptimization

In [2]:
# Notebook-wide display settings: show up to 100 columns, render floats with
# thousands separators and two decimals, and use a 12x8 default figure size.
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:,.2f}".format
plt.rcParams.update({'figure.figsize': (12, 8)})

In [3]:
# Raw train and test sets; an 'Id' column, where present, is read as a string.
id_as_str = {'Id': str}
train = pd.read_csv('clean_data/train.csv', dtype=id_as_str)
test = pd.read_csv('clean_data/test.csv', dtype=id_as_str)

# Additional test-set frame, read with pandas' default dtypes.
all_test_df = pd.read_csv('clean_data/all_test_df.csv')

# Displayed as the cell output: (rows, columns) of each frame.
train.shape, test.shape, all_test_df.shape

((15120, 54), (565892, 55), (565892, 115))

In [4]:
# Train/test sets with engineered features.
# These two frames are what the base models are fitted and scored on.
poly_train = pd.read_csv('clean_data/train_poly_final.csv')
poly_test = pd.read_csv('clean_data/test_poly_final.csv')

# Displayed as the cell output: (rows, columns) of each frame.
poly_train.shape, poly_test.shape

((15120, 200), (565892, 200))

In [5]:
# Model Parameters
def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path) as f:
        return json.load(f)


# Saved tuning results, one JSON file per base model.
lreg_results = _read_json('models/lreg_results.json')
lda_results = _read_json('models/lda_results.json')
knn_results = _read_json('models/knn_results.json')
svm_results = _read_json('models/svm_results.json')

rf_results = _read_json('models/rf_results.json')
et_results = _read_json('models/et_results.json')
mlp_results = _read_json('models/mlp_results.json')
lgbm_results = _read_json('models/lgbm_results.json')
xgb_results = _read_json('models/xgb_results.json')

In [6]:
# Reduce in-memory size of pandas dataframe by compressing dtypes
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    For every plain NumPy integer or float column the observed min/max are
    compared (inclusively) against the limits of progressively wider dtypes,
    and the column is cast to the first one that can hold the range.
    Signed integers stay signed and unsigned integers stay unsigned.
    All other columns (object, bool, datetime, categorical, pandas extension
    dtypes, ...) are left untouched, as are columns whose min/max are NaN
    (empty or all-NaN columns) or whose range fits none of the candidates.

    Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.
    use_float16 : bool, default True
        When True, float columns whose range fits are cast to ``float16``.
        This is chosen on range alone: ``float16`` has an 11-bit significand,
        so values are rounded (e.g. integers above 2048 are no longer exactly
        representable). Pass False to stop at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Dispatch on the dtype kind rather than on its string name, so that
        # unsigned ints are treated as integers and bool/datetime/extension
        # dtypes are skipped instead of being pushed through the float path.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits = floats, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_ints, np.iinfo
        else:
            candidates, limits = unsigned_ints, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            info = limits(candidate)
            # NaN or infinite bounds fail every comparison, so such columns
            # fall through the loop and keep their current dtype.
            if info.min <= c_min and c_max <= info.max:
                if np.dtype(candidate) != col_type:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the summary can never divide by zero.
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [7]:
# Compress every loaded frame, rebinding each name to the returned frame.
# The calls run in the same order as before: raw frames first, then the
# engineered ones.
train, test, all_test_df = (
    reduce_mem_usage(frame) for frame in (train, test, all_test_df)
)
poly_train, poly_test = map(reduce_mem_usage, (poly_train, poly_test))

# Target column, taken from the raw training frame.
ytrain = train['Cover_Type']

Memory usage of dataframe is 6.23 MB
Memory usage after optimization is: 1.01 MB
Decreased by 83.8%
Memory usage of dataframe is 237.46 MB
Memory usage after optimization is: 38.32 MB
Decreased by 83.9%
Memory usage of dataframe is 496.50 MB
Memory usage after optimization is: 104.70 MB
Decreased by 78.9%
Memory usage of dataframe is 23.07 MB
Memory usage after optimization is: 5.77 MB
Decreased by 75.0%
Memory usage of dataframe is 863.48 MB
Memory usage after optimization is: 212.09 MB
Decreased by 75.4%


### Stacking

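I'm not sure this is the proper way to do it, but the approach used below is:

 - train all base estimators on the whole training set;
 - generate class probabilities from them on that same training set (and on the test set);
 - train the meta-estimators on those probabilities, again using the whole training set;
 - submit the meta-estimators' test-set predictions.

Caveat: the training-set probabilities come from models that have already seen those rows, so they are in-sample; the usual stacking recipe builds them from out-of-fold predictions instead.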



In [8]:
# all_results
# Pairs each base-estimator class with the results dict loaded for it.
# Insertion order matters: the cell that builds `mods` removes the SVC entry
# by its position in this mapping.
all_results = dict([
    (LogisticRegression, lreg_results),
    (LinearDiscriminantAnalysis, lda_results),
    (KNeighborsClassifier, knn_results),
    (SVC, svm_results),
    (RandomForestClassifier, rf_results),
    (ExtraTreesClassifier, et_results),
    (MLPClassifier, mlp_results),
    (LGBMClassifier, lgbm_results),
    (XGBClassifier, xgb_results),
])



In [9]:
# Constructor arguments per estimator class: the saved results minus the
# 'accuracy' entry, which is a score rather than a hyper-parameter.
tuned_params = {
    est_cls: {k: v for k, v in results.items() if k != 'accuracy'}
    for est_cls, results in all_results.items()
}

# Every base model except SVC, in the mapping's iteration order. SVC is
# selected by class identity instead of by list position, so the result does
# not depend on where SVC sits in `all_results`.
mods = [
    est_cls(**params) for est_cls, params in tuned_params.items()
    if est_cls is not SVC
]

# SVC goes last and is built with probability=True so as to get predict_proba.
# Merging into one dict lets probability=True override any saved
# 'probability' entry instead of raising a duplicate-keyword TypeError.
mods.append(SVC(**{**tuned_params[SVC], 'probability': True}))

In [10]:
# Cross-validate the meta-estimator (disabled: everything below in this cell is commented out and kept for reference only)

# def cv_meta_est(est, params=None):
#     "Poorly written function"
#     nclasses = ytrain.nunique()
#     params = params or {}

#     scores = []
    
#     # split into train and test folds
#     # large test size because we are going to split that again for meta-estimator
#     cv = StratifiedShuffleSplit(NCV,test_size=0.4, random_state=seed)
#     splits = cv.split(poly_train, ytrain)
    
#     # iterate through folds
#     for train_idxs, test_idxs in splits:
#         all_probs = [] # list of dfs w class probs for given est
#         # generate predict_probas from each estimator
#         for m in mods:
#             name = str(m.__class__).split('.')[-1].strip('>').strip("'")
#             print(name)
#             m.fit(poly_train.iloc[train_idxs], ytrain.iloc[train_idxs])
                
            
#             probs = pd.DataFrame(m.predict_proba(poly_train.iloc[test_idxs]), columns = [name + '_CLASS_' + str(i+1) for i in range(nclasses)])
#             all_probs.append(probs)

#         # this is now our input features into next level models
#         all_prob_df = pd.concat(all_probs, axis=1)
#         # cv meta-estimator on predicted probs for test set
#         meta_cv = StratifiedShuffleSplit(3, test_size=0.2)
#         mn_accuracy = np.mean(cross_val_score(est(**params), all_prob_df, ytrain[test_idxs], cv=meta_cv, scoring='accuracy'))
#         scores.append(mn_accuracy)
#         print(mn_accuracy)
#     return scores
        
# scores = cv_meta_est(LogisticRegression)
# scores    

In [11]:
def gen_meta_inputs(mods, cv=None):
    """Fit each base estimator and collect its class probabilities as meta-features.

    Reads the notebook-level ``poly_train``, ``poly_test`` and ``ytrain``.
    Every estimator in ``mods`` is fitted in place on the full training set,
    and its class name is printed as a progress marker.

    Parameters
    ----------
    mods : list
        Unfitted, parameterized estimators providing ``fit``,
        ``predict_proba`` and (after fitting) ``classes_``.
    cv : int or cross-validation splitter, optional
        ``None`` (default) keeps the original behaviour: the training-set
        probabilities are predicted by the model that was fitted on those
        same rows, i.e. they are in-sample.
        Any other value is passed to
        ``sklearn.model_selection.cross_val_predict`` so the training-set
        probabilities are out-of-fold instead; the splitter must partition
        the data (e.g. an int or ``StratifiedKFold``).
        Test-set probabilities always come from the model fitted on the full
        training set.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test meta-feature frames. Each estimator contributes one
        column per class, named ``<EstimatorClass>_CLASS_<label>``.

    Raises
    ------
    ValueError
        If ``mods`` is empty.
    """
    if not mods:
        raise ValueError('mods must contain at least one estimator')

    if cv is not None:
        # Imported lazily: only the out-of-fold path needs it.
        from sklearn.model_selection import cross_val_predict

    all_probs_train = []  # one frame of class probabilities per estimator
    all_probs_test = []
    for m in mods:
        name = type(m).__name__
        print(name)

        oof_probs = None
        if cv is not None:
            oof_probs = cross_val_predict(
                m, poly_train, ytrain, cv=cv, method='predict_proba'
            )

        m.fit(poly_train, ytrain)

        # Label the columns from classes_, which is the order predict_proba
        # uses. For labels 1..n this gives the same names as counting from 1.
        columns = ['{}_CLASS_{}'.format(name, label) for label in m.classes_]

        train_probs = m.predict_proba(poly_train) if oof_probs is None else oof_probs
        all_probs_train.append(pd.DataFrame(train_probs, columns=columns))
        all_probs_test.append(pd.DataFrame(m.predict_proba(poly_test), columns=columns))

    # These frames are the input features for the next-level models.
    all_prob_train_df = pd.concat(all_probs_train, axis=1)
    all_prob_test_df = pd.concat(all_probs_test, axis=1)

    return all_prob_train_df, all_prob_test_df

In [None]:
# Fit every base model in `mods` on the full training set and collect the
# train/test class-probability frames (see gen_meta_inputs).
# The result is cached to CSV and reloaded by the following cells.
all_probs_train_df, all_probs_test_df = gen_meta_inputs(mods)

In [None]:
# Cache the meta-features on disk.
# The train frame is written without an index column.
all_probs_train_df.to_csv('clean_data/all_probs_train_df.csv', index=False)

# The test frame is indexed by the raw test set's Id, so the file gains a
# leading 'Id' column.
probs_test_by_id = all_probs_test_df.set_index(test.Id)
probs_test_by_id.to_csv('clean_data/all_probs_test_df.csv')

In [12]:
# Reload the cached meta-features and compress them straight away.
all_probs_train_df = reduce_mem_usage(pd.read_csv('clean_data/all_probs_train_df.csv'))
all_probs_test_df = reduce_mem_usage(pd.read_csv('clean_data/all_probs_test_df.csv'))

# Displayed as the cell output: (rows, columns) of the train and test frames.
tuple(frame.shape for frame in (all_probs_train_df, all_probs_test_df))

Memory usage of dataframe is 7.27 MB
Memory usage after optimization is: 1.82 MB
Decreased by 75.0%
Memory usage of dataframe is 276.31 MB
Memory usage after optimization is: 70.16 MB
Decreased by 74.6%


((15120, 63), (565892, 64))

In [21]:
# These columns seem to be hurting the models
# Meta-feature columns are selected by their estimator-name prefix.
kn_cols = [
    column for column in all_probs_train_df.columns
    if column.startswith('KNeighborsClassifier')
]
lda_cols = [
    column for column in all_probs_train_df.columns
    if column.startswith('LinearDiscriminantAnalysis')
]
kn_cols, lda_cols

(['KNeighborsClassifier_CLASS_1',
  'KNeighborsClassifier_CLASS_2',
  'KNeighborsClassifier_CLASS_3',
  'KNeighborsClassifier_CLASS_4',
  'KNeighborsClassifier_CLASS_5',
  'KNeighborsClassifier_CLASS_6',
  'KNeighborsClassifier_CLASS_7'],
 ['LinearDiscriminantAnalysis_CLASS_1',
  'LinearDiscriminantAnalysis_CLASS_2',
  'LinearDiscriminantAnalysis_CLASS_3',
  'LinearDiscriminantAnalysis_CLASS_4',
  'LinearDiscriminantAnalysis_CLASS_5',
  'LinearDiscriminantAnalysis_CLASS_6',
  'LinearDiscriminantAnalysis_CLASS_7'])

In [18]:
# Random seed shared by the meta-learners below.
seed = 1234

# Split the cached test frame into its Id column and the feature columns.
test_id = all_probs_test_df['Id']
test_x = all_probs_test_df.drop(['Id'], axis=1)

In [23]:
# 79%  (author's note; presumably the score of this submission, confirm)
# LightGBM meta-learner on the stacked probabilities, KNN columns excluded.
lgbm_train_x = all_probs_train_df.drop(kn_cols, axis=1).values
lgbm_test_x = test_x.drop(kn_cols, axis=1).values

lgbm = LGBMClassifier(n_estimators=100, max_depth=10, colsample_bytree=0.1, random_state=seed)
lgbm.fit(lgbm_train_x, ytrain)
preds_gbm = lgbm.predict(lgbm_test_x)

lgbm_submission = pd.DataFrame({'Id': test_id, 'Cover_Type': preds_gbm})
lgbm_submission.to_csv('Submissions/Stacked_LGBM.csv', index=False)

  if diff:


In [24]:
# 78%  (author's note; presumably the score of this submission, confirm)
# XGBoost meta-learner on the stacked probabilities, KNN columns excluded.
xgb_train_x = all_probs_train_df.drop(kn_cols, axis=1).values
xgb_test_x = test_x.drop(kn_cols, axis=1).values

xgb = XGBClassifier(
    n_estimators=10,
    max_depth=10,
    colsample_bytree=0.1,
    objective='multi:softmax',
    random_state=seed,
)
xgb.fit(xgb_train_x, ytrain)
preds_xgb = xgb.predict(xgb_test_x)

xgb_submission = pd.DataFrame({'Id': test_id, 'Cover_Type': preds_xgb})
xgb_submission.to_csv('Submissions/Stacked_xgb.csv', index=False)

  if diff:


In [25]:
# 76%  (author's note; presumably the score of this submission, confirm)
# Random-forest meta-learner; unlike the boosted models it uses every
# meta-feature column, including the KNN ones.
rf = RandomForestClassifier(n_estimators=50, max_depth=10, max_features=0.1, random_state=seed)
rf.fit(all_probs_train_df.values, ytrain)
preds_rf = rf.predict(test_x.values)

rf_submission = pd.DataFrame({'Id': test_id, 'Cover_Type': preds_rf})
rf_submission.to_csv('Submissions/Stacked_rf.csv', index=False)

In [273]:
# Logistic-regression meta-learner on all meta-feature columns
# (the author recorded no score for this submission).
# The solver is pinned because an L1 penalty needs a solver that supports it.
# 'liblinear' was scikit-learn's implicit default in the versions this
# notebook was written for; the later default ('lbfgs') rejects penalty='l1'.
lreg = LogisticRegression(penalty='l1', C=0.25, solver='liblinear', random_state=seed)
lreg.fit(all_probs_train_df.values, ytrain)
preds_lreg = lreg.predict(test_x.values)
pd.DataFrame({'Id': test_id, 'Cover_Type': preds_lreg}).to_csv('Submissions/Stacked_lreg.csv', index=False)

In [282]:
# SVC meta-learner with library-default settings, on all meta-feature columns.
svc = SVC()
svc.fit(all_probs_train_df.values, ytrain)
preds_svc = svc.predict(test_x.values)

svc_submission = pd.DataFrame({'Id': test_id, 'Cover_Type': preds_svc})
svc_submission.to_csv('Submissions/Stacked_svc.csv', index=False)

In [26]:
# Majority vote across the three tree-based meta-learners.
# Each row of `tree_votes` holds the LGBM, XGB and RF predictions for one
# test sample; the row-wise mode is the submitted class.
tree_votes = np.vstack([preds_gbm, preds_xgb, preds_rf]).T
majority_vote = np.ravel(stats.mode(tree_votes, 1)[0])

vote_submission = pd.DataFrame({
    'Id': test_id,
    'Cover_Type': majority_vote,
})
vote_submission.to_csv('Submissions/stacked_trees_maj_vote.csv', index=False)