In [None]:
# Kaggle mounts the competition data read-only under "../input/".
# Anything written to the current working directory is kept as notebook output.
# Running this cell prints the full path of every file found below the input directory.
import os
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# System imports
import numpy as np
import pandas as pd
import copy

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Utilities
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [None]:
# Load the competition data and carve out a validation split
# Local environment
#data_path = '../../data/learn-together'

# Kaggle
data_path = '../input/learn-together'

# Training data, test data and the submission template
df = pd.read_csv(data_path + '/train.csv')
df_test = pd.read_csv(data_path + '/test.csv')
df_sample_submission = pd.read_csv(data_path + '/sample_submission.csv')

# Every column except the target is used as a feature
target = 'Cover_Type'
features = df.columns.tolist()
features.remove(target)

X, y = df[features], df[target]

# 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Level 0: base estimators whose predicted class probabilities become the meta-features
base_models = [
    AdaBoostClassifier(random_state=5),
    RandomForestClassifier(random_state=5, n_estimators=100),
    KNeighborsClassifier(),
    XGBClassifier(random_state=5),
    SVC(gamma='scale', probability=True, random_state=5),
]

# Level 1: stacking estimator that is trained on the meta-features
stack_model = RandomForestClassifier(random_state=5, n_estimators=600)

In [None]:
# Evaluate Base estimators separately
#for model in base_models:
    
    # Fit model
#    model.fit(X_train, y_train)
    
    # Predict
#    y_pred = model.predict(X_val)
    
    # Calculate accuracy
#    acc = accuracy_score(y_val, y_pred)
#    print('{} Accuracy: {:.2f}%'.format(model.__class__.__name__, acc * 100))

In [None]:
# Create first level predictions (meta-features)
def hold_out_predict(clf, X, y, cv):
    """Build out-of-fold class-probability meta-features for stacking.

    For every split produced by ``cv`` a deep copy of ``clf`` is fitted on the
    training part and ``predict_proba`` is evaluated on the hold-out part, so
    each row of the result is predicted by a model that never saw that row.
    ``clf`` itself is left unfitted/unchanged. A progress line is printed.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
    X : pandas.DataFrame, indexed positionally via ``.iloc``.
    y : pandas.Series, positionally aligned with ``X``.
    cv : splitter exposing ``get_n_splits(X, y)`` and ``split(X, y)``.

    Returns
    -------
    numpy.ndarray of shape ``(len(X), n_classes)``; columns follow the sorted
    unique values of ``y``. Rows that never land in a hold-out part, and
    classes a fold's model was not trained on, stay at 0.
    """
    # Initialize: one column per class present in the full training target
    classes = np.unique(y)
    meta_features = np.zeros((X.shape[0], len(classes)))
    n_splits = cv.get_n_splits(X, y)

    # Loop over folds
    print("Starting hold out prediction with {} splits for {}.".format(n_splits, clf.__class__.__name__))
    # y is forwarded so that label-aware splitters (e.g. stratified ones) work too
    for train_idx, hold_out_idx in cv.split(X, y):

        # Split data
        X_fold_train = X.iloc[train_idx]
        y_fold_train = y.iloc[train_idx]
        X_hold_out = X.iloc[hold_out_idx]

        # Fit estimator to K-1 parts and predict on hold out part
        est = copy.deepcopy(clf)
        est.fit(X_fold_train, y_fold_train)
        y_hold_out_pred = est.predict_proba(X_hold_out)

        # A fold may lack some classes, in which case predict_proba returns
        # fewer columns; place each returned column under its class label.
        fold_classes = getattr(est, 'classes_', classes)
        col_idx = np.searchsorted(classes, fold_classes)

        # Fill in meta features
        meta_features[np.ix_(hold_out_idx, col_idx)] = y_hold_out_pred

    return meta_features

In [None]:
# Create first level predictions (meta-features) from training data

# Define 10-fold CV; seeded like the estimators so the shuffled folds are
# identical on every Restart & Run All
n_splits = 10
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=5)

# Loop over classifiers to produce meta features
meta_train_parts = []
for model in base_models:

    # Create hold out predictions for a classifier (one column per class)
    meta_train_model = hold_out_predict(model, X_train, y_train, kfold)
    meta_train_parts.append(pd.DataFrame(meta_train_model))

# Gather meta training data: concatenate column-wise once instead of
# growing a DataFrame inside the loop
meta_train = pd.concat(meta_train_parts, axis=1)

In [None]:
# Fit the Stacking (level 1) classifier on the training meta-features

# Set seed (disabled; NOTE: SEED is not defined in any visible cell, so
# define it before re-enabling these lines)
#if 'random_state' in stack_model.get_params().keys():
#    stack_model.set_params(random_state=SEED)

# Optional (Add original features to meta)
#original_flag = False
#if original_flag:
#    meta_train = np.concatenate((meta_train, X_train), axis=1)
#    meta_val = np.concatenate((meta_val, X_val), axis=1)

# Fit model: meta_train holds the base models' hold-out predictions for
# X_train, so its rows are matched with y_train
stack_model.fit(meta_train, y_train)

In [None]:
# Create meta-features for the test data
# (the variable keeps the name `meta_val` because the final prediction cell reads it)

# Select the training feature columns explicitly so the test matrix has the
# same columns in the same order as the data the models are fitted on
test_features = df_test[features]

meta_val_parts = []
for model in base_models:

    # Refit the base model on the training split, then predict class
    # probabilities for every test row
    model.fit(X_train, y_train)
    meta_val_model = model.predict_proba(test_features)
    meta_val_parts.append(pd.DataFrame(meta_val_model))

# Gather test meta-features: concatenate column-wise once, in base_models order
meta_val = pd.concat(meta_val_parts, axis=1)

In [None]:
# Final output: stacked predictions for the test meta-features
preds = stack_model.predict(meta_val)

# Save test predictions to file (Ids are taken from the sample submission)
output = pd.DataFrame({'Id': df_sample_submission.Id,
                   'Cover_Type': preds})
output.to_csv('submission.csv', index=False)

# Preview the submission; this has to be the cell's last expression,
# otherwise the notebook discards the result without displaying it
output.head()