# Questions I am Exploring #

Looking for some support from the community. I can't quite figure out why:
* My PCA based model is spitting out all 3's.  
* My model performance is so bad on the public leaderboard (0.1-0.25, not close to what I am getting in CV).
* My submissions no longer work.


In [None]:
# Parameters for my workflow for more easeful experimenting (instead of commenting out code)
# Also gives a great overall picture of what I'm including
#
# Every name below is a plain module-level constant.  Only some of them are
# read by the cells visible further down (for example DO_BASELINE_MODEL,
# SELECT_WITH_PCA and EXPLORE_WITH_PCA); the rest are presumably consumed by
# later cells -- TODO confirm before deleting any of them.

VERBOSITY = 0
HIDE_CODE_CELLS = False
IGNORE_WARNINGS = True

# Baseline model before any major cleaning or feature engineering
DO_BASELINE_MODEL = True

# Data Cleaning & Class Imbalance
VALIDATE_DATA = False

MERGE_ON_COLS = ['installation_id', 'game_session']  # This indicates what constitutes an 'observation'
RESAMPLE_TRUE_DATA = False
SHOW_CLASS_BALANCE = False
COMPUTE_CLASS_WEIGHTS = False
IMPUTE_VALUES = True
IMPUTE_FILLNA_VAL = 0
IMPUTE_METHOD = 'fill_na'

# Semi Supervised Learning
# NOTE(review): the name is spelled "UBLABELLED"; keep the spelling unless
# every reference is renamed together.
LABEL_UBLABELLED_DATA = True

# Exploratory Data Analysis

RESAMPLE_EDA_DATA = True
DO_PANDAS_PROFILE = False
EXPORT_PANDAS_PROFILE = False

# Exploratory Clustering

EXPLORE_WITH_HIERARCHICAL_C = False
HIERARCHICAL_METHOD = ''
EXPLORE_WITH_PCA = False
EXPLORE_WITH_TSNE = True
EXPLORE_WITH_DBSCAN = False
DO_CO_OCCURENCE_MATRIX = False
ENSEMBLE_CLUSTERS = False
cluster_parameter_dict = {}

# Explanatory Data Analysis
SHOW_EXPLANATORY_DATA = False

# Feature Engineering
BINARIZE_FEATURES = False
BIN__NUMERICAL_FEATURES = False
ENCODE_CATEGORICAL = False
GROUPBY_COUNTS_ON_TARGET = True  # ?
GENERATE_CATEGORICAL_COUNTS = True
TARGET_ENCODE_CATEGORIES = False
CALCULATE_POLYNOMIAL_FEATURES = False
CALCULATE_SIMILARITY_FEATURES = False
EMBED_PREDICTIONS_AS_FEATURES = False
EMBED_TEST_FIT_CLUSTER_ON_TRAIN_FEATURE = False
ADD_NOISY_FEATURE = False
FEATURES_WITH_POTENTIAL_RELATIONSHIPS = []

# NLP Engineering
ENGINEER_NLP_FEATURES = False
PSEUDO_LABEL = False

# Time Series Engineering
CREATE_DATETIME_FEATURES = False
ENGINEER_TIME_SERIES_FEATURES = True

# Feature Preprocessing
# NOTE(review): SCALE_FEATURES is re-assigned to False in the scaling cell
# further down, and that cell uses StandardScaler regardless of SCALE_METHOD.
SCALE_METHOD = "min_max"
SCALE_FEATURES = True

# Feature Selection
# NOTE(review): REDUCE_TO_N_FEATURES, PCA_N_COMPONENTS and N_FEATURES are three
# independent constants that all happen to be 30.
REDUCE_TO_N_FEATURES = 30  # Also reused below as LDA_N_COMPONENTS
USE_BEST_FEATURES_ONLY = False
SELECT_WITH_PCA = True
PCA_N_COMPONENTS = 30
SELECT_WITH_LDA = False
LDA_N_COMPONENTS = REDUCE_TO_N_FEATURES
N_FEATURES = 30

# Cross Validation and Model Validation
MIMIC_KAGGLE_TRAIN_TEST = False
DO_TRAIN_TEST_SPLIT = True
TRAIN_TEST_PERCENTAGE = 0.75
DO_KFOLD = True
COMPETITION_METRIC = 'kappa_quadratic'
PLOT_DECISION_REGIONS = False

# Models
DO_LINEAR_REGRESSION = False
DO_LOGISTIC_REGRESSION = False

DO_RANDOM_FOREST = False
DO_CATBOOST = True
DO_LIGHTBOOST = True
DO_XGBOOST = True

# Deep Learning
USE_DEEP_LEARNING = False
USE_AUTO_KERAS = False
NN_LOSS_FUNCTION = 'categorical_cross_entropy'
NN_OPTIMIZER = 'adam'
NN_N_LAYERS = 8
NN_BATCH_SIZE = 32
NN_N_NODES = 100

# Ensembling
USE_VOTING_CLASSIFIER = False
USE_STACKING = False

# Hyperparameter Tuning
DO_GRID_SEARCH = False
USE_TPOT = False
# NOTE(review): USE_AUTO_KERAS is already assigned in the Deep Learning group
# above; this second assignment is the one that takes effect.
USE_AUTO_KERAS = False

# Submission
CREATE_SUBMISSION = True
SUBMISSION_CLF = 'cat_clf'


In [None]:
import itertools

from warnings import filterwarnings
filterwarnings('ignore')

import numpy as np # linear algebra
import scipy
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
plt.rcParams["figure.figsize"] = (16,12)

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.options.display.max_rows = None
pd.options.display.max_columns = None

import seaborn as sns
sns.set()  # This make matplotlib plots look like seaborn plots.
sns.set_context("talk")

import plotly.express as px
import plotly.graph_objects as go

# Stats
from scipy.stats import zscore

# EDA and Unsupervised Exploring and Feature Visualization
import pandas_profiling
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import missingno as msno  # Missing Data
from yellowbrick.features import Manifold, ParallelCoordinates
from yellowbrick.datasets import load_occupancy

# Class Balancing
from imblearn.under_sampling import ClusterCentroids, NearMiss

# Resampling
from sklearn.utils import resample

# Feature Processing and Engineering
from sklearn.feature_extraction.text import CountVectorizer  # NLP
from sklearn.preprocessing import PolynomialFeatures, label_binarize, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

# Models 
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Ensembling
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline

# Dimensionality Reduction
# Import PCA when either the feature-selection or the exploration step asks
# for it.  The earlier `a == True | b == True` form did not do that: `|` binds
# tighter than `==`, so it parsed as the chained comparison
# `a == (True | b) == True` and EXPLORE_WITH_PCA was effectively ignored.
if SELECT_WITH_PCA or EXPLORE_WITH_PCA:
    from sklearn.decomposition import PCA

from sklearn.manifold import TSNE

# Model Validation and Metrics
from mlxtend.plotting import plot_decision_regions
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics.cluster import homogeneity_score


# from imblearn.ensemble import BalancedRandomForestClassifier

#AutoML
from tpot import TPOTClassifier

# Yellowbrick Reports
from yellowbrick.classifier import ClassBalance, ClassificationReport, ConfusionMatrix, ROCAUC, DiscriminationThreshold
from yellowbrick.model_selection import FeatureImportances, LearningCurve, RFECV
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance

# Optimization
from numba import jit 



In [None]:
def save_current_fig(filename):
    """Save the currently active matplotlib figure to ``filename``.

    ``plt.savefig`` already operates on the current figure, so no explicit
    figure lookup is required before calling it.
    """
    plt.savefig(filename)

## If Loading The Data From Saved External Dataframe ##

## If Loading The Data in Real Time ##

In [None]:
# Data Import
SAMPLE_SIZE = 500000000000  # Passed as `nrows` when reading the CSV files
sample_size = SAMPLE_SIZE # Remove this redundancy
CHUNK_SIZE = 2000
EXPORE_MEM_TYPES = False
REDUCE_MEMORY = True
LOAD_ON_KAGGLE = True

# Data Import Specifics
# The lists below are positional: entry i of index_columns / datetime_cols
# belongs to entry i of datafiles.
datafiles = ['sample_submission', 'train_labels', 'train', 'test']
dataframe_names = ['sample_submission', 'train_labels', 'train', 'test']  # Remove this redundancy!

# A real list rather than a `map` object: a map iterator is exhausted after a
# single pass, so a second loop over it would silently see no file names.
csv_filenames = [str(name) + '.csv' for name in datafiles]
index_columns = [None, 'installation_id', 'installation_id', 'installation_id']
datetime_cols = [None, 'timestamp', 'timestamp', 'timestamp']
thepath = '/kaggle/input/data-science-bowl-2019/'

CATEGORICAL_VARIABLES = []
NUMERICAL_VARIABLES = []
ORDINAL_VARIABLES = []
TARGET_VARIABLE = "accuracy_group"

**Loading Data in chunks and processing per ID to save ram**

In [None]:
"""
# Load Preserving Ram
# Still overloads the ram!!!!
# I Think test_df = test_df.append() is the problem.  
# Just using append wasn't doing anything.

# Use a with statement to close after done with the chunk?

df_dict = {}
filepath = '/kaggle/input/data-science-bowl-2019/train.csv'

# Read one column to get the column names.
df_info = pd.read_csv(filepath, nrows = 1, index_col = 'installation_id')
col_names = df_info.columns

read_chunk = pd.read_csv(filepath, 
                         chunksize = 50000, 
                         index_col = 'installation_id')


while True:
    try:
        df_dict.update(next(read_chunk))
    except StopIteration:
        break

df = pd.DataFrame.from_dict(data = df_dict)
print(df.shape)
"""

**Load all at once**

In [None]:
def read_csv(datafiles, path = thepath):
    """Load every named CSV under ``path`` and return the frames as a list.

    ``datafiles`` holds file names without the ``.csv`` suffix.  The frame at
    position i is indexed by ``index_columns[i]`` and limited to
    ``sample_size`` rows; both of those are module-level settings.
    """
    frames = []
    for position, name in enumerate(datafiles):
        frames.append(pd.read_csv(path + name + '.csv',
                                  nrows = sample_size,
                                  index_col = index_columns[position]))
    return frames

# Unpack: the order of the targets follows the order of `datafiles`.
sample_submission, train_labels, train, test = read_csv(datafiles)


In [None]:
# Parse the raw 'timestamp' column of both event logs into pandas datetimes.
for events in (train, test):
    events['timestamp'] = pd.to_datetime(events['timestamp'])

## Update memory types ##

In [None]:
# Updated function for converting memory types.  DRY method.   
# Not active yet.
#
# Maps dataframe name -> {column name -> target dtype}.  Nothing in the visible
# cells reads this dict; the conversions that actually run are the explicit
# astype() calls in the next cell.
# NOTE(review): the numeric dtypes are written as strings such as 'np.int8'.
# NumPy/pandas dtype strings are normally spelled 'int8' / 'float16' -- confirm
# before passing these values to astype().
# NOTE(review): the active conversion cell uses int16 for the label counts and
# 'category' for event_code, i.e. it does not agree with the int8 entries here.

mem_updates_dict = {'train_labels': {'title':'category',
                               'num_correct': 'np.int8',
                               'num_incorrect': 'np.int8',
                               'accuracy': 'np.float16',
                               'accuracy_group': 'np.int8'},
                      'train': {'type':'category',
                               'world':'category',
                               'event_count':'np.int8',
                               'event_code':'np.int8',
                               'game_time':'np.int8'},
                      'test': {'type':'category',
                               'world':'category',
                               'event_count':'np.int8',
                               'event_code':'np.int8',
                               'game_time':'np.int8'}}

def convert_memtypes(memdict):
    """Placeholder for the dict-driven dtype conversion; currently a no-op.

    Accepts the conversion mapping, does nothing with it and returns ``None``.
    """


In [None]:
# Convert memory types WET method.
# Shrinks the label table and turns the low-cardinality event columns into
# pandas categoricals.

train_labels['title'] = train_labels['title'].astype('category')
train_labels['num_correct'] = train_labels['num_correct'].astype(np.int16)
train_labels['num_incorrect'] = train_labels['num_incorrect'].astype(np.int16)
train_labels['accuracy'] = train_labels['accuracy'].astype(np.float16)
train_labels['accuracy_group'] = train_labels['accuracy_group'].astype(np.int16)

# The same conversions apply to both event logs.
for events in (train, test):
    events['type'] = events['type'].astype('category')
    events['world'] = events['world'].astype('category')
    events['event_code'] = events['event_code'].astype('category')

    # event_count is deliberately left at its loaded dtype (an int8 cast was
    # tried and disabled).

    # `np.int` was only an alias for the builtin `int` and has been removed
    # from NumPy; the builtin gives the same dtype.
    events['game_time'] = events['game_time'].astype(int)


### Do a quick look at the distributions of important factors ###

### Create Datetime Features + Do Timeseries Statistics ###

In [None]:
def extract_time_features(df):
    """Add categorical calendar features derived from ``df['timestamp']``.

    Inspired by Gabriel Preda: https://www.kaggle.com/gpreda/2019-data-science-bowl-eda

    ``df['timestamp']`` must already be a datetime column.  The new columns
    (date, month, hour, year, day_of_week, week_of_year, day_of_year, quarter,
    is_month_start) are written into ``df`` itself, and that same object is
    returned.
    """
    stamps = df['timestamp'].dt

    df['date'] = stamps.date.astype('category')
    df['month'] = stamps.month.astype('category')
    df['hour'] = stamps.hour.astype('category')
    df['year'] = stamps.year.astype('category')
    df['day_of_week'] = stamps.dayofweek.astype('category')

    # `.dt.weekofyear` is gone from current pandas; the ISO calendar week is
    # its documented replacement.  Fall back to the old attribute on pandas
    # versions that predate `isocalendar()`.
    if hasattr(stamps, 'isocalendar'):
        week_numbers = stamps.isocalendar().week
    else:
        week_numbers = stamps.weekofyear
    df['week_of_year'] = week_numbers.astype('category')

    df['day_of_year'] = stamps.dayofyear.astype('category')
    df['quarter'] = stamps.quarter.astype('category')
    df['is_month_start'] = stamps.is_month_start.astype('category')
    return df

In [None]:
#train = extract_time_features(train)
#test = extract_time_features(test)
#train.head()

### Preliminary cleaning of outliers in original sets before aggregating ###

In [None]:
def remove_outliers(df, low_q = 0, high_q = 1):  # Does better without removing outliers
    """Keep, column by column, only the values inside a quantile band.

    For every column the values between its ``low_q`` and ``high_q`` quantiles
    (both ends included) are copied into a new frame.  The defaults 0 and 1
    keep everything, so the call can stay in place as a no-op.

    Columns for which the quantiles cannot be computed, or whose filtered
    values cannot be aligned with the result, are reported and left out of the
    result.  The result's index is taken from the first column that is stored;
    later columns are aligned to it.

    Right now nothing from the test set should be passed in.

    Returns the new DataFrame; ``df`` itself is not modified.
    """
    print('Original shape', df.shape)
    outliers_removed = pd.DataFrame()

    for column in df.columns:
        try:
            lower = df[column].quantile(low_q)
            upper = df[column].quantile(high_q)
            # Current pandas only accepts the string form of `inclusive`; the
            # boolean form raised inside the old catch-all handler, which
            # silently produced an empty result.
            mask = df[column].between(lower, upper, inclusive='both')
            outliers_removed[column] = df.loc[mask, column]
        except (TypeError, ValueError) as err:
            # Passed as separate arguments so a non-string column label cannot
            # break the message itself.
            print('Did not remove outliers from', column, '-', err)
        finally:
            print('After removing outliers: ', outliers_removed.shape)

    return outliers_removed




Display some visuals here.

**The model does better if we leave 0 game times in.**

In [None]:
REMOVE_0_GAME_TIME = False # Does better when not removed.

if REMOVE_0_GAME_TIME == True:
    # Remove game times of 0 since these will not likely contain useful information.
    train_is_game_time_0 = train['game_time'] == 0
    train_is_not_game_time_0 = train['game_time'] != 0

    # Take the counts BEFORE filtering: the report below is about the full
    # training set, and after the filter the zero rows are no longer part of
    # train.shape[0].
    n_train_rows = train.shape[0]
    n_train_zero = train_is_game_time_0.sum()

    train = train[train_is_not_game_time_0]

    print('Training set:')
    print(n_train_zero)
    print(n_train_rows)
    print(round(n_train_zero / n_train_rows * 100, 2), '% of entries in train have 0 game time')
    print('\n')

    # Does the test set have any 0 game time entries?
    # (Reporting only -- the test set is never filtered.)
    print('Testing set:')
    test_is_game_time_0 = test['game_time'] == 0
    print(test_is_game_time_0.sum())
    print(test.shape[0])
    print(round(test_is_game_time_0.sum() / test.shape[0] * 100, 2), '% of entries in test have 0 game time')

    # Should we just assign these entries as accuracy of 0?

    # Maybe they just left the games on and weren't actually playing?
    # Drop the extreme upper tail of game_time from train only.
    train_gt_is_not_too_long = train.game_time < train.game_time.quantile(.9999)
    train = train[train_gt_is_not_too_long]

    # Do not remove from test set!!!
    # test_gt_is_not_too_long = test.game_time < test.game_time.quantile(.9999)
    # test = test[test_gt_is_not_too_long]

    # train = remove_outliers(train, low_q = 0, high_q = .9999)  # this should remove the 'problem' data points.

    # Do not remove from test set!!!
    # test = remove_outliers(test, low_q = 0, high_q = .9999)  # this should remove the 'problem' data points.

## Feature Engineering ##

### Do some Groupby's on the whole dataset before I focus in on assessments ###

In [None]:
def groupby_categorical_counts(df, df_name, by_col, on_cols, stat_features = True, verbose = 1):
    """Build one wide count table per categorical column.

    For every column in ``on_cols`` the rows of ``df`` are grouped by
    ``by_col`` and the occurrences of each distinct value are counted.  Each
    table has one row per group and one column per distinct value, with 0
    where a value never occurs for a group.

    ``on_cols`` must be a list of column names (pass ``['col']`` for a single
    column).  ``df_name`` is only used in the progress messages, which are
    printed when ``verbose == 1``.  ``stat_features`` is accepted but unused.

    Returns the list of tables in the order of ``on_cols``.
    """
    chatty = verbose == 1
    count_tables = []

    for on_col in on_cols:
        if chatty:
            print("Creating groupby counts", by_col, "on", on_col, 'in dataframe', df_name)

        value_tally = df.groupby(by_col)[on_col].value_counts()
        table = value_tally.unstack().fillna(0)
        count_tables.append(table)

        if chatty:
            print("Added: ", len(table.columns), "features from", df_name, "Filled Na with 0 \n")

    if chatty:
        print("Total features returned:", sum(len(table.columns) for table in count_tables))

    return count_tables

In [None]:
# BASELINE_WITH_SCALING = True
# Categorical event columns that get a per-installation value-count table.
# The order matters: the cells that call groupby_categorical_counts unpack the
# returned tables positionally.
CATEGORICAL_COLS_TO_GROUPBY = [
    'event_code',
    'title',
    'type',
    'world',
    'event_id',
]

# add_later = [, 'is_month_start', 'quarter']

In [None]:
if DO_BASELINE_MODEL == True:
    print('Creating categorical features for the training set')
    # One count table comes back per entry of CATEGORICAL_COLS_TO_GROUPBY, in
    # the same order, so the targets below must follow that order.
    (tr_baseline_event_code,
     tr_baseline_title,
     tr_baseline_type,
     tr_baseline_world,
     tr_baseline_event_id) = groupby_categorical_counts(train,
                                                        'train',
                                                        'installation_id',
                                                        CATEGORICAL_COLS_TO_GROUPBY)

In [None]:
if DO_BASELINE_MODEL == True:
    print('\n\n Creating categorical features for the test set')
    # Same positional unpacking as for the training set: one table per entry
    # of CATEGORICAL_COLS_TO_GROUPBY.
    (test_baseline_event_code,
     test_baseline_title,
     test_baseline_type,
     test_baseline_world,
     test_baseline_event_id) = groupby_categorical_counts(test,
                                                          'test',
                                                          'installation_id',
                                                          CATEGORICAL_COLS_TO_GROUPBY)

In [None]:
# From here on only Assessment events are kept, for both train and test.
is_assessment_train = train['type'] == 'Assessment'
is_assessment_test = test['type'] == 'Assessment'
train = train[is_assessment_train]
test = test[is_assessment_test]

# Substrings of the raw event_data JSON text that mark a correct / incorrect attempt.
is_correct = '"correct":true'
is_incorrect = '"correct":false'

# Turn each marker into a 0/1 column: 1 when the marker occurs anywhere in the
# row's event_data text.  str.find yields a non-negative position on a hit.
# NOTE(review): every Assessment event carrying the marker is counted, whatever
# its event_code -- confirm this matches how the competition labels count attempts.
for events in (train, test):
    for flag_column, marker in (('num_correct', is_correct),
                                ('num_incorrect', is_incorrect)):
        found_at = events.loc[:, 'event_data'].str.find(marker)
        events[flag_column] = (found_at >= 0).astype(int)

# Collect bird train assessments
# is_bird_train = train['event_code'] == 4110
# bird_assess_train = pd.DataFrame(train[is_bird_train])

# Collect bird train assessments
# is_bird_test = test['event_code'] == 4110
# bird_assess_test = pd.DataFrame(test[is_bird_test])

In [None]:
# Total the correct / incorrect flags per user, game session and title.
_train_sessions = train.groupby(['installation_id', 'game_session', 'title'])
tr_baseline_num_correct = _train_sessions['num_correct'].sum().fillna(0).astype(int)
tr_baseline_num_incorrect = _train_sessions['num_incorrect'].sum().fillna(0).astype(int)

In [None]:
# Wide versions of the per-session totals: the innermost index level (title)
# becomes the columns, with 0 where a session has no value for a title.
tr_num_correct_df, tr_num_incorrect_df = (
    pd.DataFrame(per_session.unstack().fillna(0).astype(int))
    for per_session in (tr_baseline_num_correct, tr_baseline_num_incorrect)
)

### Concatenate num correct and num incorrect and calculate accuracy like train_labels ###

In [None]:
# This is validated.  Accuracy per game session = correct / (correct + incorrect);
# the per-installation statistics in the following cells are computed from it.
# A session with no flagged attempt at all divides 0 by 0 and comes out as NaN.
_train_attempts = tr_baseline_num_correct + tr_baseline_num_incorrect
train_accuracy_per_game = tr_baseline_num_correct / _train_attempts

In [None]:
# Per-installation summary statistics of the per-session accuracy.
# rename() is used without inplace=True: the chained .astype() needs the
# returned Series, and the pandas documentation gives None as the return value
# of an in-place rename.  The train_accuracy_std statement that was written
# twice is kept once.
_train_acc_by_id = train_accuracy_per_game.groupby(['installation_id'])

train_accuracy_avg = _train_acc_by_id.mean().fillna(0).rename('accuracy').astype(float)
train_accuracy_std = _train_acc_by_id.std().fillna(0).rename('accuracy_std').astype(float)

train_accuracy_min = _train_acc_by_id.min().fillna(0).rename('accuracy_min').astype(float)
train_accuracy_max = _train_acc_by_id.max().fillna(0).rename('accuracy_max').astype(float)

In [None]:
# Sum totals of num correct and num incorrect per installation.  This replaces
# the per-session series of the same names.
# rename() without inplace=True: the result of the call is what gets assigned,
# and an in-place rename is documented to return None.
tr_baseline_num_correct = tr_baseline_num_correct.groupby(['installation_id']).sum().fillna(0).rename('num_correct')
tr_baseline_num_incorrect = tr_baseline_num_incorrect.groupby(['installation_id']).sum().fillna(0).rename('num_incorrect')

In [None]:
train_accuracy_per_game.head()

In [None]:
# Other statistics on accuracy: geometric mean, skewness and kurtosis of the
# per-session accuracy for each installation (missing results become 0).
# rename() without inplace=True, because the chained .astype() needs the
# returned Series and an in-place rename is documented to return None.
_train_acc_groups = train_accuracy_per_game.groupby(['installation_id'])

train_accuracy_gmean = _train_acc_groups.agg(scipy.stats.gmean).fillna(0).rename('accuracy_gmean').astype(float)
train_accuracy_skew = _train_acc_groups.agg(scipy.stats.skew).fillna(0).rename('accuracy_skew').astype(float)
train_accuracy_kurt = _train_acc_groups.agg(scipy.stats.kurtosis).fillna(0).rename('accuracy_kurtosis').astype(float)

train_statistics_cols = [train_accuracy_gmean, train_accuracy_skew, train_accuracy_kurt]


In [None]:
# Correct / incorrect totals per installation and event_id, one column per
# (flag, event_id) pair.  The two flag columns are selected with a list:
# current pandas rejects selecting several GroupBy columns with a bare tuple.
tr_baseline_num_correct_eventid = train.groupby(['installation_id','event_id'])[['num_correct','num_incorrect']].sum().unstack(fill_value = 0)
# Flatten the two-level column index into single strings (flag name directly
# followed by the event_id, no separator).
tr_baseline_num_correct_eventid.columns = tr_baseline_num_correct_eventid.columns.map(''.join)

test_baseline_num_correct_eventid = test.groupby(['installation_id','event_id'])[['num_correct','num_incorrect']].sum().unstack(fill_value = 0)
test_baseline_num_correct_eventid.columns = test_baseline_num_correct_eventid.columns.map(''.join)

In [None]:
# Correct / incorrect totals per installation and assessment title.  The flag
# columns are selected with a list: current pandas rejects selecting several
# GroupBy columns with a bare tuple.
train_num_correct_per_title = train.groupby(['installation_id', 'title'])[['num_correct','num_incorrect']].sum().unstack(fill_value = 0)
# Flatten the two-level column index into '<flag>_<title>' strings.
train_num_correct_per_title.columns = train_num_correct_per_title.columns.map('_'.join).str.strip('_')

test_num_correct_per_title = test.groupby(['installation_id', 'title'])[['num_correct','num_incorrect']].sum().unstack(fill_value = 0)
test_num_correct_per_title.columns = test_num_correct_per_title.columns.map('_'.join).str.strip('_')

In [None]:
# Correct / incorrect totals per installation and world.  The flag columns are
# selected with a list: current pandas rejects selecting several GroupBy
# columns with a bare tuple.
tr_num_correct_per_world = train.groupby(['installation_id', 'world'])[['num_correct','num_incorrect']].sum().unstack().fillna(0)
# Flatten the two-level column index into '<flag>_<world>' strings.
tr_num_correct_per_world.columns = tr_num_correct_per_world.columns.map('_'.join).str.strip('_')

test_num_correct_per_world = test.groupby(['installation_id', 'world'])[['num_correct','num_incorrect']].sum().unstack().fillna(0)
test_num_correct_per_world.columns = test_num_correct_per_world.columns.map('_'.join).str.strip('_')
tr_num_correct_per_world.head()

In [None]:
# Test-set counterpart of the training accuracy features.
# Throughout this cell rename() is used without inplace=True: its result is
# chained or assigned, and an in-place rename is documented to return None.

# Correct / incorrect totals per user, game session and title.
_test_sessions = test.groupby(['installation_id', 'game_session', 'title'])
test_baseline_num_correct = _test_sessions['num_correct'].sum().fillna(0).astype(int)
test_baseline_num_incorrect = _test_sessions['num_incorrect'].sum().fillna(0).astype(int)

# Accuracy per session; a session without any flagged attempt comes out as NaN.
test_accuracy_per_game = test_baseline_num_correct / (test_baseline_num_correct + test_baseline_num_incorrect)

_test_acc_by_id = test_accuracy_per_game.groupby(['installation_id'])

test_accuracy_avg = _test_acc_by_id.mean().fillna(0).rename('accuracy').astype(float)
test_accuracy_std = _test_acc_by_id.std().fillna(0).rename('accuracy_std').astype(float)
test_accuracy_min = _test_acc_by_id.min().fillna(0).rename('accuracy_min').astype(float)
test_accuracy_max = _test_acc_by_id.max().fillna(0).rename('accuracy_max').astype(float)

# Per-installation totals; these replace the per-session series of the same names.
test_baseline_num_correct = test_baseline_num_correct.groupby(['installation_id']).sum().fillna(0).rename('num_correct')
test_baseline_num_incorrect = test_baseline_num_incorrect.groupby(['installation_id']).sum().fillna(0).rename('num_incorrect')

test_accuracy_gmean = _test_acc_by_id.agg(scipy.stats.gmean).fillna(0).rename('accuracy_gmean').astype(float)
test_accuracy_skew = _test_acc_by_id.agg(scipy.stats.skew).fillna(0).rename('accuracy_skew').astype(float)
test_accuracy_kurt = _test_acc_by_id.agg(scipy.stats.kurtosis).fillna(0).rename('accuracy_kurtosis').astype(float)

test_statistics_cols = [test_accuracy_gmean, test_accuracy_skew, test_accuracy_kurt]

In [None]:
# Give the test tables every event_id column the train tables have, then put
# the columns of all four tables into sorted order.

print('Are there missing event ID counts in the test variable to be concatenated later?')
print('train_baseline_event_id,shape', tr_baseline_event_id.shape)
print('test_baseline_event_id,shape', test_baseline_event_id.shape)

print('\nAdding to test event_id counts')

# Columns present for train but absent for test are added to test as all-zero columns.
_test_event_id_cols = set(test_baseline_event_id)
event_ids_add_to_test = [col for col in set(tr_baseline_event_id)
                         if col not in _test_event_id_cols]
for col in event_ids_add_to_test:
    test_baseline_event_id[col] = 0

print('train_baseline_event_id,shape', tr_baseline_event_id.shape)
print('test_baseline_event_id,shape', test_baseline_event_id.shape)

print('\nAdding to test num_correct_event_id')

print('tr_baseline_num_correct_eventid', tr_baseline_num_correct_eventid.shape)
print('test_baseline_num_correct_eventid', test_baseline_num_correct_eventid.shape)

# Same treatment for the correct/incorrect-per-event_id table.
_test_num_correct_cols = set(test_baseline_num_correct_eventid)
num_correct_event_ids_add_to_test = [col for col in set(tr_baseline_num_correct_eventid)
                                     if col not in _test_num_correct_cols]
for col in num_correct_event_ids_add_to_test:
    test_baseline_num_correct_eventid[col] = 0

print('tr_baseline_num_correct_eventid', tr_baseline_num_correct_eventid.shape)
print('test_baseline_num_correct_eventid', test_baseline_num_correct_eventid.shape)

print('\nSorting the Columns...')
for table in (tr_baseline_event_id,
              test_baseline_event_id,
              tr_baseline_num_correct_eventid,
              test_baseline_num_correct_eventid):
    table.sort_index(axis=1, inplace=True)
print('Complete')

In [None]:
def convert_to_accuracy_group(the_series):
    """Map a single accuracy value to an accuracy-group label.

    Despite the parameter name this takes one scalar, not a Series.

    * exactly 0         -> 0
    * above 0, below .5 -> 1
    * .5 up to below .75 -> 2
    * .75 and above     -> 3

    Any other input (negative numbers, NaN) matches no band and yields ``None``.
    """
    if the_series == 0:
        return 0
    if 0 < the_series < 0.5:
        return 1
    if 0.5 <= the_series < 0.75:
        return 2
    if the_series >= 0.75:
        return 3
    return None

In [None]:
# Rough accuracy-group estimate per installation, derived from its mean accuracy.
# Values that fall in no band come back missing and are set to group 0.
# rename() is used without inplace=True: the chained calls need the returned
# Series, and an in-place rename is documented to return None.
y_train_estimate = train_accuracy_avg.apply(convert_to_accuracy_group).rename('accuracy_group_estimate').fillna(0).astype(int)
y_test_estimate = test_accuracy_avg.apply(convert_to_accuracy_group).rename('accuracy_group_estimate').fillna(0).astype(int)

In [None]:
# Calculate - 'Got the Third One'
# if event_count == 3 and type == 'Assessment':
# Check if success
# "won first assessment."

In [None]:
# Calculate - Only got it after the third one.
# else won after third try.

## Concatenate everything to train/test_baseline DataFrames

In [None]:
from sklearn.metrics import f1_score

def aggregate_targets_on_id(y):
    """Collapse the labels to one ``accuracy_group`` per installation.

    The most frequent label of each installation is used, since the mode
    preserves the structure of the label distribution best.  Ties go to the
    smallest label.

    ``y`` must have an ``accuracy_group`` column and ``installation_id`` as
    either a column or an index level.

    Returns a Series indexed by installation_id.
    """
    def smallest_mode(values):
        # Series.mode() returns the modes in ascending order.  Unlike
        # scipy.stats.mode(x)[0], whose result is an array or a scalar
        # depending on the SciPy version, this always yields a scalar.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else float('nan')

    # Just aggregate the label column.
    return y.groupby(['installation_id'])['accuracy_group'].agg(smallest_mode)

In [None]:
# Reduce the label table to a single target per installation_id.
if DO_BASELINE_MODEL == True:
    print('Aggregating training set with the MODE so there is one row per ID')
    train_y_baseline = aggregate_targets_on_id(train_labels)
    print(train_y_baseline.shape)
    print('assigning this to y')

    # Code smell - both names refer to the same Series; organize this so one is enough.
    y = y_train = train_y_baseline

In [None]:
# Join every engineered training feature table side by side, one row per installation_id.

if DO_BASELINE_MODEL == True:
    print('Concatenating engineered features to the train_baseline')

    train_features = [
        tr_num_correct_per_world,
        tr_baseline_event_code,
        tr_baseline_title,
        tr_baseline_type,
        tr_baseline_world,
        tr_baseline_event_id,
        tr_baseline_num_correct,
        tr_baseline_num_incorrect,
        tr_baseline_num_correct_eventid,
        train_accuracy_avg,
        train_accuracy_std,
        train_accuracy_min,
        train_accuracy_max,
        train_num_correct_per_title,
        y_train_estimate,
        train_accuracy_gmean,
        train_accuracy_skew,
        train_accuracy_kurt,
    ]

    # Installations missing from one of the tables get 0 for that table's columns.
    train_baseline = pd.concat(train_features, axis = 1).fillna(0)
    print(train_baseline.shape)
    train_baseline.head()


In [None]:
# Join every engineered test feature table side by side, in the same order as
# the training tables.  Outlier removal is not applied here.

if DO_BASELINE_MODEL == True:
    print('\nConcatenating engineered features to the test_baseline')
    _test_feature_tables = [
        test_num_correct_per_world,
        test_baseline_event_code,
        test_baseline_title,
        test_baseline_type,
        test_baseline_world,
        test_baseline_event_id,
        test_baseline_num_correct,
        test_baseline_num_incorrect,
        test_baseline_num_correct_eventid,
        test_accuracy_avg,
        test_accuracy_std,
        test_accuracy_min,
        test_accuracy_max,
        test_num_correct_per_title,
        y_test_estimate,
        test_accuracy_gmean,
        test_accuracy_skew,
        test_accuracy_kurt,
    ]
    test_baseline = pd.concat(_test_feature_tables, axis = 1)

    print('test_baseline.shape' , test_baseline.shape)

### Remove features that are not represented in the train set ###

In [None]:
# Column order of the training matrix; the test matrix is aligned to it next.
cols_in_train = train_baseline.columns.tolist()
len(cols_in_train)

In [None]:
print(test_baseline.shape)
# Align the test matrix to the training columns.  reindex() is used instead of
# test_baseline[cols_in_train] because plain selection raises KeyError as soon
# as one training column never occurs in the test data; here such a column is
# created and filled with 0.  Test-only columns are dropped either way.
test_baseline = test_baseline.reindex(columns = cols_in_train, fill_value = 0)
print(test_baseline.shape)

### Split into labelled and unlabelled train for semi-supervised learning ###

In [None]:
    # Split the training matrix by whether the installation has a target label.
    print('\nStoring unlabelled data in the training seet as: train_baseline_labelled for semi-supervised learning')
    _has_label = train_baseline.index.isin(train_y_baseline.index.unique())
    train_baseline_labelled = train_baseline[_has_label]
    train_unlabelled = train_baseline[~_has_label]

    print('train_unlabelled shape', train_unlabelled.shape)
    print('train_labelled shape', train_baseline_labelled.shape)
    
    # print('Using generated targets')
    # train_y_baseline = y_train_estimate
    
    # print('\nOnly including samples in train which have accuracy group labels in train_labels')
    # print(train_y_baseline.shape)
    # train_y_baseline = train_y_baseline[train_y_baseline.index.isin(train_baseline.index.unique())]
    # print(train_y_baseline.shape)

### Impute Missing Values ###

In [None]:
import missingno as msno

# For now every missing test feature is simply replaced by 0.
test_baseline = test_baseline.fillna(value = 0)
# ax = msno.matrix(test_baseline.fillna(0))


In [None]:
def impute_missing_vals(df):
    """Placeholder for a configurable imputation step; currently a no-op.

    Accepts a frame, does nothing with it and returns ``None``.
    """

In [None]:
train_baseline_labelled.head()

In [None]:
train_unlabelled.head()

### Scale features for models that require scaling  ###

In [None]:
# Assign current working dataset as training set X
X_train = train_baseline_labelled
X_test = test_baseline

# NOTE: this overrides the SCALE_FEATURES value set in the parameter cell at
# the top of the notebook.
SCALE_FEATURES = False

if SCALE_FEATURES == True:
    print('Generating scaled features as X_scaled_train \ntest as X_scaled_test')

    # One scaler, fitted on the training features only, is applied to both
    # sets.  Fitting a second scaler on the test set would standardise it with
    # different means and variances, so the same raw value would map to
    # different scaled values in train and test.
    feature_scaler = StandardScaler()
    X_scaled_train = feature_scaler.fit_transform(X_train)
    print('Sample from X_scaled_train', X_scaled_train[0][:5])

    X_scaled_test = feature_scaler.transform(X_test)
    print('Sample from X_scaled_test', X_scaled_test[0][:5])

else:
    print('X_train and X_test assigned as unscaled feature data')

In [None]:
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, roc_auc_score

def fit_baseline_classifiers(classifiers, X, y, use_polynomial = False, use_scaled = False, do_kfold = False, graph_results = False):
    """Fit each classifier on a stratified hold-out split and print its metrics.

    Args:
        classifiers: Iterable of ``(name, estimator)`` pairs. Each estimator is
            fitted in place on the training part of the split.
        X: Feature matrix.
        y: Targets aligned with ``X``; also used to stratify the split.
        use_polynomial: If True, replace ``X`` with its degree-2,
            interaction-only polynomial expansion.
        use_scaled: If True, MinMax-scale ``X`` (after any expansion).
        do_kfold: If True, also print a 3-fold ``cross_val_score`` per
            classifier.
        graph_results: Reserved; graphing is not implemented yet.

    Returns:
        dict: ``{clf_name: {metric_name: value}}`` for every classifier, with
        all metrics rounded to two decimals.
    """
    if use_polynomial:
        X = PolynomialFeatures(degree = 2, interaction_only = True).fit_transform(X)

    if use_scaled:
        print('Scaling for Classification using MinMax\n')
        # NOTE(review): the scaler is fitted on all of X, i.e. before the
        # hold-out split below, so validation rows influence the scaling.
        X = MinMaxScaler().fit_transform(X)

    # Split once so every classifier is scored on the same validation rows;
    # a fresh random split per classifier makes their scores incomparable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, shuffle = True)

    # Collected per classifier and returned at the end.
    results = {}

    for clf_name, classifier in classifiers:

        # Fit on the training part, predict the validation part.
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        # Use .score method if it exists.
        try:
            training_set_accuracy = round(classifier.score(X_train, y_train), 2)
        except Exception:
            training_set_accuracy = None
            print(clf_name, "doesn't have an internal accuracy metric set up")

        # Compute metrics. Every score is rounded to 2 decimals; rounding
        # without ndigits would collapse these [0, 1] scores to 0 or 1.
        accuracy = round(accuracy_score(y_test, y_pred), 2)
        balanced_acc = round(balanced_accuracy_score(y_test, y_pred), 2)
        recall_sc = round(recall_score(y_test, y_pred, average = 'weighted'), 2)
        precision_sc = round(precision_score(y_test, y_pred, average = 'weighted'), 2)
        quadratic_kappa = round(cohen_kappa_score(y_test, y_pred, weights="quadratic"), 2)
        the_f1_score = round(f1_score(y_test, y_pred, average = 'weighted'), 2)
        # roc_auc_sc = round(roc_auc_score(y_test, y_pred, average = None))

        # The train/validation gap is undefined when no training score exists.
        if training_set_accuracy is None:
            acc_difference = None
        else:
            acc_difference = round(training_set_accuracy - accuracy, 2)

        print(clf_name)
        print('Accuracy score - training set:', training_set_accuracy)
        print('Accuracy score - validation set:', accuracy)
        print('Difference of  - training and val sets:', acc_difference)
        print('Balanced accuracy is', balanced_acc)
        print('Recall (What proportion of actual positives was predicted correctly?):', recall_sc)
        print('Precision (What proportion of positive predictions was actually correct?):', precision_sc)
        print('F1 Score - validation set: ', the_f1_score)
        # print('Area under the ROC curve of', clf_name, 'is', roc_auc_sc)
        print('Quadratic Kappa - validation set - is: ', quadratic_kappa)

        if do_kfold:
            print('Cross Val Score of', clf_name, ' is: ', cross_val_score(classifier, X, y, cv=3))

        print('\n')

        results[clf_name] = {
            'training_accuracy': training_set_accuracy,
            'validation_accuracy': accuracy,
            'accuracy_difference': acc_difference,
            'balanced_accuracy': balanced_acc,
            'recall': recall_sc,
            'precision': precision_sc,
            'f1_score': the_f1_score,
            'quadratic_kappa': quadratic_kappa,
        }

        if graph_results:
            pass
            # Graph this!

    return results
    

In [None]:
# Build the named baseline estimators and score one group of them.
# Each entry is a (display_name, estimator) pair, the shape that
# fit_baseline_classifiers iterates over. Later cells index these tuples
# (e.g. xgb_clf[1], rf_clf[1]), so they only exist when this flag is True.
if DO_BASELINE_MODEL == True:
    # 'Linear' Classifiers
    dum_clf = ('Dummy_Classifier', DummyClassifier())
    logreg_clf = ('Logistic_Regression', LogisticRegression(solver = 'lbfgs', 
                                                            multi_class = 'auto', 
                                                            max_iter = 100))
    svc_clf = ('SVC', SVC(gamma = 'scale'))
    # NOTE(review): svc_poly is defined but not added to any list below.
    svc_poly = ('SVC_Poly', SVC(kernel = 'poly', degree = 3, C = 5, coef0 = 1, gamma = 'scale'))
    
    # Tree Classifiers
    # NOTE(review): is_unbalance is presumably ignored with the 'multiclass'
    # objective -- confirm against the LightGBM parameter docs.
    lb_clf = ('LightGBM', lgb.LGBMClassifier(min_gain_to_split = 0.9,
                                             objective = 'multiclass',
                                             is_unbalance = True,
                                             lambda_l1 = 8))
    rf_clf = ('Random_Forest', RandomForestClassifier(n_estimators = 500,
                                                      min_impurity_decrease =.0065,
                                                      class_weight = 'balanced_subsample'))
    xgb_clf = ('XGBoost', xgb.XGBClassifier(reg_alpha = 10))
    cb_clf = ('Catboost', cb.CatBoostClassifier(verbose=0))
    
    # Lists of these classifiers
    scaled_classifiers = [dum_clf,logreg_clf, svc_clf,]
    tree_classifiers = [lb_clf, rf_clf, xgb_clf]
    quarantined_classifiers = [cb_clf]
    all_classifiers = scaled_classifiers + tree_classifiers + quarantined_classifiers
    
    # Function call to fit.  Set the first position to what classifier list you want to use.
    # NOTE(review): y_train must already be defined by an earlier cell; other
    # cells in this section pass `y` instead -- confirm which one is intended.
    fit_baseline_classifiers(tree_classifiers, X_train, y_train, use_scaled = True, do_kfold = False)
    

# FastAI Baseline #

In [None]:
# Toggle for the (experimental) fastai tabular baseline.
DO_FAST_AI = False

if DO_FAST_AI:
    from fastai.tabular import *

    dep_var = 'accuracy_group'
    # NOTE(review): these column names look like placeholders from a census
    # style example rather than columns of this dataset -- replace before
    # enabling the flag.
    cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
    cont_names = ['education-num', 'hours-per-week', 'age', 'capital-loss', 'fnlwgt', 'capital-gain']
    procs = [FillMissing, Categorify, Normalize]

    # NOTE(review): `data` is not created in this cell; it has to be built
    # (presumably from dep_var / cat_names / cont_names / procs) before the
    # learner can be constructed.
    learn = tabular_learner(data, layers=[200,100], metrics=accuracy)
    learn.fit(5, 1e-2)
    learn.save('mini_train')

    learn.show_results()


# Reporting Pipeline for One Classifier #

Best parameters so far
* xgb max depth = ~22
* reg_alpha = 3
* reg_lambda
* n_estimators
* learning_rate
* booster
* gamma 
* importance_type

In [None]:
# Switches for the single-classifier reporting cells below.
# DO_BASELINE_REPORT is the master switch; the other flags each enable one
# report and are only honoured when the master switch is on.
DO_BASELINE_REPORT = False
DO_VALIDATION_CURVE = False
DO_LEARNING_CURVE = False
DO_CV_SCORE = False
DO_MUTUAL_INFORMATION = False

In [None]:
# Shared setup for the reporting cells: the classifier under study, its short
# name (used as a file-name prefix by those cells) and the CV splitter.
if DO_BASELINE_REPORT == True:
    
    from yellowbrick.model_selection import ValidationCurve, LearningCurve, CVScores, RFECV
    
    # Reporting parameters applicable to all reports
    # NOTE(review): re-assigning the flag here is redundant; this branch only
    # runs when it already compares equal to True.
    DO_BASELINE_REPORT = True
    b_clf_to_study_name = "xgb"
    b_clf_to_study = xgb.XGBClassifier(max_depth = 22, 
                                       reg_alpha = 3)
    cv = StratifiedKFold(n_splits=12)


In [None]:
# `and` instead of `&`: `&` binds tighter than `==`, so the old expression was
# parsed as a chained comparison and only gave the right answer by accident.
if DO_BASELINE_REPORT and DO_VALIDATION_CURVE:

    parameter_to_study = 'reg_lambda'
    param_range_study = np.arange(1, 15, 2)

    # Validation curve to see performance changes on a particular parameter
    val_curve = ValidationCurve(b_clf_to_study,
                               param_name = parameter_to_study,
                               param_range = param_range_study,
                               n_jobs = -1)
    val_curve.fit(X_train, y_train)
    val_curve.finalize()
    val_curve.show()

    # Saving is best-effort, but report a failure instead of hiding it.
    try:
        save_current_fig(b_clf_to_study_name + '_' + 'val_curve.jpg')
    except Exception as err:
        print('Could not save the validation curve figure:', err)

In [None]:
# `and` instead of `&`: `&` binds tighter than `==`, so the old expression was
# parsed as a chained comparison and only gave the right answer by accident.
if DO_BASELINE_REPORT and DO_LEARNING_CURVE:

    train_sizes = np.linspace(0.3, 1.0, 10) # in percentages

    # Learning curve to see performance changes with increased sample size.
    cv = StratifiedKFold(n_splits=12)

    learn_curve = LearningCurve(b_clf_to_study,
                                scoring='f1_weighted',
                                cv = cv,
                                train_sizes=train_sizes,
                                n_jobs=-1)

    learn_curve.fit(X_train, y_train)
    learn_curve.finalize()
    learn_curve.show()

    # Saving is best-effort, but report a failure instead of hiding it.
    try:
        save_current_fig(b_clf_to_study_name + '_' + 'learn_curve.jpg')
    except Exception as err:
        print('Could not save the learning curve figure:', err)

In [None]:
# `and` instead of `&`: `&` binds tighter than `==`, so the old expression was
# parsed as a chained comparison and only gave the right answer by accident.
if DO_BASELINE_REPORT and DO_CV_SCORE:

    # CV scores to see whether performance changes dramatically between folds.
    cv = StratifiedKFold(n_splits=12)

    cv_scores = CVScores(b_clf_to_study,
                         scoring='f1_weighted',
                         cv = cv)

    cv_scores.fit(X_train, y_train)
    cv_scores.finalize()
    cv_scores.show()

    # Saving is best-effort, but report a failure instead of hiding it.
    try:
        save_current_fig(b_clf_to_study_name + '_' + 'cv_scores.jpg')
    except Exception as err:
        print('Could not save the CV scores figure:', err)

In [None]:
# `and` instead of `&`: `&` binds tighter than `==`, so the old expression was
# parsed as a chained comparison and only gave the right answer by accident.
if DO_BASELINE_REPORT and DO_MUTUAL_INFORMATION:
    from sklearn.feature_selection import mutual_info_classif

    mutual_info_rep = mutual_info_classif(X_train, y_train)

    fig, ax = plt.subplots()

    # The column must be named exactly 'feature' because it is used as the
    # index on the next line (a garbled key here raised KeyError).
    mutual_info_df = pd.DataFrame({'feature': X_train.columns,
                                   'bits_shared': mutual_info_rep}).set_index('feature')
    mutual_info_df_top25 = mutual_info_df.sort_values(by = 'bits_shared',
                                                      ascending = False)[:25]

    mutual_info_df_top25.plot.barh(ax = ax)

    # Saving is best-effort, but report a failure instead of hiding it.
    try:
        save_current_fig(b_clf_to_study_name + '_' + 'mutual_information.jpg')
    except Exception as err:
        print('Could not save the mutual information figure:', err)

**Load feature importances of the baseline so we can reduce the dimensionality with feature selection.**

In [None]:
def report_feature_importances(clf_name, classifier, X, y):
    """Plot, save and tabulate a classifier's feature importances.

    Fits a ``FeatureImportances`` visualizer around ``classifier``, shows the
    plot, saves the current figure, and writes the table to
    ``'<clf_name> feat_imp.csv'``.

    Args:
        clf_name: Prefix for the output column names and file names.
        classifier: Estimator handed to the visualizer.
        X: Feature matrix passed to the visualizer's ``fit``.
        y: Targets passed to the visualizer's ``fit``.

    Returns:
        pandas.DataFrame with columns ``'<clf_name>_features'`` and
        ``'<clf_name>_importances'``, sorted by importance, highest first.
    """
    viz = FeatureImportances(classifier)
    viz.fit(X, y)
    viz.show()
    save_current_fig(clf_name + 'feature_imp.jpg')

    features_col = clf_name + '_features'
    importances_col = clf_name + '_importances'

    # Build the table in one step, then order it by importance (descending).
    feature_importances_df = pd.DataFrame({
        features_col: viz.features_,
        importances_col: viz.feature_importances_,
    }).sort_values(by = importances_col, ascending = False)

    feature_importances_df.to_csv(clf_name + ' feat_imp.csv', index=False)

    return feature_importances_df

In [None]:
# Fit an XGBoost model on every feature and record its feature importances.
xgb_feat = xgb.XGBClassifier(reg_alpha = 10)

xgb_feat.fit(X_train, y)
# report_feature_importances also calls fit on its visualizer with the same data.
xgb_feature_importances = report_feature_importances('XGBoost', xgb_feat, X_train, y)


## Reduce test_baseline to just the 100 top features. ##

In [None]:
# Keep the first 100 rows of the importance table (sorted highest first by
# report_feature_importances) and refit XGBoost on just those columns.
print('top100_xgb has the filtered feature importances of X_train')
print('X_train still has all of the features')

top100_xgb = xgb_feature_importances[:100]
top100_xgb_names = top100_xgb['XGBoost_features'].reset_index(drop = True)

print('\nSample of top 25 feature importances')
print(top100_xgb_names.head(25))

# xgb_clf[1] is the classifier in the named tuples of classifiers above.
# NOTE: this is the same estimator object, not a copy, so fitting it here
# also changes xgb_clf[1].
xgb_feat = xgb_clf[1]
X_top100 = X_train.loc[:,top100_xgb_names]

xgb_feat.fit(X_top100 , y)
# This overwrites xgb_feature_importances (and the files written by
# report_feature_importances) with the top-100-only results.
xgb_feature_importances = report_feature_importances('XGBoost', xgb_feat, X_top100, y)

In [None]:
# Select the same top-100 columns from the test set as test_baseline_top100.
# test_baseline itself is left untouched.
test_baseline_top100 = test_baseline.loc[:,top100_xgb_names]
print('Top 100 features filtered in test set and assigned to test_baseline_top100')
print('test_baseline still has all the features.')

test_baseline_top100.head()

## Baseline Submission and 100 Features Submission ##

**Baseline submission with all features**

In [None]:
# Alias the inputs used for the baseline submission and print their shapes.
bs_X_train, bs_y_train, bs_X_test = X_train, y, X_test
print(bs_X_train.shape, bs_y_train.shape, bs_X_test.shape)


In [None]:
# List the columns that are in the training features but not in the test features.
print('Are any columns missing from the training and testing set?')
train_only_columns = set(bs_X_train) - set(bs_X_test)
print(list(train_only_columns))

In [None]:
# Fit the classifier used for the baseline submission on all features.
# To submit on the reduced feature set instead, fit on X_top100.
# The targets are `y` (via bs_y_train), or estimated targets if those are used.

# classifier[1] is the estimator in the (name, estimator) tuples.
# NOTE: rf_clf[1] is used directly, not copied, so this fit mutates the
# estimator stored in rf_clf as well.
baseline_submission_clf = rf_clf[1]
baseline_submission_clf.fit(bs_X_train, bs_y_train)

In [None]:
# Predict on the test features (missing values filled with 0) and cast the
# predictions to int; print the first 50 as a sanity check.
baseline_submision_preds = baseline_submission_clf.predict(bs_X_test.fillna(0)).astype(int)
print(baseline_submision_preds[:50])

**Submission with just 100 features**

In [None]:
from sklearn.base import clone

# Clone the random forest before fitting on the top-100 features. Using
# rf_clf[1] directly would refit the very object that baseline_submission_clf
# points to, silently replacing the all-features model.
baseline_100_submission_clf = clone(rf_clf[1])
baseline_100_submission_clf.fit(X_top100, bs_y_train)


In [None]:
# Predict on the top-100 test features (missing values filled with 0), cast
# to int, and print the first 50 predictions as a sanity check.
baseline_100_submision_preds = baseline_100_submission_clf.predict(test_baseline_top100.fillna(0)).astype(int)
print(baseline_100_submision_preds[:50])

## TESTING Disabled - Grid Search for LightGBM Submission ##

In [None]:
# Toggle for the (currently disabled) hyper-parameter grid search below.
DO_GRID_SEARCH = False

In [None]:
if DO_GRID_SEARCH == True:
    from sklearn.model_selection import GridSearchCV

    # LightGBM parameter names.
    # NOTE(review): 'reg_alpha' and 'lambda_l1' are presumably aliases of the
    # same L1 setting in LightGBM -- confirm, and drop one to shrink the grid.
    param_grid = {
        'num_leaves': [31, 127],
        'reg_alpha': [0.1, 0.5],
        'min_data_in_leaf': [30, 50, 100, 300, 400],
        'lambda_l1': [0, 1, 1.5],
        'lambda_l2': [0, 1]
        }

    # Hand GridSearchCV the splitter itself rather than a one-shot generator
    # of splits, so the search object can be fitted more than once.
    grid_kfold = KFold(n_splits = 3, shuffle = True, random_state = 42)

    # Search over the LightGBM estimator: the grid above uses LightGBM
    # parameter names, which the random forest held by baseline_submission_clf
    # does not accept.
    gsearch = GridSearchCV(estimator = lb_clf[1], param_grid = param_grid, cv = grid_kfold, verbose = 2)
    lgb_model = gsearch.fit(X=X, y=y)

    print(lgb_model.best_params_, lgb_model.best_score_)


# Cleaning and Processing Data after Baseline #

**Create a dataframe with all labelled training data merged on installation ID and game_session**

In [None]:
# Toggle for building annotated_train (train_labels merged with train).
MERGE_TRAIN_AND_LABELS = False

In [None]:
# If you wanted to graph things to see their relationships with the targets

# Joins the labels onto the raw training events using the MERGE_ON_COLS keys.
if MERGE_TRAIN_AND_LABELS == True:
    annotated_train = train_labels.merge(train, on = MERGE_ON_COLS)
    # Bare expression inside an `if`: evaluated but not displayed by the notebook.
    annotated_train.shape

In [None]:
# Toggle for the column-dropping / label-filtering EDA preparation below.
DO_EDA = False

In [None]:
# EDA preparation: drop identifier/time columns in place and keep only rows
# whose index value appears in the train_labels index.
if DO_EDA == True:
    # Create a drop function here
    # IF DO_DROP_COLUMNS == True
    # DROPPED_COLS == ["event_id", "game_session", "timestamp"]

    dropped_cols = ["game_session", "timestamp"]

    # In-place drops: train and test are permanently changed for later cells.
    train.drop(dropped_cols, axis = 1, inplace = True)
    test.drop(dropped_cols, axis = 1, inplace = True)
    
    # Get rid of training samples that don't have labels.
    # Check https://www.kaggle.com/erikbruin/data-science-bowl-2019-eda-and-baseline
    # NOTE(review): this matches on the frames' index values, so it assumes
    # train and train_labels are indexed by the same key -- confirm.
    train = train[train.index.isin(train_labels.index.unique())]
    train.shape
    
    # Is this redundant?
    # NOTE(review): annotated_train only exists if MERGE_TRAIN_AND_LABELS was
    # enabled in the earlier cell.
    annotated_train = annotated_train[annotated_train.index.isin(train_labels.index.unique())]
    annotated_train.shape

## NLP Feature Engineering ##

In [None]:
# Toggle for the NLP feature-engineering cell below (re-assigned here, so this
# value wins over any earlier setting of the same flag).
ENGINEER_NLP_FEATURES = False

In [None]:


if ENGINEER_NLP_FEATURES == True:
    # Raw event payloads to be tokenised.
    event_stream = train['event_data']

    # Ignore these words b/c they are redundant categories.
    # What do 'coordinates' do?
    stop_word_list = ['event_code', 'game_time', 'event_count', 'game_time', 'title', 'type', 'world', 'media_type', 'audio', 'duration', 'total_duration']

    important_words = ['description', 'identifier']
    # Raw string for the regex: '\D' in a normal string literal is an invalid
    # escape sequence (a warning today, an error in future Python versions).
    # The pattern itself is unchanged.
    count_vec = CountVectorizer(stop_words = stop_word_list,
                                token_pattern = r':\D+:',
                                max_df = 1000,
                                min_df = 2)

    # BUG: Still catches digits and stopwords...
    # Make this into an 'apply' function or a for loop that adds this to each installation ID.
    ##### count_vec.fit_transform(event_stream)

    ##### print(count_vec.vocabulary_)

    # Export this for faster processing time?
    
    # Export this for faster processing time?

## Unsupervised Exploration and Visualization ##


In [None]:
# Number of clusters used for k-means and for the cluster-embedding features.
N_CLUSTERS = 4  # For kmeans and embedding.

## Semi-Supervised Confidence Labelling With LogReg, XGB ##

In [None]:
# Stack labelled train, unlabelled train and test rows into one frame
# (row-wise concat) and remember its index for later re-splitting.
total_samples = pd.concat([X_train, train_unlabelled, X_test])
# Mid-cell bare expression: evaluated but not displayed.
total_samples.shape
total_index = total_samples.index

## Encode Clusters With Semi-Supervised Learning ##

In [None]:
# Toggle for the pseudo-labelling cell below (re-assigned here, so this value
# wins over any earlier setting of the same flag).
PSEUDO_LABEL = False


In [None]:

# Pseudo-labelling: train on the labelled top-100 features, predict class
# probabilities for the unlabelled rows, and append the rows XGBoost is
# confident about (classes 0 and 3 only) to X_train / y.
if PSEUDO_LABEL == False:
    pass
    
if PSEUDO_LABEL == True:
    train_unlabelled_top100 = train_unlabelled.loc[:, top100_xgb_names]

    # Empty frame indexed like the unlabelled rows, to hold predicted classes.
    semi_preds = pd.DataFrame()
    semi_preds['installation_id'] = train_unlabelled_top100.index
    semi_preds.set_index('installation_id', inplace = True)

    lr_semi = LogisticRegression(verbose = 1)
    lr_semi.fit(X_top100, y)

    semi_preds['lr_cat'] = lr_semi.predict(train_unlabelled_top100)
    print(semi_preds.head())

    # Per-class probabilities from logistic regression, one column per class.
    lr_semi_prob = pd.DataFrame(lr_semi.predict_proba(train_unlabelled_top100))
    lr_semi_prob['installation_id'] = train_unlabelled_top100.index
    lr_semi_prob.set_index('installation_id', inplace = True)

    lr_semi_colnames = ['lr_prob_0', 'lr_prob_1', 'lr_prob_2', 'lr_prob_3']
    lr_semi_prob.columns = lr_semi_colnames

    # print(lr_semi_prob.shape)
    print(lr_semi_prob.head())

    confidence_threshold = 0.9

    """
    # Skip Logreg
    lr_is_confident0_mask = lr_semi_prob.loc[:, 'lr_prob_0'] > confidence_threshold
    lr_confident0s = lr_semi_prob[lr_is_confident0_mask]
    print('Logistic Regression found', lr_confident0s.shape[0], 'rows with above threshold confidence for class 0')

    lr_is_confident1_mask = lr_semi_prob.loc[:, 'lr_prob_1'] > confidence_threshold
    lr_confident1s = lr_semi_prob[lr_is_confident1_mask]
    print('Logistic Regression found', lr_confident1s.shape[0], 'rows with above threshold confidence for class 1')

    lr_is_confident2_mask = lr_semi_prob.loc[:, 'lr_prob_2'] > confidence_threshold
    lr_confident2s = lr_semi_prob[lr_is_confident2_mask]
    print('Logistic Regression found', lr_confident2s.shape[0], 'rows with above threshold confidence for class 2')

    lr_is_confident3_mask = lr_semi_prob.loc[:, 'lr_prob_3'] > confidence_threshold
    lr_confident3s = lr_semi_prob[lr_is_confident3_mask]
    print('Logistic Regression found', lr_confident3s.shape[0], 'rows with above threshold confidence for class 3')
    """

    # Predict for labels
    xgb_semi = xgb.XGBClassifier(verbose = 1, reg_alpha = 5)
    xgb_semi.fit(X_top100, y)

    xgb_semi_preds = xgb_semi.predict(train_unlabelled_top100)
    print(xgb_semi_preds[:5])

    # Per-class probabilities from XGBoost, one column per class.
    xgb_semi_prob = pd.DataFrame(xgb_semi.predict_proba(train_unlabelled_top100))
    xgb_semi_prob['installation_id'] = train_unlabelled_top100.index
    xgb_semi_prob.set_index('installation_id', inplace = True)

    xgb_semi_colnames = ['xgb_prob_0', 'xgb_prob_1', 'xgb_prob_2', 'xgb_prob_3']
    xgb_semi_prob.columns = xgb_semi_colnames
    xgb_semi_prob.head()

    confidence_threshold = 0.9

    # For each class, keep the rows whose predicted probability for that class
    # exceeds the threshold.
    xgb_is_confident0_mask = xgb_semi_prob.loc[:, 'xgb_prob_0'] > confidence_threshold
    xgb_confident0s = xgb_semi_prob[xgb_is_confident0_mask]
    print('XGBoost found', xgb_confident0s.shape[0], 'rows with above threshold confidence for class 0')

    xgb_is_confident1_mask = xgb_semi_prob.loc[:, 'xgb_prob_1'] > confidence_threshold
    xgb_confident1s = xgb_semi_prob[xgb_is_confident1_mask]
    print('XGBoost found', xgb_confident1s.shape[0], 'rows with above threshold confidence for class 1')

    xgb_is_confident2_mask = xgb_semi_prob.loc[:, 'xgb_prob_2'] > confidence_threshold
    xgb_confident2s = xgb_semi_prob[xgb_is_confident2_mask]
    print('XGBoost found', xgb_confident2s.shape[0], 'rows with above threshold confidence for class 2')

    xgb_is_confident3_mask = xgb_semi_prob.loc[:, 'xgb_prob_3'] > confidence_threshold
    xgb_confident3s = xgb_semi_prob[xgb_is_confident3_mask]
    print('XGBoost found', xgb_confident3s.shape[0], 'rows with above threshold confidence for class 3')

    print(xgb_confident0s.head())
    print(xgb_confident3s.head())

    # Define new X_train style dataframes with just the confident threes and zeroes
    confident_zeroes = train_unlabelled.loc[xgb_confident0s.index, top100_xgb_names]
    confident_threes = train_unlabelled.loc[xgb_confident3s.index, top100_xgb_names]
    print(confident_zeroes.shape)
    print(confident_threes.shape)

    # Concatenate  0's
    # NOTE: from here on X_train holds only the top-100 columns, plus the
    # pseudo-labelled rows appended below.
    X_train = pd.concat([X_top100, confident_zeroes], axis = 0)
    X_train.shape

    # Concatenate 3's
    X_train = pd.concat([X_train, confident_threes], axis = 0)
    X_train.shape

    # Extend y with the pseudo-labels in the same order the rows were appended
    # to X_train (zeroes first, then threes).
    print(y.shape)
    y = pd.DataFrame(y)
    print(y.head())

    # NOTE(review): set_index('installation_id') relies on the index of
    # confident_zeroes being named 'installation_id' -- confirm.
    y_confident_zeroes = pd.DataFrame(confident_zeroes.index)
    y_confident_zeroes.set_index('installation_id', inplace = True, drop = True)
    y_confident_zeroes['accuracy_group'] = 0

    print(y_confident_zeroes.shape)
    print(y_confident_zeroes.head())

    y = pd.concat([y, y_confident_zeroes], axis = 0)
    print(y.shape)
    print(y.head())

    y_confident_threes = pd.DataFrame(confident_threes.index)
    y_confident_threes.set_index('installation_id', inplace = True, drop = True)
    y_confident_threes['accuracy_group'] = 3
    print(y_confident_threes.shape)

    y = pd.concat([y, y_confident_threes], axis = 0)
    print(y.shape)
    print(y.head())
    print(X_train.shape)

In [None]:
# Cluster-membership features: fit k-means on the scaled test set, assign
# both train and test rows to those clusters, and append the one-hot encoded
# cluster ids as extra columns.
if EMBED_TEST_FIT_CLUSTER_ON_TRAIN_FEATURE == True:
    # Create a function for this
    
    # Store the X index
    # NOTE(review): `X` and `X_scaled` are not defined in this cell; they must
    # come from an earlier cell -- confirm they exist at module level.
    X_index = X.index
    test_index = test_baseline.index
    # Does labelling the predictions lead to better models?

    # Scale the testing data
    test_scaled = MinMaxScaler()
    test_scaled = test_scaled.fit_transform(test_baseline.fillna(0))

    # The training data is already scaled at 

    # Fit kmeans to the scaled test data.
    kmeans_test = KMeans(n_clusters = N_CLUSTERS, random_state=0)
    kmeans_test.fit(test_scaled)

    # Predict the clusters that would result with characteristics similar to the training set.
    kmeans_preds_on_train = kmeans_test.predict(X_scaled)

    # Categorically encode these categories.
    kmeans_preds_cat_encoded_train = pd.get_dummies(kmeans_preds_on_train, prefix='kmeans_')
    kmeans_preds_cat_encoded_train.index = X_index

    # Do the same thing with the test set.
    kmeans_preds_on_test = kmeans_test.predict(test_scaled)

    kmeans_preds_cat_encoded_test = pd.get_dummies(kmeans_preds_on_test, prefix='kmeans_')
    kmeans_preds_cat_encoded_test.index = test_index
    
    # NOTE: this rebinds the module-level names `train` and `test`.
    train = pd.concat([X, kmeans_preds_cat_encoded_train], axis = 1)
    test = pd.concat([test_baseline, kmeans_preds_cat_encoded_test], axis = 1)
    
    train.head()

### MinMax Scale Data for Clustering (Hierarchical, Kmeans, Other) ###

In [None]:
# Reassign inputs for unsupervised learning: 
# X_train is kept as-is, y_train becomes `y`, and X_test is reset to test_baseline.
X_train, y_train, X_test = X_train, y, test_baseline
print(X_train.shape, y_train.shape, X_test.shape)

In [None]:
# Scale the data for cluster analysis
minmax_scaler = MinMaxScaler()
train_scaled = minmax_scaler.fit_transform(X_train)

# Scale the testing data with the scaler fitted on the training data, so both
# sets are mapped with the same per-feature min/max. A separately fitted test
# scaler would put the two sets on different scales.
# NOTE(review): this requires X_test to have the same columns as X_train.
# test_preprocessed = X_test.fillna(0).astype(int)

test_scaled = minmax_scaler.transform(X_test)

# Scale everything
total_samples_scaled = MinMaxScaler()
total_samples_scaled = total_samples_scaled.fit_transform(total_samples)

### Hierarchical Clustering Exploration ###

In [None]:
if EXPLORE_WITH_HIERARCHICAL_C == True:
    # linkage and fcluster are imported here too, so the cell does not depend
    # on an import made elsewhere.
    from scipy.cluster.hierarchy import dendrogram, fcluster, linkage

    def get_hc_distances(X, verbose = 0):
        """Return the centroid-linkage matrix of X using euclidean distance.

        ``verbose`` is accepted for compatibility and currently unused.
        """
        return linkage(X, method="centroid", metric="euclidean")

    def plot_dendrogram(distances):
        """Draw the dendrogram for a linkage matrix."""
        dendrogram(distances)

    def create_hc_clusters(distances):
        """Cut the linkage tree at distance 4 and return flat cluster ids."""
        return fcluster(distances, 4, criterion="distance")

    # NOTE(review): train_scaled_eda is not defined in this cell; it must be
    # created by an earlier cell.
    distances = get_hc_distances(train_scaled_eda)
    plot_dendrogram(distances)

    # Save the figure
    save_current_fig('Dendrogram.jpg')

    # Clusters not working yet.
    hc_clusters = create_hc_clusters(distances)
    hc_clusters[:50]
    

### Feature Selection ###

* Use principal component analysis (PCA) to determine features that explain the most variance.
* Use Nonlinear PCA
* Use random forests to determine feature importances.

## PCA ##

In [None]:
def scatterplot_of_2pca(X, X_pca, labels, use_matplotlib = True, use_plotly = False):
    """Scatter the first two components of a projection, coloured by label.

    Args:
        X: Original frame; only its index is used, to index ``X_pca``.
        X_pca: Projected data (array or frame) with at least two columns.
        labels: Per-row values used to colour the points, or None.
        use_matplotlib: Draw the plot with matplotlib.
        use_plotly: Draw the plot with plotly express.

    Side effects:
        Shows the plot(s) and calls ``save_current_fig`` with a fixed file
        name, so successive calls overwrite the same file.
    """
    X_pca = pd.DataFrame(X_pca, index = X.index)

    # Select by position so the first two columns are used whatever they are named.
    first_component = X_pca.iloc[:, 0]
    second_component = X_pca.iloc[:, 1]

    if use_matplotlib:
        plt.scatter(x=first_component, y=second_component, c = labels, cmap = 'RdBu')
        plt.title('Scatterplot of 2 highest principal components')
        plt.show()

    if use_plotly:
        # Colour by the `labels` argument (not a global) and pass both axes by
        # keyword so the x series is not taken as the data_frame argument.
        fig = px.scatter(x = first_component, y = second_component, color = labels)
        fig.update_layout(title = 'Scatterplot of 2 highest principal components')
        fig.show()

    # NOTE(review): this runs after plt.show(); depending on the backend and on
    # how save_current_fig is implemented, the saved figure may be empty.
    save_current_fig('Scattplot_2_pcas_train.jpg')

**PCA on everything**

In [None]:
# Toggle for the linear PCA cells below (total set, unlabelled prediction, train set).
DO_PCA = False

In [None]:

if DO_PCA == True:
    PCA_N_COMPONENTS = 100

    # Instantiate components of the pipeline
    scaler = StandardScaler()
    pca = PCA(n_components = PCA_N_COMPONENTS)

    # Make and fit the pipeline
    pipeline = make_pipeline(scaler, pca)
    pipeline.fit(total_samples.fillna(0))

    # Project through the whole pipeline so the data gets the same
    # StandardScaler step the PCA was fitted behind. Calling pca.transform on
    # the MinMax-scaled array would skip that step and feed the PCA data on a
    # different scale than it was fitted on.
    total_pca = pipeline.transform(total_samples.fillna(0))

    # Show the explained variances of the PCA features.
    features = range(pca.n_components_)
    plt.bar(features, pca.explained_variance_ratio_)
    plt.title("Principal Component Analysis of Total Set")
    plt.xlabel('PCA feature')
    plt.ylabel('variance')
    plt.xticks(features)
    plt.show()

    print('Total variance with', PCA_N_COMPONENTS, 'components is', pca.explained_variance_ratio_[:PCA_N_COMPONENTS].sum())
    # NOTE(review): saving after plt.show() may store an empty figure,
    # depending on the backend and on save_current_fig.
    save_current_fig("Principal_Component_Analysis_Total_Set.jpg")

    scatterplot_of_2pca(total_samples, total_pca, labels = None)

    # Re-attach the original index so the three subsets can be selected by id.
    total_pca_df = pd.DataFrame(total_pca, index = total_index)

    X_tr_pca_labelled = total_pca_df.loc[train_y_baseline.index.unique(), :]
    X_tr_pca_unlabelled = total_pca_df.loc[train_unlabelled.index.unique(), :]
    X_test_pca = total_pca_df.loc[test_baseline.index.unique(), :]

    X_tr_pca_labelled.head()

**Predict accuracy group on unlabelled data**

In [None]:
# Train XGBoost on the labelled PCA components and predict an accuracy group
# for every unlabelled training row.
if DO_PCA == True:
    pca_xgb = xgb.XGBClassifier()
    pca_xgb.fit(X_tr_pca_labelled, y)
    
    unlabelled_pca_preds = pca_xgb.predict(X_tr_pca_unlabelled)
    # NOTE(review): predictions follow the row order of X_tr_pca_unlabelled
    # (selected via index.unique()), while the frame is indexed by
    # train_unlabelled.index -- these only line up if that index has no
    # duplicates; confirm.
    unlabelled_preds_df = pd.DataFrame(unlabelled_pca_preds, index = train_unlabelled.index, columns = ['accuracy_group'])
    unlabelled_preds_df.head()
    
    # Concatenate the unlabelled preds to total_y
    
    # Concatenate the unlabelled preds to total_y

**PCA on the training set**

In [None]:
if DO_PCA == True:
    PCA_N_COMPONENTS = 100

    # Instantiate components of the pipeline
    scaler = StandardScaler()
    pca = PCA(n_components = PCA_N_COMPONENTS)

    # Make and fit the pipeline
    pipeline = make_pipeline(scaler, pca)
    pipeline.fit(X_train.fillna(0))   # On the whole thing or just the training set?

    # Project through the whole pipeline so the data gets the same
    # StandardScaler step the PCA was fitted behind. Calling pca.transform on
    # the MinMax-scaled array would skip that step and feed the PCA data on a
    # different scale than it was fitted on.
    X_train_pca = pipeline.transform(X_train.fillna(0))

    # Show the explained variances of the PCA features.
    features = range(pca.n_components_)
    plt.bar(features, pca.explained_variance_ratio_)
    plt.title("Principal Component Analysis of Train Set")
    plt.xlabel('PCA feature')
    plt.ylabel('variance')
    plt.xticks(features)
    plt.show()

    print('Total variance with', PCA_N_COMPONENTS, 'components is', pca.explained_variance_ratio_[:PCA_N_COMPONENTS].sum())
    # NOTE(review): saving after plt.show() may store an empty figure,
    # depending on the backend and on save_current_fig.
    save_current_fig("Principal_Component_Analysis_Train_Set.jpg")

    # This Decodes the encoded compressed PCA information.  You can use this for your models.
    # NOTE: pca.inverse_transform maps back to the standardized feature space
    # (the PCA's input), not to the original feature units.
    X_train_PCA_inverse = pca.inverse_transform(X_train_pca)
    X_train_PCA_inverse = pd.DataFrame(data = X_train_PCA_inverse, index = X_train.index)

    scatterplot_of_2pca(X_train, X_train_pca, y_train)

**PCA on the test set**

In [None]:
# Toggle for fitting a separate PCA on the test set.
DO_PCA_ON_TEST = False

if DO_PCA_ON_TEST == True:
    # NOTE(review): a PCA fitted on the test set alone has its own component
    # basis, so X_test_pca produced here is not comparable with components
    # from a PCA fitted on the training data. PCA_N_COMPONENTS must also
    # already be defined by an earlier cell.
    scaler = StandardScaler()
    pca = PCA(n_components = PCA_N_COMPONENTS)

    # Make and fit the pipeline
    pipeline = make_pipeline(scaler, pca)

    pipeline.fit(X_test.fillna(0))

    # Project through the whole pipeline; pca.transform on the raw features
    # would skip the StandardScaler step the PCA was fitted behind.
    X_test_pca = pipeline.transform(X_test.fillna(0))

    # Show the explained variances of the PCA features.
    features = range(pca.n_components_)
    plt.bar(features, pca.explained_variance_)
    plt.title("Principal Component Analysis of Test Set")
    plt.xlabel('PCA feature')
    plt.ylabel('variance')
    plt.xticks(features)
    plt.show()

    print('Total variance with', PCA_N_COMPONENTS, 'components is', pca.explained_variance_ratio_[:PCA_N_COMPONENTS].sum())
    # NOTE(review): saving after plt.show() may store an empty figure,
    # depending on the backend and on save_current_fig.
    save_current_fig("Principal_Component_Analysis_Test_Set.jpg")

    scatterplot_of_2pca(X_test, X_test_pca, None)

    # NOTE: pca.inverse_transform maps back to the standardized feature space
    # (the PCA's input), not to the original feature units.
    X_test_PCA_inverse = pca.inverse_transform(X_test_pca)
    X_test_PCA_inverse = pd.DataFrame(data = X_test_PCA_inverse, index = X_test.index)

    fit_baseline_classifiers(tree_classifiers, X_train_pca, y)


### Kernel PCA ###

In [None]:
DO_KERNEL_PCA = True
PCA_N_COMPONENTS = 50

if DO_KERNEL_PCA == True:
    from sklearn.decomposition import KernelPCA

    def do_kernel_pca():
        pass

    # Fit the kernel PCA on the (scaled) training data.
    kernel_pca_train = KernelPCA(n_components = PCA_N_COMPONENTS, kernel = 'linear', fit_inverse_transform = True)

    kernel_pca_train_arr = kernel_pca_train.fit_transform(train_scaled)
    kernel_pca_train_df = pd.DataFrame(data = kernel_pca_train_arr, index = X_train.index)

    kernel_pca_train_inverse = kernel_pca_train.inverse_transform(kernel_pca_train_arr)
    kernel_pca_train_inverse_df = pd.DataFrame(data = kernel_pca_train_inverse, index = X_train.index)

    # Project the test data with the SAME fitted transformer. Fitting a second
    # KernelPCA on the test set (and with a different kernel) produces
    # components in an unrelated space, so a model trained on the train
    # components cannot make meaningful predictions from them.
    # kernel_pca_test is kept as a name for later cells; it is the train-fitted object.
    kernel_pca_test = kernel_pca_train

    kernel_pca_test_arr = kernel_pca_test.transform(test_scaled)
    kernel_pca_test_df = pd.DataFrame(data = kernel_pca_test_arr, index = X_test.index)

    kernel_pca_test_inverse = kernel_pca_test.inverse_transform(kernel_pca_test_arr)
    kernel_pca_test_inverse_df = pd.DataFrame(data = kernel_pca_test_inverse, index = X_test.index)

In [None]:
# Sanity check: the train features, their kernel-PCA projection and the
# targets should all have the same number of rows.
print(X_train.shape)
print(kernel_pca_train_df.shape)
print(y.shape)

In [None]:
# Plot the first two kernel-PCA components of the training set, coloured by target.
if DO_KERNEL_PCA == True:
    scatterplot_of_2pca(X = X_train, 
                        X_pca = kernel_pca_train_df, 
                        labels = y)

In [None]:
# Sanity check: the test projection and the test features should have the
# same number of rows.
print(kernel_pca_test_df.shape)
print(X_test.shape)

In [None]:
# Plot the first two kernel-PCA components of the test set (no labels).
# NOTE: scatterplot_of_2pca saves under a fixed file name, so this overwrites
# the figure saved for the training set.
if DO_KERNEL_PCA == True:
    scatterplot_of_2pca(X = X_test, 
                        X_pca = kernel_pca_test_df, 
                        labels = None)

In [None]:
# Score the tree classifiers on the kernel-PCA components of the training set.
if DO_KERNEL_PCA == True:
    fit_baseline_classifiers(tree_classifiers, kernel_pca_train_df, y)
    # Bare expression inside an `if`: evaluated but not displayed by the notebook.
    kernel_pca_train_df.head(2)

### Create submission file with nonlinear pca since it did well? ###

In [None]:
# Fit a default XGBoost classifier on the kernel-PCA components of the training set.
xgb_pca = xgb.XGBClassifier()
xgb_pca.fit(kernel_pca_train_df, y)

In [None]:
# Predict on the test components; these are only meaningful if the test set
# was projected into the same component space the model was trained on.
pca_preds = xgb_pca.predict(kernel_pca_test_df)
# Show a slice of the predictions for inspection.
pca_preds[200:600]


## Manifold Visualizations After PCA ##

### T-SNE Grid Search Experiment ###

In [None]:
# Toggle for the t-SNE cells (re-assigned here, so this value wins over any
# earlier setting of the same flag).
EXPLORE_WITH_TSNE = False

# t-SNE embedding of the training set's kernel-PCA components, coloured by target.
if EXPLORE_WITH_TSNE == True:
    from sklearn.manifold import TSNE
    
    tsne_model = TSNE(learning_rate = 250, verbose = 1, perplexity = 500)
    print(tsne_model)
    
    tsne_transformed = tsne_model.fit_transform(kernel_pca_train_df)
    
    # Define the x and y axes of the TSNE plot.
    tsne_xs = tsne_transformed[:,0]
    tsne_ys = tsne_transformed[:,1]

    # Plot the TSNE
    plt.scatter(tsne_xs, tsne_ys, c = y.values, cmap = 'RdBu')
    plt.title('T-SNE Clusters of Training Dataset - Axes do not have meaning')
    plt.show()
    # NOTE(review): saving after plt.show() may store an empty figure,
    # depending on the backend and on save_current_fig.
    save_current_fig('T_SNE_of_Training_Dataset.jpg')

In [None]:
# t-SNE embedding of the PCA components of all samples (no colouring).
# NOTE(review): total_pca is only created when the DO_PCA cell has run.
if EXPLORE_WITH_TSNE == True:
    from sklearn.manifold import TSNE
    
    tsne_model = TSNE(learning_rate = 250, verbose = 1, perplexity = 500)
    print(tsne_model)
    
    tsne_transformed = tsne_model.fit_transform(total_pca)
    
    # Define the x and y axes of the TSNE plot.
    tsne_xs = tsne_transformed[:,0]
    tsne_ys = tsne_transformed[:,1]

    # Plot the TSNE
    plt.scatter(tsne_xs, tsne_ys)
    plt.title('T-SNE Clusters of All datapoints - Axes do not have meaning')
    plt.show()
    # NOTE(review): saving after plt.show() may store an empty figure,
    # depending on the backend and on save_current_fig.
    save_current_fig('T_SNE_of_Total_Dataset.jpg')

In [None]:
EXPLORE_WITH_KMEANS = True

from sklearn.metrics.cluster import homogeneity_score


def explore_with_kmeans(X, data_name, ground_truth = None, n_clusters = N_CLUSTERS):
    """Cluster `X` with KMeans and report the cluster sizes and inertia.

    A count plot of the cluster assignments is drawn and handed to
    `save_current_fig`, the inertia is printed, and, when `ground_truth` is
    supplied, the homogeneity of the clusters against it is printed too.

    Parameters
    ----------
    X : feature matrix to cluster. Scale it beforehand (min-max or standard
        scaling); this function does not scale.
    data_name : label used in the plot title, the file name and the printouts.
    ground_truth : optional known class labels, one per row of `X`.
    n_clusters : number of KMeans clusters.

    Returns
    -------
    None
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(X)
    kmeans_preds = kmeans.predict(X)

    # Distribution of cluster sizes (raw counts, not normalised).
    kmeans_filename = 'Count_Plot_' + str(data_name) + '_Cluster_Counts.jpg'
    sns.countplot(x=kmeans_preds).set_title(kmeans_filename)
    save_current_fig(kmeans_filename)

    print('KMEANS Intertia of', data_name, ':', kmeans.inertia_)
    # ADD - Write intertia to report.

    # Homogeneity can only be computed when class labels are available.
    if ground_truth is None:
        return

    # Still best-effort (e.g. labels and rows may not line up), but the reason
    # for skipping is reported instead of being silently discarded.
    try:
        score = homogeneity_score(ground_truth, kmeans_preds)
    except Exception as err:
        print('Could not compute homogeneity score for', data_name, ':', repr(err))
    else:
        print('Homogeneity score is:', score)



In [None]:
# Cluster the kernel-PCA training features into four groups and compare the
# clusters against the training targets.
explore_with_kmeans(kernel_pca_train_df, 'Train', ground_truth=y_train, n_clusters=4)

In [None]:
# Cluster the kernel-PCA test features with the default cluster count; no
# ground truth is passed for the test set.
explore_with_kmeans(kernel_pca_test_df, data_name='Test')

### Train/Test Splits - Several Splits to Test Comparative Accuracy ###

In [None]:
# Choose the train and test frames to work with:
#   - `train` / `test` for the "original" dataframe
#   - `X_train_poly` / `X_test_poly` for polynomial features
# X_train = train
# X_test = test

# Keep only the targets whose index label also occurs in X_train's index.
rows_with_features = y_train.index.isin(X_train.index.unique())
y_train = y_train[rows_with_features]

print('Make sure features have good shape')
print(X_train.shape, '\n', y_train.shape, '\n', X_test.shape)

In [None]:
# Stratified, shuffled hold-out split of the full training data.
# Use this when working with the original dataframe. Classifiers must be
# retrained from scratch on these splits so as not to overfit.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    shuffle=True,
)
print('Splits', X_train_s.shape, y_train_s.shape, X_test_s.shape, y_test_s.shape)


### Fit Models After Feature Selection, Semi-Supervised Labelling, etc. ###

### Instantiate All Classifiers and Fit ###

Feature elimination and tpot automl

In [None]:
# Feature Selection + Hyperparameter Tuning Functions

# Recursive Feature Elimination... Return Best Features
def recursive_feature_elimination(classifier, X_train, y_train):
    """Fit an RFECV selector around `classifier` and show its result.

    Returns None; the selected features are not returned to the caller.

    NOTE(review): `C=1` is forwarded to RFECV unchanged and `.show()` is
    called on the fitted object, so this presumably targets a visualizer-style
    RFECV rather than scikit-learn's own class. Confirm against the imports.
    """
    selector = RFECV(classifier, C=1)
    selector.fit(X_train, y_train)
    selector.show()

# Bayesian Hyperparamter Tuning

# TPOT AutoML  Tuning
# Add y_test as a parameter when doing train_test splits and validation.
def tpot_automl(X_train, y_train, X_test, generations=4, population_size=20, verbosity=3):
    """Search for a pipeline with TPOT and predict on `X_test`.

    The search uses 5-fold cross-validation and a fixed random state of 42.
    The fitted pipeline is printed and exported to 'tpot_pipeline.py'.

    Parameters
    ----------
    X_train, y_train : training features and targets used for the search.
    X_test : features to predict on with the best pipeline found.
    generations, population_size, verbosity : forwarded to TPOTClassifier.

    Returns
    -------
    The predictions of the fitted TPOT pipeline for `X_test`.
    """
    search_settings = dict(
        generations=generations,
        population_size=population_size,
        cv=5,
        random_state=42,
        verbosity=verbosity,
    )
    automl = TPOTClassifier(**search_settings)
    automl.fit(X_train, y_train)

    # Scoring is left out because no y_test is passed in:
    # automl.score(X_test, y_test)
    predictions = automl.predict(X_test)

    print(automl.fitted_pipeline_)
    automl.export('tpot_pipeline.py')

    return predictions

In [None]:
# These are only the estimator instances; they have to be refit on the splits
# so as not to overfit.
# When USE_SAME_CLASSIFIERS_FOR_END is True nothing is created here and the
# classifiers defined earlier in the notebook are reused.
USE_SAME_CLASSIFIERS_FOR_END = True

if not USE_SAME_CLASSIFIERS_FOR_END:
    # Random Forest Classifier
    rf_clf = RandomForestClassifier(n_estimators=500,
                                    max_depth=7,
                                    class_weight='balanced',
                                    n_jobs=-1)

    # XGBoost Classifier
    # xgb_data_matrix = xgb.DMatrix(data = X_train_s, label = y_train_s)
    # NOTE(review): `num_feature` and `nfold` are passed through unchanged;
    # confirm XGBClassifier actually honours them.
    xgb_clf = xgb.XGBClassifier(n_jobs=-1,
                                num_feature=30,
                                nfold=5)

    # LightGBM Classifier
    lb_clf = lgb.LGBMClassifier(min_gain_to_split=0.9,
                                objective='multiclass',
                                is_unbalance=True,
                                lambda_l1=8)

    # Catboost Classifier
    cb_clf = cb.CatBoostClassifier(verbose=0)

    # Bagging Classifier (wraps the CatBoost estimator)
    bag_clf = BaggingClassifier(base_estimator=cb_clf, n_estimators=10, random_state=0, n_jobs=-1)

    # Older, longer list of classifiers kept for reference:
    ## classifiers = [('Random Forest', rf_clf), ('XGBoost', xgb_clf), ('Bagging Classifier', bag_clf)]

    # Just the three best. The LightGBM entry must use `lb_clf`, the name the
    # estimator is bound to above; the list previously referenced an undefined
    # `lgb_clf` and raised NameError.
    classifiers = [('XGBoost', xgb_clf), ('LightGBM', lb_clf), ('Catboost', cb_clf)]

    # Empty dataframe to collect predictions
    preds = pd.DataFrame()

**Stratified KFold Loop**

In [None]:
def kfold_loop(classifiers, X_train, y_train):
    """Fit `classifiers` on each fold of a 3-fold stratified, shuffled split.

    For every fold the fold number and the training-fold shape are printed,
    then `fit_classifiers` is called with the training and held-out parts.

    Parameters
    ----------
    classifiers : passed through to `fit_classifiers` unchanged.
    X_train, y_train : pandas objects, indexed positionally with `.iloc`.
    """
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)
    fold_indices = splitter.split(X_train, y_train)

    for fold, (train_index, test_index) in enumerate(fold_indices):
        # Slice out the training and held-out parts of this fold.
        cv_X_train = X_train.iloc[train_index]
        cv_X_test = X_train.iloc[test_index]
        cv_y_train = y_train.iloc[train_index]
        cv_y_test = y_train.iloc[test_index]

        print('Fold: {}'.format(fold))
        print('CV train shape: {}'.format(cv_X_train.shape))

        fit_classifiers(classifiers, cv_X_train, cv_y_train, cv_X_test, cv_y_test)


In [None]:
# kfold_loop(classifiers, X_train, y_train)

**Fit to Experimental Validation Sets**

In [None]:
# Run the classifiers without polynomial features.
# TODO: remove 'baseline' from the classifier name.

print('Running classifiers after feature modifications, compared to baseline')

# The training split is handed over as a plain NumPy array.
split_features = np.array(X_train_s)
fit_baseline_classifiers(tree_classifiers, split_features, y_train_s)


### LightGBM Feature Importances ###

In [None]:
# Switch for the LightGBM feature-importance plot.
LB_FEAT_IMP = False

if LB_FEAT_IMP:
    lgbm_feat_params = dict(
        min_gain_to_split=0.9,
        objective='multiclass',
        is_unbalance=True,
        lambda_l1=8,
    )
    lb_clf_feat = lgb.LGBMClassifier(**lgbm_feat_params)

    # Fit on the training split and plot the 50 most important features.
    lb_clf_feat.fit(X_train_s, y_train_s)
    lgb.plot_importance(
        lb_clf_feat,
        max_num_features=50,
        figsize=(20, 20),
        title='Feature Importance of LightGBM Model',
    )
    save_current_fig('Feature_importance_of_lightGBM')

### Plot Decision Regions with several high-importance variables ###

In [None]:
def plot_2d_decision_regions(X, y, clf):
    """Thin wrapper: draw the decision regions of `clf` over `X` and `y`.

    Returns whatever `plot_decision_regions` returns.
    """
    regions = plot_decision_regions(X=X, y=y, clf=clf)
    return regions

In [None]:
# The two features whose decision regions are plotted.
var1 = 'accuracy'
var2 = 'accuracy_min'

# Two-column feature matrix (missing values replaced by 0) and the targets,
# both converted to NumPy arrays for the plotting helper.
X_plot_decision = np.array(X_train_s[[var1, var2]].fillna(0))
y_plot_decision = np.array(y_train_s)

# Fit a default XGBoost classifier on just these two features.
xgb_dec_clf = xgb.XGBClassifier()
xgb_dec_clf.fit(X_plot_decision, y_train_s)

# See if two dimensions is enough to converge on a decision region.
ax = plot_2d_decision_regions(X_plot_decision, y_plot_decision, xgb_dec_clf)
# print(accuracy_score())
plt.title('Decision Regions of ' + str(var1) + ' and ' + str(var2))
save_current_fig('Decision_Region.jpg')

**Metrics Helper Functions**

In [None]:
# Reporting Functions
def display_confusion_matrix(classifier, X_train, y_train, X_test, y_test):
    """Show a confusion matrix for `classifier`.

    The classifier is wrapped in a ConfusionMatrix visualizer, which is fitted
    on the training data, scored on the test data and then shown.

    Reporting is best-effort: any failure is printed and the function returns
    None instead of raising, so one classifier cannot abort a whole report.
    """
    try:
        cm = ConfusionMatrix(classifier)
        cm.fit(X_train, y_train)
        cm.score(X_test, y_test)
        cm.show()
    except Exception as err:
        # Report the reason instead of hiding it; a bare `except` would also
        # have swallowed KeyboardInterrupt.
        print('Could not display confusion matrix:', repr(err))


def display_classification_report(classifier, X_train, y_train, X_test, y_test):
    """Show a classification report for `classifier`.

    The classifier is wrapped in a ClassificationReport visualizer, which is
    fitted on the training data, scored on the test data and then shown.

    Reporting is best-effort: any failure is printed and the function returns
    None instead of raising, so one classifier cannot abort a whole report.
    """
    try:
        cr = ClassificationReport(classifier)
        cr.fit(X_train, y_train)
        cr.score(X_test, y_test)
        cr.show()
    except Exception as err:
        # Report the reason instead of hiding it; a bare `except` would also
        # have swallowed KeyboardInterrupt.
        print('Could not display classification report:', repr(err))
    
def display_ROC(classifier, X_train, y_train, X_test, y_test):
    """Show ROC curves for `classifier`.

    The classifier is wrapped in a ROCAUC visualizer, which is fitted on the
    training data, scored on the test data and then shown.

    Reporting is best-effort: any failure is printed and the function returns
    None instead of raising, so one classifier cannot abort a whole report.
    """
    try:
        roc = ROCAUC(classifier)
        roc.fit(X_train, y_train)
        roc.score(X_test, y_test)
        roc.show()
    except Exception as err:
        # Report the reason instead of hiding it; a bare `except` would also
        # have swallowed KeyboardInterrupt.
        print('Could not display ROC curve:', repr(err))
    
def display_feature_importances(classifier, X_train, y_train):
    """Print the feature list produced by a FeatureImportances visualizer.

    The classifier is wrapped in a FeatureImportances visualizer, fitted on
    the training data, and the visualizer's `features_` are printed as a list.

    Not every classifier supports this, so it is best-effort: any failure is
    printed and the function returns None instead of raising.
    """
    try:
        feature_imp = FeatureImportances(classifier)
        feature_imp.fit(X_train, y_train)
        print(list(feature_imp.features_))
    except Exception as err:
        # Report the reason instead of hiding it; a bare `except` would also
        # have swallowed KeyboardInterrupt.
        print('Could not display feature importances:', repr(err))

In [None]:
# Display important reportings here
# Ideally modify functions for subplots

def report_all(classifiers, X_train_display, y_train_display, X_test_display, y_test_display):
    """Run every reporting helper for each classifier in `classifiers`.

    Parameters
    ----------
    classifiers : iterable of (name, estimator) pairs.
    X_train_display, y_train_display : data the visualizers are fitted on.
    X_test_display, y_test_display : data the visualizers are scored on.

    For each pair the confusion matrix, classification report, ROC curves and
    feature importances are displayed, in that order. Returns None.
    """
    for clf_name, classifier in classifiers:

        display_confusion_matrix(classifier, X_train_display, y_train_display, X_test_display, y_test_display)
        display_classification_report(classifier, X_train_display, y_train_display, X_test_display, y_test_display)
        display_ROC(classifier, X_train_display, y_train_display, X_test_display, y_test_display)

        # Not all classifiers will have this.
        # The estimator itself must be passed; the helper function used to be
        # passed in its place, so importances were never computed.
        try:
            display_feature_importances(classifier, X_train_display, y_train_display)
        except Exception:
            print('did not compute feature importance for: ', clf_name)

In [None]:
# Set the report to the features and then display
# Point the report at the validation splits.
X_train_display = X_train_s
y_train_display = y_train_s
X_test_display = X_test_s
y_test_display = y_test_s

# Report on a shallow copy of the tree classifiers.
# NOTE(review): the original remark says Catboost should be removed because it
# causes problems in the reporting, but nothing is removed from the copy here.
classifiers_report = tree_classifiers.copy()

report_all(classifiers_report, X_train_display, y_train_display, X_test_display, y_test_display)

**Reporting code - Consider removing?**

In [None]:
# Set the report to the regular features and then display
## X_train_display, y_train_display, X_test_display, y_test_display = X_train_s, y_train_s, X_test_s, y_test_s

# Report all regular features
# This throws an error because I fit the same classifiers and now they have different features.
# I'd have to run this before and after I fit them unless I have the functions return values.

## report_all(classifiers, X_train_display, y_train_display, X_test_display, y_test_display)

## Deep Learning ##

In [None]:
# Master switch for the neural-network cells below: when True the Keras models
# are built, trained, plotted and used to produce `dl_preds`.
USE_DEEP_LEARNING = True

In [None]:
if USE_DEEP_LEARNING:
    from keras.utils import to_categorical
    from keras import models, layers
    from keras.callbacks import EarlyStopping, ModelCheckpoint
    from keras.optimizers import SGD

    print('Splits', X_train_s.shape, y_train_s.shape, X_test_s.shape, y_test_s.shape)

    # DL Parameters
    var_input_shape = (X_train_s.shape[1], )
    layer_num = 400        # units per hidden layer
    activation = 'relu'
    epochs = 40
    batch_size = 20
    n_classes = 4          # width of the softmax output layer
    n_hidden_layers = 6
    dropout_rate = 0.5

    # Scaling: fit the scaler on the training split only.
    X_train_s_imp = X_train_s.fillna(0)
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_s_imp)
    X_train_dl = X_train_scaled
    print('Scaled the X training split')
    print(X_train_dl[1][:5])

    # Reuse the training scaler for the validation split. Fitting a second
    # scaler on the validation data would map the same raw value to different
    # scaled values in the two sets and make the validation score unreliable.
    X_test_s_imp = X_test_s.fillna(0)
    X_test_scaled = scaler.transform(X_test_s_imp)
    X_test_dl = X_test_scaled
    print('Scaled the X test split')
    print(X_test_dl[1][:5])

    # One-hot encode the targets. num_classes is fixed so the shape is always
    # (n, n_classes) and matches the output layer, even if a split happens to
    # miss the highest class.
    y_train_dl = to_categorical(np.array(y_train_s), num_classes=n_classes)
    print('Categorically encoded targets of training data.')
    print(y_train_dl[1])

    y_test_dl = to_categorical(np.array(y_test_s), num_classes=n_classes)
    print('Categorically encoded targets of testing data.')
    print(y_test_dl[1])

    print('Splits', X_train_dl.shape, y_train_dl.shape, X_test_dl.shape, y_test_dl.shape)

    # The model stacks n_hidden_layers dense layers of layer_num units, each
    # followed by a dropout layer that zeroes half of the activations during
    # training, to encourage the learning to spread throughout the network.
    network = models.Sequential()
    for layer_idx in range(n_hidden_layers):
        # Only the first layer declares the input shape.
        dense_kwargs = {'activation': activation}
        if layer_idx == 0:
            dense_kwargs['input_shape'] = var_input_shape
        network.add(layers.Dense(layer_num, **dense_kwargs))
        network.add(layers.Dropout(dropout_rate))

    # Final layer
    network.add(layers.Dense(n_classes, activation='softmax'))

    sgd = SGD(lr=0.005, momentum=0.4, nesterov=True)

    # Categorical cross entropy for multi-class problems
    network.compile(optimizer=sgd,
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

    # early_stopping = EarlyStopping(monitor='val_accuracy', patience = 8)

    # Keep only the best epoch on disk.
    model_save = ModelCheckpoint('best_model.hdf5',
                                 save_best_only=True)

    dl_history = network.fit(X_train_dl,
                             y_train_dl,
                             epochs=epochs,
                             batch_size=batch_size,
                             verbose=2,
                             validation_data=(X_test_dl, y_test_dl),
                             callbacks=[model_save])

In [None]:
if USE_DEEP_LEARNING:
    # Training vs. validation accuracy per epoch, from the Keras fit history.
    plt.figure()
    plt.plot(dl_history.history['accuracy'])
    plt.plot(dl_history.history['val_accuracy'])
    plt.title('Keras Model Accuracy on Training set and Validation Set')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend(['Train','Test'])

    # Save BEFORE showing: once plt.show() has rendered the figure, notebook
    # backends release it, and a later save would write an empty canvas
    # (assuming save_current_fig saves the current pyplot figure).
    save_current_fig('Keras_Model_Acc_Training_vs_validation_set.jpg')
    plt.show()

**Preprocess Submission X_test for Neural Network**

In [None]:
if USE_DEEP_LEARNING:
    from keras.optimizers import SGD

    # The submission model is trained on ALL the training data (more samples
    # than the validation split), then applied to the real test set.
    n_classes = 4
    n_hidden_layers = 6
    dropout_rate = 0.5

    X_train_submit_dl = X_train.fillna(0)
    scaler_train_submit = MinMaxScaler()
    X_submit_scaled = scaler_train_submit.fit_transform(X_train_submit_dl)
    print('Scaling X of the entire TRAINING set for ensembling with neural network')
    print(X_submit_scaled[1][:5])

    # Scale the test set with the scaler fitted on the training set. A scaler
    # fitted on the test set alone would put the test features on a different
    # scale from the one the network is trained on, which degrades predictions.
    # NOTE(review): this assumes X_test has the same columns, in the same
    # order, as X_train. Confirm.
    X_test_submit_dl = X_test.fillna(0)
    X_test_submit_scaled = scaler_train_submit.transform(X_test_submit_dl)
    print('Scaling X of the entire TEST set for ensembling with neural network')
    print(X_test_submit_scaled[1][:5])

    # num_classes is fixed so the encoded shape always matches the output layer.
    y_train_submit_dl = to_categorical(np.array(y_train), num_classes=n_classes)
    print('Categorically encode targets of ALL the training set.')
    print(y_train_submit_dl[1])

    # Same architecture as the validated network: every hidden layer,
    # including the last one, is followed by dropout.
    network_submit = models.Sequential()
    for layer_idx in range(n_hidden_layers):
        # Only the first layer declares the input shape.
        dense_kwargs = {'activation': 'relu'}
        if layer_idx == 0:
            dense_kwargs['input_shape'] = (X_submit_scaled.shape[1], )
        network_submit.add(layers.Dense(layer_num, **dense_kwargs))
        network_submit.add(layers.Dropout(dropout_rate))

    # Final layer
    network_submit.add(layers.Dense(n_classes, activation='softmax'))

    # A fresh optimizer with the same settings as the validation run, so no
    # optimizer state is shared between the two models.
    sgd_submit = SGD(lr=0.005, momentum=0.4, nesterov=True)

    # Categorical cross entropy for multi-class problems
    network_submit.compile(optimizer=sgd_submit,
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])

    # early_stopping = EarlyStopping(monitor='val_accuracy', patience = 8)

    # No validation data is passed to fit() below, so the checkpoint has to
    # monitor the training loss; the default 'val_loss' is never available
    # here and nothing would be saved.
    model_save = ModelCheckpoint('best_model.hdf5',
                                 monitor='loss',
                                 save_best_only=True)

    network_submit.fit(X_submit_scaled,
                       y_train_submit_dl,
                       epochs=epochs,
                       batch_size=batch_size,
                       verbose=2,
                       callbacks=[model_save])

    print(X_test_submit_scaled[1][:5])

In [None]:
if USE_DEEP_LEARNING:
    # Sequential.predict_classes() is no longer available in current Keras
    # releases. For a softmax output it is the argmax over the predicted class
    # probabilities, which works on old and new versions alike.
    dl_probabilities = network_submit.predict(X_test_submit_scaled)
    dl_preds = np.argmax(dl_probabilities, axis=-1)

    # NOTE(review): `model_prediction` must already exist when this cell runs;
    # the File Submission cell further down re-creates it from scratch, so this
    # column does not survive into the submission file.
    model_prediction['NN_preds'] = dl_preds

    print(model_prediction.head(100))

### Post Submission Reports for Modification ###

# File Submission #

In [None]:
# Data used for the submission: the WHOLE training set, not the splits.
X_train_submit = X_train
y_train_submit = y_train
X_test_submit = X_test

# CHOOSE WHICH SUBMISSION PREDICTIONS TO USE HERE
final_submission_preds_name = "Neural Network Preds"
final_submission_preds = dl_preds

print(X_train_submit.shape, y_train_submit.shape, X_test_submit.shape)

# Start from an empty dataframe and copy the test-set index into it as strings.
model_prediction = pd.DataFrame()
model_prediction['installation_id'] = X_test_submit.index.astype(str)

print(model_prediction.shape)

# Choose the final model here
lgbm_submission_params = dict(
    min_gain_to_split=0.9,
    objective='multiclass',
    is_unbalance=True,
    lambda_l1=8,
)
submission_model = lgb.LGBMClassifier(**lgbm_submission_params)

# Fit the model on the full training data.
submission_model.fit(np.array(X_train_submit), y_train_submit)

# IF FITTING HERE: predict on the test set from the fitted model, e.g.
#     submission_preds = submission_model.predict(X_test_submit)
# Otherwise use the prediction output of a different model
# (like baseline_preds or PCA_preds), which is what happens below.
submission_preds = final_submission_preds

# Load the chosen predictions into the submission dataframe as integers.
model_prediction['accuracy_group'] = submission_preds.astype(int)

# Write the predictions out as a csv file.
model_prediction.to_csv('submission.csv', index=False)

# Confirm everything looks ok.
print('Submission file created from', final_submission_preds_name)
print(model_prediction.head(30))
print(model_prediction.shape)
print(model_prediction.info())
print(model_prediction['accuracy_group'].value_counts())

# Check the output file was created in the right spot by listing every file
# under /kaggle/.
import os
for folder, _, file_names in os.walk('/kaggle/'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))