In [None]:
'''General Header for Python Operations'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import warnings
import os

# set graphics and print options
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = (15,20)
pd.set_option('precision', 3)
np.set_printoptions(precision=3)

# hide warnings
warnings.filterwarnings('ignore')

# Index every file under the Kaggle input directory by the part of its name that
# precedes the first dot, printing each full path as it is found.
files_dict = {}
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        stem = name.partition('.')[0]
        full_path = os.path.join(root, name)
        files_dict[stem] = full_path
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

- train.csv - the training data, one product (id) per row, with the associated features (feature_*) and class label (target)

- test.csv - the test data; you must predict the probability the id belongs to each class

- sample_submission.csv - a sample submission file in the correct format

In [None]:
# Import Datasets
# Read every discovered path with pandas, keeping the same key as in files_dict.
data_dict = {name: pd.read_csv(path) for name, path in files_dict.items()}


In [None]:
#Examine Data
# Show the first rows and the column/dtype summary of every loaded dataset.
for name, frame in data_dict.items():
    print(f'\n {name}')
    display(frame.head())
    # DataFrame.info() prints its report and returns None, so it is called
    # directly; wrapping it in display() only emitted a stray "None".
    frame.info()

In [None]:
# Check For NaN's
train_df = data_dict['train']
print('Any Features with NaN?')
# True when at least one column contains a missing value
nan_counts = train_df.isna().sum()
bool((nan_counts > 0).any())

# EDA

In [None]:
# Check Balance of training set classes
# Summary statistics per target class; the `count` entries give the class sizes.
by_class = train_df.groupby('target')
by_class.describe()

The classes are highly imbalanced. Classes 3 and 4 are extremely underrepresented. Models were initially built after undersampling the majority classes to even out the data set, but that resulted in a data set that was too small. So, training should be done on the data as is.

In [None]:
# check feature cardinality
# Number of distinct (non-null) values in each column, in column order
for col, n_unique in train_df.nunique().items():
    print(f'{col}: {n_unique} unique values.')

In [None]:
# Look at descriptive stats for each feature column
# Columns are sliced [1:-1] as in the other EDA cells, which leaves out the first
# column (presumably the row id — confirm) and the final `target` column.
for col in train_df.columns[1:-1]:
    summary = train_df[col].describe()
    print(f"{col}: Mean {summary['mean']:0.4f}, Med: {summary['50%']:0.4f}, "
          f"Std: {summary['std']:0.4f}, Range: ({summary['min']}, {summary['max']})")

All values are non-negative and appear to be badly skewed right

In [None]:
# check normality of features
# Only the feature columns are tested (same [1:-1] slice as the plots below);
# the first column is skipped because it is presumably the row id — confirm.
# NOTE(review): SciPy documents that Shapiro-Wilk p-values may be inaccurate for
# samples larger than 5000 rows; treat these as indicative if the training set is large.
for col in train_df.columns[1:-1]:
    result = stats.shapiro(train_df[col])
    print(f'{col}: SW test p-value = {result.pvalue}')

None of the features are normally distributed

In [None]:
# Examine Hist of all features
# One histogram per column, skipping the first and last columns of train_df
hist_cols = train_df.columns[1:-1]
feature_frame = train_df[hist_cols]
_ = feature_frame.hist(figsize=(15, 40), layout=(15, 5), bins=40)

As expected, all of the features are skewed right, so we'll try a Box-Cox-style power transform to make them more symmetric.

In [None]:
# Examine Hist of all features with power transforms and iterate to find best by hand
# Exponent currently under inspection: 1/2 (square root)
exponent = 1/2.
powered = train_df[train_df.columns[1:-1]] ** exponent
_ = powered.hist(figsize=(15, 40), layout=(15, 5), bins=40)

None of them look great, but the square root is as good as any other transform.

Looking at correlation between features...

In [None]:
# Examine inter-feature correlations
# Pairwise correlation matrix of the feature columns
m = train_df[train_df.columns[1:-1]].corr()
# Boolean mask covering the upper triangle (diagonal included) so each pair is drawn once
msk = np.triu(np.ones(m.shape, dtype=bool))
plt.figure(figsize=(20, 20))
_ = sns.heatmap(m, mask=msk, cmap='coolwarm', annot=False, cbar=False)


Looks like there are some highly correlated features, so we'll look at performing pca.

In [None]:
# Examine singular values
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transform followed by standardization
sclr = PowerTransformer(method='yeo-johnson', standardize=True)

# Fit a full-rank PCA on the transformed feature columns
pca_cols = train_df.columns[1:-1]
transformed = sclr.fit_transform(train_df[pca_cols])
pca = PCA().fit(transformed)

# Cumulative explained-variance curve against the number of components kept
n_components_axis = np.arange(1, len(pca_cols) + 1)
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)

plt.figure(figsize=(10, 7))
plt.plot(n_components_axis, cumulative_ratio)
# dotted reference line at 95% explained variance, spanning the current x-limits
plt.hlines(0.95, *plt.xlim(), colors='k', linestyles='dotted', alpha=0.5)
plt.ylabel('Cumulative Explained Var Ratio')
plt.xlabel('Number of Components')
_ = plt.title('PCA of Raw Features', fontweight='bold')

There are roughly 68 components required to explain 95% of the variance in the data, so we'll use PCA to eliminate the extra feature count.

In [None]:
# Find number of components to explain 95% of var
# A float n_components asks PCA to keep as many components as needed for that variance share
transformed = sclr.fit_transform(train_df[train_df.columns[1:-1]])
pca = PCA(n_components=0.95)
pca.fit(transformed)
explained_total = np.sum(pca.explained_variance_ratio_)
print(f'95% Var Number of Components: {pca.n_components_}, Explained Variance: {explained_total}')

# Prepare Data for ML

### Cap data since it's all non-negative and skewed right

In [None]:
# Cap data at 99th percentile
# Selecting columns returns a new frame, so clip(..., inplace=True) on that selection
# only changed a temporary copy and left train_df uncapped. Assign the clipped
# values back explicitly instead.
cap_cols = train_df.columns[1:-1]
upper_caps = train_df[cap_cols].quantile(0.99)   # one cap per column
train_df[cap_cols] = train_df[cap_cols].clip(upper=upper_caps, axis=1)

### Encode the Target

In [None]:
# Encode Target variable
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping, then store the encoded target as a new column
target_le = LabelEncoder()
target_le.fit(train_df['target'])
train_df['target_enc'] = target_le.transform(train_df['target'])

In [None]:
# Save raw feature column names
# Positional slice: drops the first column and the last two
# (`target` and the `target_enc` column appended by the encoding cell).
raw_cols = train_df.columns[1:-2].tolist()

## Perform Yeo-Johnson Transform, Standardization, PCA on transformed data, separate out features and target, split into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split

pca_tr = PCA(n_components=70)

# Separate Targets and Features
# The power transform and the PCA projection are fitted on the training features.
X = pd.DataFrame(pca_tr.fit_transform(sclr.fit_transform(train_df[raw_cols])))
# The test set is only *transformed* with those fitted objects. Refitting them on the
# test data would learn different transform parameters and a different PCA basis, so
# the test columns would no longer mean the same thing as the training columns.
X_test = pd.DataFrame(pca_tr.transform(sclr.transform(data_dict['test'][raw_cols])))
y = train_df['target_enc']
# NOTE(review): the transformers are fitted before the train/validation split below,
# so the validation rows also influence them — consider fitting on X_train only.


print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print()

# Stratified train validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state= 42, test_size=0.20)
print(f'X Train {X_train.shape}')
print(f'X Val {X_val.shape}')
print(f'X Test {X_test.shape}')
print(f'y Train {y_train.shape}')
print(f'y Val {y_val.shape}')

In [None]:
# Check Category Percentages from stratification
# Class shares in the full set and in each split, printed in the same layout
for split_name, labels in (('Original', y), ('Training', y_train), ('Validation', y_val)):
    print(f'{split_name} Splits:')
    print(labels.value_counts() / len(labels), '\n')

# Model Selection
The competition metric is log loss.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, accuracy_score
import joblib

def clfr_perfomance(y_true,X_, model):
    """Print the accuracy and log loss of a fitted classifier.

    Parameters
    ----------
    y_true : true labels for the rows of ``X_``.
    X_ : feature matrix passed to the model.
    model : fitted estimator exposing ``predict`` and ``predict_proba``.

    Returns None; both metrics are printed with four decimals.
    """
    accuracy = accuracy_score(y_true, model.predict(X_))
    print(f'Accuracy: {accuracy:0.4f}')
    loss = log_loss(y_true, model.predict_proba(X_))
    print(f'Log Loss Function: {loss:0.4f}')

Since we need to predict class probabilities, find all the classifiers in sklearn that provide a `predict_proba` method.

In [None]:
from sklearn.utils import all_estimators

# (name, class) pairs for every estimator scikit-learn exposes
estimators = all_estimators()

# Keep only the estimators whose class defines predict_proba and print their names
proba_capable = (est_name for est_name, est_cls in estimators
                 if hasattr(est_cls, 'predict_proba'))
for est_name in proba_capable:
    print(est_name)

## Setup Gridsearch with crossvalidation to opt hyperparams

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Candidate classifiers, keyed by a short name that is shared with the grids below
mdls = dict(
    rf=RandomForestClassifier(n_jobs=-1),
    gbc=GradientBoostingClassifier(),
    mlp=MLPClassifier(),
)

# Hyper-parameter grids, one per key in `mdls`
prms = {
    'rf': {
        'n_estimators': [8, 16, 32, 64, 128],   # powers of two, 2**3 .. 2**7
        'max_depth': [8, 16, 32, 64, None],
    },
    'gbc': {
        'n_estimators': [250, 500],
        'max_depth': [1, 5, 9],
        'learning_rate': [0.001, 0.01, 0.1],
    },
    'mlp': {
        'hidden_layer_sizes': [(10,), (50,), (100,), (200,)],
        'activation': ['logistic', 'tanh', 'relu'],
        # NOTE(review): scikit-learn documents this schedule as only used with
        # solver='sgd'; with the default solver these three values presumably
        # produce equivalent fits and triple the search cost — confirm.
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
}

# Train Models for Evaluation

In [None]:
def train_model(key,models,params):
    """Grid-search one model with 5-fold cross-validation and pickle the best estimator.

    Parameters
    ----------
    key : name used to look up the estimator in ``models`` and its grid in
        ``params``; also the prefix of the output file ``<key>_tr.pkl``.
    models : mapping of key -> unfitted estimator.
    params : mapping of key -> parameter grid for GridSearchCV.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and scored
    with negative log loss. The best parameters and score are printed, the best
    estimator is dumped with joblib, and nothing is returned.
    """
    print(f'Training model: {key}')
    search = GridSearchCV(
        estimator=models[key],
        param_grid=params[key],
        cv=5,
        scoring='neg_log_loss',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)
    print(f'Best Estimator: {search.best_params_}')
    print(f'Best Estimator Score: {search.best_score_}')
    joblib.dump(search.best_estimator_, f'{key}_tr.pkl')

Training model: rf <br>
Fitting 5 folds for each of 25 candidates, totalling 125 fits<br>
Best Estimator: {'max_depth': 16, 'n_estimators': 128}<br>
Best Estimator Score: -1.7731683449052305<br>


Training model: mlp <br>
Fitting 5 folds for each of 36 candidates, totalling 180 fits<br>
Best Estimator: {'activation': 'tanh', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}<br>
Best Estimator Score: -1.7609277700753556<br>


CV Score for MLP min: 1.7457966814685377, max: 1.756144311735689, mean: 1.7520049992817097, median: 1.7515735323667034

Training model: gbc <br>
Fitting 5 folds for each of 18 candidates, totalling 90 fits <br>
*Did not converge in a reasonable amount of time*

# Final Model Training and Optimization
The multilayer perceptron performed the best, so we will use a Keras MLP as the final model

In [None]:
from keras.models import Sequential
from keras.layers import *
from keras.losses import SparseCategoricalCrossentropy

# Define the model
# 70 inputs (one per PCA component) -> tanh layer -> dropout -> relu layer -> 9 class outputs
model = Sequential()
model.add(Dense(70, input_dim = 70, activation='tanh', name='input'))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu', name = 'hidden'))
# softmax (not sigmoid): each row belongs to exactly one class, and categorical
# cross-entropy expects the outputs to form a probability distribution that sums
# to 1. Independent sigmoid units do not, which distorts both training and the
# submitted class probabilities.
model.add(Dense(9, activation='softmax', name = 'output'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

### Callbacks

In [None]:
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
# Both callbacks watch the training loss.
# With the library defaults (EarlyStopping patience=0, ReduceLROnPlateau patience=10)
# training stopped on the first non-improving epoch, so the learning-rate reduction
# could never trigger. Explicit patiences let the LR schedule act before stopping.
lr_dec = ReduceLROnPlateau(monitor='loss', patience=3)
es_callback = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

### Train Model

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Dense one-hot encoder for the integer class labels
ohc = OneHotEncoder(sparse=False)

# fit model
np.random.seed(42)
# NOTE(review): this seeds NumPy only; the Keras backend presumably keeps its own
# random state, and the model weights were already initialized when the model was built.
train_features = X_train.values
train_targets = ohc.fit_transform(y_train.values.reshape(-1, 1))
hist = model.fit(train_features, train_targets,
                 verbose=1, epochs=200,
                 callbacks=[lr_dec, es_callback])

## Check Validation Score


In [None]:
# Encode the validation labels with the encoder already fitted on the training labels.
# Refitting it here would silently change the column layout if the validation split
# ever lacked a class, misaligning the targets with the model's outputs.
model.evaluate(x=X_val.values,y=ohc.transform(y_val.values.reshape(-1,1)))

## Create Submission

In [None]:
# Work on a copy so the sample-submission frame stored in data_dict is not modified.
sub_df = data_dict['sample_submission'].copy()
# Take the ids straight from the test set; X_test stays a pure feature matrix, so it
# can be passed to the model again without first dropping an extra column.
sub_df['id'] = data_dict['test']['id']
class_cols = [x for x in sub_df.columns if x != 'id']
pred = model.predict(X_test.values)
# NOTE(review): assumes the sample-submission class columns are in the same order as
# the encoded labels (target_le.classes_) — confirm.
sub_df[class_cols] = pred
sub_df.head()

In [None]:
# Preview the submission with `id` moved from a column to the index
submission_preview = sub_df.set_index('id')
submission_preview.head()

In [None]:
# write submission to file
# `id` becomes the index so it is written as the leading column of the CSV
submission = sub_df.set_index('id')
submission.to_csv('submission.csv')