# MoA TextBook Sklearn

 - This notebook is meant to be as simple as possible, using Sklearn in the most basic way possible.
 - Focus on Random Forest
 
Happy modeling!

# Imports

In [None]:
# standard imports
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
# modeling
from sklearn.ensemble import RandomForestClassifier
import scipy.stats.distributions as dists
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
#viz
import matplotlib.pyplot as plt
%matplotlib inline

# Load & Review Data

Let's load and review our training data ...

In [None]:
# Read the training feature table from the competition input folder and
# preview its first rows.
train_features_path = "../input/lish-moa/train_features.csv"
df = pd.read_csv(train_features_path)
df.head()

Let's see how many of our features are non-numeric.

In [None]:
# Build a boolean mask that is True for every column whose dtype is not
# float64, then show the names of those columns.
is_not_float64 = np.array(
    [col_dtype != np.dtype('float64') for col_dtype in df.dtypes],
    dtype=bool,
)
df.columns[is_not_float64]

Everything appears to be roughly a numeric column except cp_type and cp_dose.  Let's review these features to determine how they should be handled.

In [None]:
# Count the sig_id entries that fall under each distinct cp_type value.
df_cp_type = df.groupby('cp_type')[['sig_id']].count().reset_index()
df_cp_type.head()

Cp_type only has 2 values, so we can easily one-hot encode this.

In [None]:
# Count the sig_id entries that fall under each distinct cp_dose value.
df_cp_dose = df.groupby('cp_dose')[['sig_id']].count().reset_index()
df_cp_dose.head()

Again cp_dose only has 2 values, so we can easily one-hot encode this as well.

Let's get a look at the size of our training data.

In [None]:
# (rows, columns) of the training feature table
train_shape = df.shape
print("training dataset size: ", train_shape)

All of our targets are in a separate file, so let's get a look at our targets data.

In [None]:
# Read the scored training targets (kept in a separate file from the
# features) and preview the first rows.
train_targets_path = "../input/lish-moa/train_targets_scored.csv"
targets = pd.read_csv(train_targets_path)
targets.head()

Let's review how large our targets data is.

In [None]:
# (rows, columns) of the targets table
targets_shape = targets.shape
print("training dataset target size: ", targets_shape)

From looking at the top of our datasets, looks like the ids in the training set and targets dataset are sorted by id.  Let's check this.

In [None]:
# Compare the sig_id columns of the feature and target tables position by
# position.  A plain zip() stops at the shorter table and would hide a
# length mismatch, so any surplus rows are counted as mismatches as well.
train_ids = df['sig_id'].to_numpy()
target_ids = targets['sig_id'].to_numpy()
n_common = min(len(train_ids), len(target_ids))
not_same = int((train_ids[:n_common] != target_ids[:n_common]).sum())
not_same += abs(len(train_ids) - len(target_ids))
print("if ids in training set and target data are ordered the same, the following value should be 0:")
print(not_same)

They are already sorted by id!  This will make things a lot easier as we move along.

# Feature Engineering

Let's handle those non-numeric columns.

In [None]:
# Replace the two categorical columns with 0/1 indicator columns:
#   ctl_vehicle -> 1 where cp_type equals 'ctl_vehicle', otherwise 0
#   D1          -> 1 where cp_dose equals 'D1', otherwise 0
# The indicators are appended first and the source columns dropped afterwards,
# so the resulting column order ends with ctl_vehicle, D1.
df['ctl_vehicle'] = (df['cp_type'] == 'ctl_vehicle').astype('int64')
df['D1'] = (df['cp_dose'] == 'D1').astype('int64')
df.drop(columns=['cp_type', 'cp_dose'], inplace=True)
df.head()

### Get Final Feature & Target Names

Okay, now let's store all of our feature names into a list, and all of our target names into a list.  We will use this later on.

In [None]:
# Every column of the feature table except the row identifier is a feature.
features_list = list(df.columns)
features_list.remove('sig_id')

# Every column of the targets table except the row identifier is a target.
target_list = list(targets.columns)
target_list.remove('sig_id')

print("total features: ",len(features_list))
print("total target categories: ",len(target_list))

# View Target Distribution

If we want to model our target, we should probably understand it.  Let's get a look at the number of values per class.

In [None]:
# Collapse the target matrix into one integer label per row: the column
# position of the first maximal entry in that row.
# NOTE(review): np.argmax returns 0 for a row that is all zeros, so label 0
# mixes rows with no active target and rows whose first active target is
# column 0 -- confirm this is intended.
target_matrix = targets[target_list].to_numpy()
labels = target_matrix.argmax(axis=1)
# 'total' is a constant helper column that gets counted per label later on
ldf = pd.DataFrame({'label': labels, 'total': 1})
ldf.head()

In [None]:
# count the rows recorded for each label to get the number of observations per label
ldfgrp = ldf.groupby('label')[['total']].count().reset_index()
ldfgrp.head()

In [None]:
# bar chart: one bar per label, bar height = number of observations
plt.bar(x=ldfgrp['label'], height=ldfgrp['total'])
plt.show()

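Lots of zeros, and then we have a lot of other values that seem roughly evenly distributed.  Note that `np.argmax` returns 0 for any row where no target is active, so the 0 bar contains every sample without a positive label in addition to samples labelled with the first target.  Let's remove the 0 class and see how the rest of the classes look.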

In [None]:
# drop the rows for label 0 and redraw the bar chart for the remaining labels
ldfgrpf = ldfgrp.loc[ldfgrp['label'] != 0]
plt.bar(x=ldfgrpf['label'], height=ldfgrpf['total'])
plt.show()

With 0 group removed, seems pretty evenly distributed, roughly at least (i.e. no more groups that stand out).

Do all of our classes have sufficient data?  Let's take a look.

In [None]:
# order the per-label counts from rarest to most frequent
ldfgrp = ldfgrp.sort_values('total', ascending=True)
ldfgrp.head()

These classes are going to be very hard to predict :(.  We'll have to figure out a way to handle these low frequency classes.

In [None]:
# order the per-label counts from most frequent to rarest and show the top 25
ldfgrp = ldfgrp.sort_values('total', ascending=False)
ldfgrp.head(n=25)

We can't build a model for all classes because some classes have VERY low frequency.  We can build a model for the top 25 classes (shown above), and for the rest we can just submit the mean value from training set.

Let's save the mean predictions.

In [None]:
# Average each target column over the training rows.  Only the columns named
# in target_list are selected: the targets table also carries the 'sig_id'
# identifier column, and averaging the whole frame raises a TypeError on
# pandas versions that no longer silently drop non-numeric columns.
mean_predictions = targets[target_list].mean()
mean_predictions.iloc[:5]

Let's take a look at our original 'targets' dataframe again.  We need to filter this down to the top 25 columns with sufficient data.

In [None]:
# preview only the target columns (identifier column left out)
targets.loc[:, target_list].head()

Let's grab the top 25 classes, which we can use later on.

In [None]:
# the first 25 label ids of ldfgrp in its current row order
ldfgrp['label'].to_numpy()[:25]

In [None]:
# positions (within target_list) of the first 25 labels in ldfgrp
top_labels = ldfgrp['label'].head(25)
new_target_idxs = top_labels.tolist()
# translate those positions into target column names
new_targets = [target_list[position] for position in new_target_idxs]
new_targets[:5]

In [None]:
# preview the targets table restricted to the selected columns
targets.loc[:, new_targets].head()

# Create Holdout

Let's split our data between train and holdout, so we can test our model later on.

In [None]:
# Encode each row's target as the position (within new_targets) of its first
# maximal entry, then set aside half of the rows as a holdout set.
# NOTE(review): a row with none of the new_targets active is encoded as 0 by
# argmax -- confirm this is intended.
y_all = targets[new_targets].to_numpy().argmax(axis=1)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df[features_list],
    y_all,
    test_size=0.5,
    random_state=42,
)
print("training data size: ", X_train.shape)
print("training data target size: ", y_train.shape)
print("holdout data size: ", X_holdout.shape)
print("holdout data target size: ", y_holdout.shape)

# Modeling

So we're going to run Random Search CV on a Random Forest model.  If we use the entire dataset, this will take a very long time.  For demo purposes, we're going to downsample the training dataset.

In [None]:
# downsampling: test_size=0.8 is the share that is thrown away, so 20% of the
# current training split is kept
X_train, _, y_train, _ = train_test_split(
    X_train,
    y_train,
    test_size=0.8,
    random_state=0,
)

We're going to run 2 fold cross validation for 4 iterations.  It would be better to run say 3 fold for 30 iterations, but that would take a long time, and so we reduce runtime by running fewer folds and iterations.  

In [None]:
# Base estimator for the search.  random_state is fixed so that the forest
# itself is reproducible; the search's own random_state only controls which
# parameter combinations are sampled, not the trees that get built.
clf = RandomForestClassifier(n_jobs=-1, random_state=0)

# Sampling distributions for the hyper-parameters explored by the search:
#   n_estimators - integer drawn from randint(4, 1000)
#   max_depth    - integer drawn from randint(1, 30)
#   max_features - fraction drawn from uniform(loc=0.05, scale=0.95),
#                  i.e. between 0.05 and 1.0
distributions = dict(n_estimators = dists.randint(4,1000),
                     max_depth = dists.randint(1,30),
                     max_features=dists.uniform(loc=0.05,scale=0.95))

# 4 sampled parameter settings, each evaluated with 2-fold cross validation
# and scored with the weighted one-vs-rest ROC AUC.
search = RandomizedSearchCV(estimator=clf,
                           param_distributions=distributions,
                           verbose=1,
                           cv=2,
                           n_iter=4,
                           n_jobs=-1,
                           scoring='roc_auc_ovr_weighted',
                           random_state=0)

In [None]:
%%time

# Run the randomized hyper-parameter search on X_train / y_train and keep the
# fitted search object for inspection.
search_results = search.fit(X_train,y_train)

In [None]:
# report the best cross-validated score and the parameters that produced it
best_score = search_results.best_score_
best_params = search_results.best_params_
print("Best: %f using %s" % (best_score, best_params))

If we want, we can grab the best model as follows.  We can save this model as a pickle file as well, but we don't show that here.

In [None]:
# Keep the estimator that the search exposes as its best one.
model = search_results.best_estimator_

# Test Model on Holdout

Let's look at the performance of the model on holdout using the cohen kappa metric.

In [None]:
# predict_proba returns one column per entry of model.classes_, so the
# arg-max column index has to be mapped back through classes_ to obtain the
# actual class label.  Using the raw column index is only correct when the
# classes seen during fitting happen to be exactly 0..k-1.
holdout_proba = model.predict_proba(X_holdout)
pred = model.classes_[np.argmax(holdout_proba, axis=1)]
print("holdout score: ", metrics.cohen_kappa_score(pred,y_holdout))

# Submission

Okay, it's time to make a submission.  We need to load the test data.

In [None]:
# Read the test feature table and preview the first rows.
test_features_path = "../input/lish-moa/test_features.csv"
test = pd.read_csv(test_features_path)
test.head()

We also need to add our additional features to the test data.

In [None]:
# Replace the two categorical columns of the test table with 0/1 indicators:
#   ctl_vehicle -> 1 where cp_type equals 'ctl_vehicle', otherwise 0
#   D1          -> 1 where cp_dose equals 'D1', otherwise 0
# The indicators are appended first and the source columns dropped afterwards,
# so the resulting column order ends with ctl_vehicle, D1.
test['ctl_vehicle'] = (test['cp_type'] == 'ctl_vehicle').astype('int64')
test['D1'] = (test['cp_dose'] == 'D1').astype('int64')
test.drop(columns=['cp_type', 'cp_dose'], inplace=True)
test.head()

We can load our submission file as follows.

In [None]:
# Read the sample submission file and preview the first rows.
sample_submission_path = "../input/lish-moa/sample_submission.csv"
sub = pd.read_csv(sample_submission_path)
sub.head()

Looks like the submission and test datasets are similarly sorted by id. Let's check this.

In [None]:
# Compare the sig_id columns of the test table and the sample submission
# position by position.  A plain zip() stops at the shorter table and would
# hide a length mismatch, so any surplus rows are counted as mismatches too.
test_ids = test['sig_id'].to_numpy()
sub_ids = sub['sig_id'].to_numpy()
n_common = min(len(test_ids), len(sub_ids))
not_same = int((test_ids[:n_common] != sub_ids[:n_common]).sum())
not_same += abs(len(test_ids) - len(sub_ids))
# The message names the two tables that are actually compared here.
print("if ids in test set and sample submission are ordered the same, the following value should be 0:")
print(not_same)

They are sorted the same!  This is going to make things a lot easier!

Now we need to make predictions on our test dataset using our model.

In [None]:
# class probabilities for every test row, using the same feature columns the
# model was given
test_features = test[features_list]
pred = model.predict_proba(test_features)
print("prediction output shape: ", pred.shape)

Now, we want to add our predictions to sample submission. First we add the mean values.

In [None]:
# independent copy of the submission restricted to the target columns
new_sub = sub.loc[:, target_list].copy()
new_sub.head()

In [None]:
# Set every row to the per-target average from the training data.
# The averages are first aligned to new_sub's column labels (the same
# label-based alignment a row-wise Series assignment performs), then written
# to all rows in a single assignment instead of one .loc call per row.
mean_row = mean_predictions.reindex(new_sub.columns).to_numpy(dtype=float)
new_sub.loc[:, :] = np.tile(mean_row, (len(new_sub), 1))
new_sub.head()

Next, we can add our predictions for the classes we modeled.

In [None]:
# predict_proba yields one column per class the model was fitted on
# (model.classes_), and each class value is a position in new_target_idxs.
# Mapping through classes_ keeps columns aligned even if a class was absent
# from the training sample; targets without a model column keep the value
# they already have in new_sub.
modeled_cols = [new_target_idxs[int(cls)] for cls in model.classes_]
new_sub.iloc[:, modeled_cols] = pred

Finally, we save our submission to disk.

In [None]:
# reload submissions file and add predictions (this way we have predictions and sig_id field in final submission file)
sub = pd.read_csv("../input/lish-moa/sample_submission.csv")
# Assign by column name rather than by position, so the result does not
# depend on sig_id being the first column or on the submission columns being
# in the same order as target_list.
sub[target_list] = new_sub[target_list].to_numpy()

In [None]:
# write the final submission (without the row index) and preview it
sub.to_csv(path_or_buf="submission.csv", index=False)
sub.head()

# The End

Thanks for reading, and hope you enjoyed!