### SVC Model + PCA + Identifying the targets on which the models perform poorly

### 1. Environment + Setup

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import KFold
from sklearn import metrics



from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor

from joblib import dump, load
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, brier_score_loss, precision_score, recall_score, f1_score
from datetime import date
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### 2. Data Preparation

In [None]:
# Columns that are parsed as pandas categoricals when the feature files are read.
train_features_dtypes = dict(cp_type="category", cp_dose="category")

In [None]:
# Load the training features; cp_type and cp_dose are read as categoricals.
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv',dtype = train_features_dtypes)

In [None]:
# Flag these rows as training data so they can be separated again after the
# train and test frames have been combined.
train_features['train'] = 1

In [None]:
# Load the test features with the same categorical dtypes as the training file.
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv',dtype = train_features_dtypes)

In [None]:
# Flag these rows as test data (train == 0).
test_features['train'] = 0

In [None]:
# Stack the train and test rows into one frame so the categorical columns can
# be encoded in a single pass.
temp = pd.concat([train_features,test_features],axis = 0)

In [None]:
# Remove the separate frames; they are rebuilt from `temp` further down.
del train_features,test_features

In [None]:
# (rows, columns) of the combined train + test frame
temp.shape

In [None]:
# Replace each categorical feature with small integer codes.
# The column is re-cast to "category" first: pd.concat falls back to object
# dtype when the train and test files do not share an identical category set,
# and the .cat accessor would then raise. The re-cast also lets this cell be
# run a second time without failing on the already-encoded integer column.
for col, col_dtype in train_features_dtypes.items():
    if col_dtype == "category":
        codes = temp[col].astype("category").cat.codes.astype("int16")
        # shift so the smallest code is 0 (pandas codes missing values as -1)
        temp[col] = codes - codes.min()

In [None]:
# Separate the combined frame back into train and test rows using the flag
# column, taking independent copies before the combined frame is discarded.
train_row_mask = temp['train'].eq(1)
test_row_mask = temp['train'].eq(0)
train_features = temp.loc[train_row_mask].copy()
test_features = temp.loc[test_row_mask].copy()
del temp

In [None]:
# The 'train' flag was only needed to split the combined frame; drop it.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train_features = train_features.drop(columns='train', errors='ignore')
test_features = test_features.drop(columns='train', errors='ignore')

In [None]:
# (rows, columns) of the test features
test_features.shape

In [None]:
# (rows, columns) of the training features
train_features.shape

In [None]:
# Preview the first rows of the encoded training features
train_features.head()

In [None]:
# Inspect the cp_type column (dtype and non-null count) after encoding
train_features[['cp_type']].info()

In [None]:
# Load the scored targets; they are joined to the features on sig_id below.
train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')

In [None]:
# Show every column name of the targets file as a plain Python list
train_targets_scored.columns.tolist()

In [None]:
# (rows, columns) of the targets file
train_targets_scored.shape

In [None]:
# Preview the first rows of the targets
train_targets_scored.head()

In [None]:
# Join features and targets row by row on the shared sig_id key, keeping only
# ids present in both frames.
df = train_features.merge(train_targets_scored, on='sig_id', how='inner')

In [None]:
# (rows, columns) after joining features and targets
df.shape

In [None]:
# Load the sample submission; it is used below as the template for the
# submission frame.
sample_submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [None]:
# (rows, columns) of the sample submission
sample_submission.shape

In [None]:
# Preview the first rows of the sample submission
sample_submission.head()

In [None]:
# Start the submission from the sample file's layout (first column is the id,
# the remaining columns are predictions) with every prediction column zeroed,
# so that per-fold test predictions can be accumulated into it.
# The zeros are assigned through the DataFrame rather than written into
# `.values`, which is not guaranteed to be a writable view of the frame's data
# (it is read-only when pandas Copy-on-Write is active).
submission = sample_submission.copy()
prediction_columns = list(submission.columns[1:])
submission[prediction_columns] = 0.0


In [None]:
# Feature columns: every column of the training features except the row id
X_cols = train_features.columns.drop('sig_id').tolist()

In [None]:
# Show the feature column names (append [:5] to preview only the first few)
X_cols#[:5]

In [None]:
# Target columns: every column of the targets file except the row id
y_cols = train_targets_scored.columns.drop('sig_id').tolist()

### 3. Model



SVM is a margin-based classifier that seeks the separating hyperplane with the largest margin between the two classes. The model is defined by its support vectors, which are the data points that lie on or relatively close to the margin, and these support vectors determine the position and orientation of the hyperplane.

The main hyperparameters are the kernel used by the SVM and the regularization parameter "C", which is inversely proportional to the regularization strength. The bigger C is, the less regularization there is and the narrower the margin, which in turn tends to give fewer support vectors. Conversely, the smaller C is, the more regularization there is, and the model seeks a wider margin even if some points in the training data are misclassified. Ideally both would be tuned with a grid search; in this notebook they are fixed (scikit-learn's default kernel and C = 0.1, as set in the model definition below).


In [None]:
# Number of cross-validation folds; also the divisor used when the per-fold
# test predictions are averaged into the submission.
nfolds=5

In [None]:
# Prepare the cross-validation splitter.
# NOTE(review): no shuffle / random_state is passed, so KFold runs with its
# defaults — confirm the row order of the data is suitable for that.
kf = KFold(n_splits = nfolds)

In [None]:
# Base model definition through an sklearn Pipeline: PCA down to 300
# components, followed by an SVC that can output class probabilities.
# A fixed random_state is passed to both steps so that repeated runs give the
# same results (PCA may pick a randomized solver, and SVC shuffles the data
# internally when probability=True).
SEED = 42
pca = PCA(n_components = 300, random_state = SEED)
svm0 = SVC(C = 0.1, probability = True, random_state = SEED)

base_model = Pipeline(steps=[('pca', pca), ('svm', svm0)])

# MultiOutputClassifier fits one copy of the pipeline per target column;
# n_jobs=-1 lets those fits run in parallel.
mo_base = MultiOutputClassifier(base_model, n_jobs=-1)

In [None]:
# Training feature matrix
xtrain = df[X_cols]#.head(1000)  set to small value for testing code

In [None]:
# (rows, feature columns) of the training matrix
xtrain.shape

In [None]:
# Training target matrix, one column per target
ytrain = df[y_cols]#.head(1000) set to small value for testing code

In [None]:
# Test feature matrix, using the same columns in the same order as xtrain
xtest = test_features[X_cols]

In [None]:
# Storage for the out-of-fold (OOF) predictions: same shape as ytrain, i.e.
# one row per training sample and one column per target.
prval = np.zeros(ytrain.shape)


In [None]:
# (rows, targets) of the OOF prediction matrix
prval.shape

In [None]:
# K-fold cross-validation.
# For every fold: fit the multi-output model on the training split, store the
# out-of-fold (OOF) positive-class probabilities in `prval`, and add this
# fold's test-set probabilities, divided by nfolds, to `submission`.

# Reset the accumulator so that re-running this cell does not add a second
# round of fold predictions on top of the first.
submission[y_cols] = 0.0

for ff, (id0, id1) in enumerate(kf.split(xtrain)):

    # kf.split yields integer positions, so rows are selected with .iloc
    # (label-based .loc only works while the index happens to be 0..n-1)
    x0, x1 = xtrain.iloc[id0], xtrain.iloc[id1]
    # np.array makes a copy, so the patch below never touches ytrain itself
    y0 = np.array(ytrain.iloc[id0])

    # fix for empty columns: a target with no positive label in this training
    # split (assuming 0/1 labels) would give its classifier a single class,
    # so the first row is marked positive for those targets
    check_for_empty_cols = np.where(y0.sum(axis = 0) == 0)[0]
    if len(check_for_empty_cols):
        y0[0, check_for_empty_cols] = 1

    # fit model
    mo_base.fit(x0, y0)

    # predictions: for a MultiOutputClassifier, predict_proba returns a list
    # with one (n_rows, n_classes) array per target
    prv = mo_base.predict_proba(x1)
    prf = mo_base.predict_proba(xtest)

    # keep the positive-class column of every target and place the targets
    # side by side -> (n_rows, n_targets); no hard-coded target count
    prv_pos = np.column_stack([target_proba[:, 1] for target_proba in prv])
    prf_pos = np.column_stack([target_proba[:, 1] for target_proba in prf])

    # OOF predictions for the validation rows of this fold
    prval[id1, :] = prv_pos

    # accumulate the fold-averaged test prediction
    # NOTE(review): assumes the rows of `submission` are in the same order as
    # the rows of `xtest` — confirm against sig_id
    submission[y_cols] = submission[y_cols].to_numpy() + prf_pos / nfolds
    

In [None]:
submission.shape

In [None]:
submission.head()

### 4. Evaluation

#### 4.1 Overall performance

In [None]:
def log_loss_metric(y_true, y_pred):
    """Mean binary log loss over a 2-D block of labels and predictions.

    Predictions are clipped to [1e-15, 1 - 1e-15] so the logarithms stay
    finite. The element-wise loss is averaged across axis 1 (per row) and the
    row values are then averaged into a single number.

    Parameters
    ----------
    y_true : 2-D array-like of 0/1 labels.
    y_pred : 2-D array-like of predicted probabilities, same shape as y_true.

    Returns
    -------
    The averaged log loss as a scalar.
    """
    eps = 1e-15
    clipped = np.clip(y_pred, eps, 1 - eps)
    positive_part = y_true * np.log(clipped)
    negative_part = (1 - y_true) * np.log(1 - clipped)
    per_row = np.mean(positive_part + negative_part, axis = 1)
    return - np.mean(per_row)

In [None]:
# Overall model performance: log loss of the out-of-fold predictions
oof_score = log_loss_metric(ytrain, prval)
print(f'Model OOF Metric: {oof_score}')



#### 4.2 Model Weaknesses

Where could we focus efforts to improve the models in the next iteration?

In [None]:
# Wrap the OOF prediction matrix in a DataFrame for per-target analysis
prval_df = pd.DataFrame(prval)

In [None]:
# Label the OOF prediction columns with the target names
prval_df.columns = y_cols

In [None]:
# Preview the first rows of the OOF predictions
prval_df.head()

In [None]:
def log_loss_metric_ind(y_true, y_pred):
    """Mean binary log loss for one set of labels and predictions.

    Predictions are clipped to [1e-15, 1 - 1e-15] so the logarithms stay
    finite, and the element-wise loss is averaged over all elements.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of predicted probabilities, same shape as y_true.

    Returns
    -------
    The averaged log loss as a scalar.
    """
    eps = 1e-15
    clipped = np.clip(y_pred, eps, 1 - eps)
    elementwise = y_true * np.log(clipped) + (1 - y_true) * np.log(1 - clipped)
    # the outer mean reduces whatever the inner, axis-less mean returns
    averaged = np.mean(np.mean(elementwise))
    return - averaged

In [None]:
# Per-target OOF log loss, used to highlight the worst performing models
perf_check = [
    (target, log_loss_metric_ind(ytrain[target], prval_df[target]))
    for target in y_cols
]

In [None]:
# One row per target, built from the (target name, log loss) pairs
results = pd.DataFrame(perf_check)

In [None]:
# Name the two columns of the per-target results table
results.columns = ['target','log_loss']

In [None]:
# Worst performing models: the 20 targets with the highest OOF log loss
results.sort_values('log_loss',ascending=False).head(20)

In [None]:
# Best performing models: the 20 targets with the lowest OOF log loss
results.sort_values('log_loss',ascending=True).head(20)

### 5. Submission

In [None]:
# (rows, columns) of the submission about to be written
submission.shape

In [None]:
# Preview the first rows of the submission about to be written
submission.head()

In [None]:

# Write the submission frame to submission.csv, without the DataFrame index
submission.to_csv('submission.csv', index=False)