* This notebook explores the dataset.
* It then builds a random model and measures its performance.

# EDA

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three training tables from the `lish-moa` input directory.
df1 = pd.read_csv("../input/lish-moa/train_features.csv")
df2 = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
df3 = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")

# Report the (rows, columns) shape of each table.
for csv_name, frame in (
    ("train_features.csv", df1),
    ("train_targets_scored.csv", df2),
    ("train_targets_nonscored.csv", df3),
):
    print("shape of {}: ".format(csv_name), frame.shape)

# The row count of the feature table is reported as the number of datapoints.
print("Total number of datapoints: {:,}".format(len(df1)))

df1.head()

* There are **876** columns in `train_features.csv` file and out of these there are **875** features. 
* There are **206** labels that we need to predict for each datapoint (`train_targets_scored.csv` has 207 columns, the first of which is `sig_id`).
* Most of the features look real-valued (continuous) and a few of them look categorical. We'll look at them in detail.

In [None]:
# Summary statistics of the feature table (pandas `describe` defaults).
df1.describe()

## Data-type of features 

In [None]:
# Group the columns of `train_features.csv` by their pandas dtype.
data_types = df1.dtypes
unique_dtypes = data_types.unique()
print("number of dtypes in `train_features.csv`: ", len(unique_dtypes),
      "\nAnd these are: ", unique_dtypes)

# One list of column names per dtype. A column whose dtype is none of these
# three ends up in no list and makes the assertion below fail.
Obj = [name for name, kind in data_types.items() if kind == 'object']
Int = [name for name, kind in data_types.items() if kind == 'int64']
Float = [name for name, kind in data_types.items() if kind == 'float64']

for label, group in (("object", Obj), ("int64", Int), ("float64", Float)):
    print("number of {} data type: ".format(label), len(group))

# Sanity check: the three groups together account for every column.
assert len(Obj) + len(Int) + len(Float) == df1.shape[1]

In [None]:
# Display the names of the object-typed and int64-typed columns.
Obj, Int

* There is only one `int64` column, and there are `3` columns of `object` datatype.
* `cp_dose` and `cp_type` are categorical variables.
* `cp_time` has the `int64` datatype, but it is also a categorical variable.
* Let's see the number of unique values in these columns.

In [None]:
# For each categorical column, print how many distinct values it holds and
# what those values are.
for col in ("cp_type", "cp_dose", "cp_time"):
    distinct = df1.loc[:, col].unique()
    print("Number of unique values in `{}` col is: {} and these are: {}"
          .format(col, len(distinct), distinct))

* Binary (0/1) encoding for `cp_type` and `cp_dose`.
* `D1` == `0`, `D2`==`1`.
* `trt_cp` == `0`, `ctl_vehicle` == `1`.
* Label encoding for `cp_time`: `1` for 24, `2` for 48 and `3` for 72.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# One count plot per categorical column, side by side in a single figure.
fig, axs = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle("Count Plot of Categorical variables", fontsize=24)

for idx, column in enumerate(("cp_type", "cp_dose", "cp_time")):
    panel = axs[idx]
    sns.countplot(x=column, data=df1, ax=panel)
    panel.set_title("For: {}".format(column), fontsize=16)
    # Only the left-most panel keeps its y-axis label.
    if idx > 0:
        panel.set(ylabel='')

plt.show()

* For both variables `cp_dose` and `cp_time`, the counts are almost the same for all unique values present in their columns.
* But for `cp_type` there are very few data points that correspond to `ctl_vehicle`.

**NOTE:** Use stratified splitting on the `cp_type` column. This will maintain the count ratio of `cp_type`'s unique values.  

## Encoding

In [None]:
# cp_type, cp_time, cp_dose
def encode_cp_time(row):
    """Map a single `cp_time` value to an integer code.

    24 is encoded as 1, 48 as 2, and every other value as 3.
    """
    if row == 24:
        return 1
    if row == 48:
        return 2
    return 3
        
# Encode the categorical columns in place.
# `cp_type`: 'trt_cp' -> 0, anything else -> 1.
# `cp_dose`: 'D1' -> 0, anything else -> 1.
# `cp_time`: integer code from `encode_cp_time`.
# NOTE: the columns are overwritten, so re-running this cell on the
# already-encoded frame sends every value to the fallback code.
df1["cp_type"] = (df1["cp_type"] != 'trt_cp').astype('int64')
df1["cp_dose"] = (df1["cp_dose"] != 'D1').astype('int64')
df1["cp_time"] = df1["cp_time"].map(encode_cp_time)

df1.head()

## PCA & Correlation

* Let's normalise the features, first

In [None]:
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA

### PCA

In [None]:
# Feature matrix: every column of df1 except the first one.
X = df1.iloc[:, 1:]

# Standardise the features and wrap the resulting array back into a DataFrame
# that carries the original column names.
scaler = StandardScaler()
x = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)
x.head()

* There are `875` features, first let's reduce the dimension using **PCA**.

In [None]:
# Fit a PCA that keeps all components on the standardised features.
# Only the fitted variance ratios are used afterwards, so `fit` is enough;
# `fit_transform` would also compute the projected data, which was discarded.
pca = PCA(n_components=None, svd_solver='full')
pca.fit(x)
# Fraction of the total variance explained by each principal component.
var = pca.explained_variance_ratio_

In [None]:
# Share of variance captured by the first 100, 200, ..., 800 components.
for n_top in range(100, 900, 100):
    captured = var[:n_top].sum()
    print("Variance explained by top {} components: {}".format(n_top, captured))

* I would always prefer a high explained variance (between 95-99%). But for that, I have to take at least 600 components.
* 800 components explain approximately 99.41% of the variance.
* Now, one can select 100, 200, 300, ... or any number of components and see the performance of the model.

### Correlation

**Source:** https://www.geeksforgeeks.org/exploring-correlation-in-python/

In [None]:
# Pairwise column correlations of X (the encoded, unscaled features),
# computed with the `DataFrame.corr` defaults.
corrmat = X.corr()

In [None]:
# Heatmap of the full feature-by-feature correlation matrix.
f, ax = plt.subplots(figsize=(18, 8))
sns.heatmap(corrmat, ax=ax)
plt.show()

* From the above plot, it can be seen that the features that start with `c-` look highly correlated with each other. They also show strong correlation with other features.
* Let's see the heatmap of features that starts with `c-`.

In [None]:
# Restrict the heatmap to the columns whose name starts with "c-".
cols = list(filter(lambda name: name.startswith('c-'), corrmat.columns))
c_block = corrmat.loc[cols, cols]

f, ax = plt.subplots(figsize=(18, 8))
sns.heatmap(c_block, ax=ax)
plt.show()

* Most of the features that start with `c-` are highly correlated.

**NOTE:** If you're using all features, use a tree-based model, or a high dropout rate in the first layer (just after the input) of a neural-network architecture.

# Random Model

* This will predict a random value between 0 and 1 (i.e. 0 <= prob < 1) for each label of each datapoint.
* We'll use it to measure the performance of a worst-case (random) baseline model.

In [None]:
from sklearn.metrics import log_loss
def total_loss(y_true, y_pred):
    """
    Mean column-wise binary log loss.

    The log loss is computed separately for every label (column), averaged
    over the samples, and the per-label losses are then averaged.

    y_true: array-like of shape (None, n_labels) holding 0/1 labels,
            None means any number of rows
    y_pred: array-like of the same shape holding predicted probabilities

    Returns the mean of the per-label log losses as a float.
    Raises ValueError when the two inputs differ in shape.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}"
            .format(y_true.shape, y_pred.shape)
        )

    # Clip so that predictions of exactly 0 or 1 give a finite loss. This is
    # done here instead of through sklearn's `eps=` keyword, which newer
    # scikit-learn releases no longer accept.
    eps = 1e-15
    prob = np.clip(y_pred, eps, 1 - eps)

    # Computing the loss directly also works for a label column that contains
    # a single class, where a per-column `log_loss` call raises.
    per_label = -(y_true * np.log(prob)
                  + (1 - y_true) * np.log(1 - prob)).mean(axis=0)
    return np.mean(per_label)

In [None]:
# Load the test features; only the row count and `sig_id` are used below.
df_test = pd.read_csv("../input/lish-moa/test_features.csv")
print("number of test datapoints: {:,}".format(len(df_test)))

# True training labels: every column of df2 except the first one.
y_train = df2.iloc[:, 1:].values
n_labels = y_train.shape[1]

# Uniform random "probabilities" for every (datapoint, label) pair.
y_train_pred = np.random.random_sample(y_train.shape)
y_test_pred = np.random.random_sample((len(df_test), n_labels))

In [None]:
# Score the random predictions against the true training labels.
tr_loss = total_loss(y_train, y_train_pred)
print("train loss: ", tr_loss)

In [None]:
# Assemble the submission frame: `sig_id` first, followed by one column of
# random predictions per label (column names taken from df2).
temp = df_test.loc[:, 'sig_id'].to_frame()
predictions = pd.DataFrame(data=y_test_pred, columns=df2.columns[1:])
test_df = pd.concat([temp, predictions], axis=1, ignore_index=False)
test_df.head()

In [None]:
# Write the submission file to the current directory, without the index.
test_df.to_csv("./submission.csv",index=False)

**Splitting the data**

```Python
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
# Join features and scored targets on `sig_id` so each row carries both.
df_train = df1.merge(df2, on='sig_id')
print("shape: ", df_train.shape)
df_train.head()

# Hold out 20% for validation, stratified on `cp_type` so both splits keep
# the same ratio of its values.
train, val = train_test_split(df_train, test_size=0.2, random_state=42,
                              stratify=df_train["cp_type"])

# The target columns are the trailing ones contributed by df2, i.e. all of
# its columns except the `sig_id` merge key.
n_targets = df2.shape[1] - 1

# Slice the `train` split, not `df_train`: slicing the full frame would put
# the validation rows into the training set.
X_train, Y_train = train.iloc[:, :-n_targets], train.iloc[:, -n_targets:]
X_val, Y_val = val.iloc[:, :-n_targets], val.iloc[:, -n_targets:]

print("Number of datapoints in train-set: {:,}".format(len(X_train)))
print("Number of datapoints in val-set: {:,}".format(len(X_val)))
```