# Import Library

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy import stats
import matplotlib.style as style
style.use('fivethirtyeight')

type_colors = sns.color_palette("hls", 16)

# Read and Describe the Dataset

In [None]:
# Read the Dataset

# Directory that holds the competition CSV files; change it here only.
DATA_DIR = "/kaggle/input/lish-moa"

data_train = pd.read_csv(f"{DATA_DIR}/train_features.csv")
data_test = pd.read_csv(f"{DATA_DIR}/test_features.csv")
target_scored = pd.read_csv(f"{DATA_DIR}/train_targets_scored.csv")
target_nonscored = pd.read_csv(f"{DATA_DIR}/train_targets_nonscored.csv")


In [None]:
# First Five rows

data_train.head()

In [None]:
# Shape of the training data

data_train.shape

In [None]:
# Check for Null

data_train.isna().sum()

In [None]:
# Check if id is unique

data_train.sig_id.nunique()

**The number of unique `sig_id` values equals the number of rows, so every id is unique.**

In [None]:
print('There are  {:} rows in training data.'.format(len(data_train)))

In [None]:
# Describe the training Dataset

data_train.describe()

### We have 873 numerical feature columns and 2 categorical feature columns (cp_type and cp_dose).

# Visualization

In [None]:
def plot_fn(df, feature):
    """Plot a histogram, a probability (QQ) plot and a box plot of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to inspect.
    feature : str
        Label of the column of ``df`` to plot.

    Returns
    -------
    None
        A new matplotlib figure is created as a side effect.

    Notes
    -----
    The figure uses a 3x3 grid: the histogram takes row 0 and the QQ plot
    row 1 of the two left columns, the box plot takes the whole right column.
    Row 2 of the two left columns is left empty.
    """
    values = df.loc[:, feature]

    fig = plt.figure(constrained_layout=True, figsize=(12, 8))
    grid = gridspec.GridSpec(ncols=3, nrows=3, figure=fig)

    # Histogram with a density curve, normalised to unit area.
    ax1 = fig.add_subplot(grid[0, :2])
    if hasattr(sns, "histplot"):
        # distplot is deprecated; histplot is its documented replacement.
        sns.histplot(x=values, kde=True, stat="density",
                     kde_kws=dict(cut=3), ax=ax1)
    else:
        # Older seaborn releases that do not ship histplot yet.
        sns.distplot(values, norm_hist=True, ax=ax1)
    ax1.set_title('Histogram')

    # Probability plot against the normal distribution.
    ax2 = fig.add_subplot(grid[1, :2])
    stats.probplot(values, plot=ax2)
    # probplot writes its own title onto the axes, so ours must be set
    # afterwards or it is overwritten.
    ax2.set_title('QQ_plot')

    # Vertical box plot spanning the full height of the figure.
    ax3 = fig.add_subplot(grid[:, 2])
    sns.boxplot(y=values, ax=ax3)
    ax3.set_title('Box Plot')

## Cell Feature distribution

In [None]:
# c-90 cell
plot_fn(data_train, 'c-90')

In [None]:
# c-0 cell
plot_fn(data_train, 'c-0')

In [None]:
# c-93 cell
plot_fn(data_train, 'c-93')

## Gene Feature distribution

In [None]:
# g-90 gene
plot_fn(data_train, 'g-90')

In [None]:
# g-0 gene
plot_fn(data_train, 'g-0')

In [None]:
# g-93 gene
plot_fn(data_train, 'g-93')

### The distributions of the cell features are nearly the same.
### The distributions of the gene features are nearly the same.

In [None]:
# Bar chart of how many samples fall into each cp_type category.
cp_plot = data_train["cp_type"].value_counts()
ax = cp_plot.plot(
    kind='bar',  # 'bar' draws vertical bars, 'barh' would draw horizontal ones
    figsize=(10, 5),
    title='Category wise Contribution',
    color=type_colors,
)

# Write each count just below the top of its bar.
for position, count in enumerate(cp_plot.values):
    ax.text(position, count - 5, str(count), color="b",
            horizontalalignment='center', verticalalignment='top',
            size=14)

ax.set_xlabel("Group")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

### cp_type has two categories (trt_cp and ctl_vehicle). It is highly imbalanced, as ctl_vehicle has a lower count.

In [None]:
# Bar chart of how many samples fall into each cp_dose category.
cp_dose_plot = data_train["cp_dose"].value_counts()
ax = cp_dose_plot.plot(
    kind='bar',  # 'bar' draws vertical bars, 'barh' would draw horizontal ones
    figsize=(10, 5),
    title='Category wise Contribution',
    color=type_colors,
)

# Write each count just below the top of its bar.
for position, count in enumerate(cp_dose_plot.values):
    ax.text(position, count - 5, str(count), color="b",
            horizontalalignment='center', verticalalignment='top',
            size=14)

ax.set_xlabel("Group")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

### cp_dose has two categories (D1 and D2).

# Let's see the target data

In [None]:
## Target Scored Multi-Label data
target_scored.head()

In [None]:
## Target non-Scored Multi-Label data
target_nonscored.head()

In [None]:
# Number of positive labels per target, smallest first. The first column (the
# id) is dropped before summing so it is never aggregated and the result keeps
# a numeric dtype.
target_scored.iloc[:, 1:].sum().sort_values()

**Some target columns contain very few 1s. Most of the values belong to one class (i.e. 0), so the data is highly imbalanced.**

# Check for Skewness
If **skewness is positive**, the data are positively skewed or **skewed right**, meaning that the right tail of the distribution is longer than the left. If **skewness is negative**, the data are negatively skewed or **skewed left**, meaning that the left tail is longer.


*     If skewness is less than −1 or greater than +1, the distribution is highly skewed.
*     If skewness is between −1 and −½ or between +½ and +1, the distribution is moderately skewed.
*     If skewness is between −½ and +½, the distribution is approximately symmetric.



## Skewness in target_scored

In [None]:
## Skewness in target_scored

# numeric_only=True skips non-numeric columns (presumably the id column);
# pandas >= 2.0 raises a TypeError on them instead of dropping them silently.
target_scored.skew(numeric_only=True).sort_values()

**We can see values are greater than +1 so data is highly skewed.**

In [None]:
# Number of positive labels per non-scored target, smallest first. The first
# column (the id, sig_id) is dropped before summing so it is never aggregated
# and the result keeps a numeric dtype.
target_nonscored.iloc[:, 1:].sum().sort_values()

## Skewness in target_nonscored

In [None]:
## Skewness in target_nonscored

# numeric_only=True skips non-numeric columns (presumably the id column);
# pandas >= 2.0 raises a TypeError on them instead of dropping them silently.
target_nonscored.skew(numeric_only=True).sort_values()

**We can see values are greater than +1 so data is highly skewed.**

## Skewness in training data

In [None]:
# Skewness of the numeric training columns only; the frame also holds
# non-numeric columns, which pandas >= 2.0 refuses to aggregate by default.
data_train.skew(numeric_only=True).sort_values()

**Data is highly skewed as we have already seen in visualization also.**

# Remove the Skewness

## Sigmoid Function

In [None]:
def sig_fn(data):
    """Apply the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    Parameters
    ----------
    data : scalar, numpy.ndarray, pandas.Series or pandas.DataFrame
        Numeric values to transform.

    Returns
    -------
    Same kind of container as ``data`` with every value mapped into [0, 1].
    """
    # Imported locally so the function is self-contained; expit is the
    # numerically stable sigmoid and does not overflow for large negative
    # inputs the way an explicit e**(-x) does.
    from scipy.special import expit

    return expit(data)

## Apply Sigmoid Function on training data
* Get all numerical columns from the Dataset.
* Apply Sigmoid Function and create a new Dataset.
* Find the Skewness on new Dataset

In [None]:
## Get all numerical columns and create new dataset
# `numerics` lists the dtype names treated as numeric; later cells reuse this
# list when they call select_dtypes on other frames.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# Columns of any dtype not listed above are left out of numeri_train.
numeri_train = data_train.select_dtypes(include=numerics)

In [None]:
#numeri_train.head()

In [None]:
# Squash every numeric training value through the sigmoid.
sig_data = numeri_train.pipe(sig_fn)

# Per-column skewness of the transformed data, smallest first.
sig_skewness = sig_data.skew()
sig_skewness.sort_values()

**Values are in the range from -0.809382 to 1.197749 which is an improvement.**

In [None]:
sig_data.head()

# Check the test data

In [None]:
# First Five rows
data_test.head()

In [None]:
# Shape of the test data

data_test.shape

## Skewness

In [None]:
# Skewness of the numeric test columns only; the frame also holds non-numeric
# columns, which pandas >= 2.0 refuses to aggregate by default.
data_test.skew(numeric_only=True).sort_values()

**Data is highly skewed**

In [None]:
# Keep only the numeric columns of the test data.
numeri_test = data_test.select_dtypes(include=numerics)

# Squash every numeric test value through the sigmoid.
sig_data_test = numeri_test.pipe(sig_fn)

# Per-column skewness of the transformed test data, smallest first.
test_skewness = sig_data_test.skew()
test_skewness.sort_values()

**Values are in the range from -0.933437 to 1.201311 which is an improvement.**

# Model Preparation

## Concatenate training and test Dataset

In [None]:
# Align the training columns with the test columns (same columns, same order).
data_train = data_train[list(data_test)]

# Stack train on top of test. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it the row labels of both frames are repeated,
# which makes label-based selection ambiguous. Later cells split the frame
# again by position using len(data_train).
all_data = pd.concat((data_train, data_test), ignore_index=True)
print(data_train.shape, data_test.shape, all_data.shape)

## Apply Dummies on Dataset

In [None]:
# One-hot encode the three treatment columns in a single call. The encoded
# columns are removed and their float indicator columns (prefixed with the
# source column name) are appended at the end, in the order listed here.
all_data = pd.get_dummies(
    all_data,
    columns=['cp_dose', 'cp_time', 'cp_type'],
    dtype=float,
)

In [None]:
# Snapshot the one-hot encoded frame so later cells can start again from it
# after all_data has been modified.
full_data = all_data.copy(deep=True)

# Preview the encoded frame.
all_data.head(5)

## Check Skewness 
Check Skewness on whole dataset (training and test)

In [None]:
## Check Skewness on whole dataset (training and test)
# numeric_only=True skips the remaining non-numeric columns (presumably the
# id column); pandas >= 2.0 raises a TypeError on them by default.
all_data.skew(numeric_only=True).sort_values()

## Remove Skewness 
Remove Skewness on whole dataset (training and test) by following:


*     Apply Sigmoid Function and create a new Dataset.
*     Find the Skewness on new Dataset


In [None]:
# Keep only the numeric columns of the combined train + test frame.
numeri_all = all_data.select_dtypes(include=numerics)

# Squash every numeric value through the sigmoid.
sig_data_all = numeri_all.pipe(sig_fn)

# Per-column skewness of the transformed data, smallest first.
all_skewness = sig_data_all.skew()
all_skewness.sort_values()

In [None]:
sig_data_all.head()

In [None]:
def normalize_fn(data):
    """Min-max scale ``data`` so every column runs from 0 to 1.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Numeric values. A DataFrame is scaled column by column, a Series as
        a whole.

    Returns
    -------
    Object of the same type and shape as ``data`` holding
    ``(data - min) / (max - min)``. A constant column or Series has zero
    range and is mapped to all zeros instead of NaN.
    """
    lower = data.min()
    span = data.max() - lower
    # Avoid 0/0 for constant input: dividing by 1 leaves (data - lower) == 0.
    if np.ndim(span) == 0:
        span = span if span != 0 else 1
    else:
        span = span.where(span != 0, 1)
    return (data - lower) / span
# Min-max scale the sigmoid-transformed data.
data_normalized = sig_data_all.pipe(normalize_fn)

# Per-column skewness after scaling, smallest first.
normalized_skewness = data_normalized.skew()
normalized_skewness.sort_values()

In [None]:
data_normalized.head()

**Normalization does not improve our results.**

In [None]:
# data_log = np.log(numeri_all + 1)
# data_log_normalized = normalize_fn(data_log)
# data_log_normalized.describe()


# Divide by Zero Occurs

In [None]:
# def sig_inf_fn(data):
#     e = np.exp(1)
#     y = 2/(1+e**(-data))
#     return y



## Apply the  function
#sig_data_al = sig_inf_fn(numeri_all)

## Find the Skewness
#sig_data_al.skew().sort_values()
## Plot the Skewness values to check the value range


## Plot the Skewness
Check the range of values of skewness by plotting the graph

In [None]:
plt.plot(sig_data_all.skew())

**We can see skewness lies between [-1, 1] except some feature columns (g-213, cp_type_ctl_vehicle & cp_type_trt_cp).
Let's drop these columns to see the effect.**


In [None]:
# Remove the three columns singled out above, then redo the sigmoid transform
# on the numeric columns that are left.
all_data = all_data.drop(columns=['g-213', 'cp_type_ctl_vehicle', 'cp_type_trt_cp'])
numeri_all = all_data.select_dtypes(include=numerics)
sig_data_all = numeri_all.pipe(sig_fn)

# Plot the skewness of every remaining column.
plt.plot(sig_data_all.skew())

**Result is improved after dropping some columns.**

In [None]:
# Split the combined frame back into its parts by position: the first
# len(data_train) rows are the training rows, the rest are the test rows.
n_train_rows = len(data_train)
Xtrain = all_data.iloc[:n_train_rows]
Xtest = all_data.iloc[n_train_rows:]


In [None]:
# Xtrain is a slice of all_data, which still carries non-numeric columns
# (presumably the id column); pandas >= 2.0 raises on them unless told to
# skip them.
plt.plot(Xtrain.skew(numeric_only=True))

**Even after removing skewness on the whole data, the training and test datasets still have skewness. So let's first separate the data and then remove the skewness from each part.**

## Separate the data

In [None]:
# Split the saved copy of the encoded data by position: the first
# len(data_train) rows are the training rows, the rest are the test rows.
n_train_rows = len(data_train)
Xtrain = full_data.iloc[:n_train_rows]
Xtest = full_data.iloc[n_train_rows:]

## Training Dataset

In [None]:
# Keep only the numeric columns of the training split.
numerical_train = Xtrain.select_dtypes(include=numerics)

# Squash every numeric value through the sigmoid.
sig_data_train = numerical_train.pipe(sig_fn)

# Plot the per-column skewness of the transformed training data.
train_skewness = sig_data_train.skew()
plt.plot(train_skewness)

## Test Dataset

In [None]:
# Keep only the numeric columns of the test split.
numerical_test = Xtest.select_dtypes(include=numerics)

# Squash every numeric value through the sigmoid.
sig_data_test = numerical_test.pipe(sig_fn)

# Plot the per-column skewness of the transformed test data.
test_skewness = sig_data_test.skew()
plt.plot(test_skewness)

**We can see skewness lies between [-1, 1] except some feature columns (g-213, cp_type_ctl_vehicle & cp_type_trt_cp). Let's drop these columns to see the effect.**

## Training Dataset

In [None]:
# Training split without the three columns singled out above.
final_train = Xtrain.drop(columns=['g-213', 'cp_type_ctl_vehicle', 'cp_type_trt_cp'])
numeri_final_train = final_train.select_dtypes(include=numerics)

# Sigmoid-transform the remaining numeric columns.
sig_data_final_train = numeri_final_train.pipe(sig_fn)

# Plot the per-column skewness of the result.
final_train_skewness = sig_data_final_train.skew()
plt.plot(final_train_skewness)

## Test Dataset

In [None]:
# Test split without the three columns singled out above.
final_test = Xtest.drop(columns=['g-213', 'cp_type_ctl_vehicle', 'cp_type_trt_cp'])
numeri_final_test = final_test.select_dtypes(include=numerics)

# Sigmoid-transform the remaining numeric columns.
sig_data_final_test = numeri_final_test.pipe(sig_fn)

# Plot the per-column skewness of the result.
final_test_skewness = sig_data_final_test.skew()
plt.plot(final_test_skewness)

**Result is improved after dropping some columns.**

## Remove Skewness from target scored

In [None]:
# Numeric target columns only.
numeri_target_score = target_scored.select_dtypes(include=numerics)

# Sigmoid-transform the targets.
# NOTE(review): if the targets only take the values 0 and 1, a monotonic
# transform cannot change their skewness — confirm this step is useful.
sig_target_score = numeri_target_score.pipe(sig_fn)

# Per-column skewness of the transformed targets.
sig_target_score.skew()

In [None]:
sig_target_score.head()

In [None]:
# Check percentage
sig_target_score['5-alpha_reductase_inhibitor'].value_counts(normalize=True)



In [None]:
target_scored['11-beta-hsd1_inhibitor'].value_counts(normalize=True)

In [None]:
# Choose 1st target column and build a model
Ytrain=target_scored['11-beta-hsd1_inhibitor']

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
import scipy
from sklearn.linear_model import LogisticRegression
import optuna
from sklearn.metrics import log_loss, make_scorer, get_scorer


# Log loss must be computed from predicted probabilities, and a lower loss is
# better. make_scorer(log_loss) with default arguments scores hard predict()
# labels and treats a larger loss as an improvement. The built-in
# 'neg_log_loss' scorer uses predict_proba and returns the negated loss, so
# "greater is better" holds for model selection.
ftwo_scorer = get_scorer("neg_log_loss")
ftwo_scorer

In [None]:
#kf=StratifiedKFold(n_splits=10)
# Plain 10-fold splitter; no shuffle or random_state argument is passed.
# NOTE(review): the targets are described earlier in this notebook as highly
# imbalanced, so the commented-out StratifiedKFold may be the safer choice —
# confirm before using kf for evaluation.
kf = KFold(n_splits=10)

In [None]:
# Rebind Xtrain/Xtest to the sigmoid-transformed numeric frames built earlier
# (the ones with 'g-213' and the two cp_type dummy columns dropped).
Xtrain=sig_data_final_train
Xtest=sig_data_final_test


In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE draws random synthetic minority samples; a fixed seed makes the
# resampled data identical on every run.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(Xtrain, Ytrain)

In [None]:
y.value_counts(normalize=True)

In [None]:
print(list(target_scored.columns))

In [None]:
target_scored.atm_kinase_inhibitor.value_counts()

In [None]:
target_scored['5-alpha_reductase_inhibitor'].value_counts()

In [None]:
# Switch the target to atm_kinase_inhibitor and oversample its minority class.
Ytrain = target_scored['atm_kinase_inhibitor']
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic samples are identical on every run.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(Xtrain, Ytrain)

In [None]:
y.shape

In [None]:
# Same oversampling of atm_kinase_inhibitor as in the earlier cell; with the
# fixed seed it reproduces the same X and y.
Ytrain = target_scored['atm_kinase_inhibitor']
from imblearn.over_sampling import SMOTE

oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(Xtrain, Ytrain)

In [None]:
y.value_counts()

In [None]:
Ytrain = target_scored['atm_kinase_inhibitor']
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Both samplers are random; fixed seeds keep the resampled data identical on
# every run.
# Step 1: oversample the minority class up to 10% of the majority class.
oversample = SMOTE(sampling_strategy=0.1, random_state=42)
# Step 2: undersample the majority class until the minority/majority ratio is 0.5.
undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

steps = [('o', oversample), ('u', undersample)]
pipeline = Pipeline(steps=steps)


X, y = pipeline.fit_resample(Xtrain, Ytrain)

In [None]:
y.value_counts()

In [None]:
y.shape

In [None]:
target_scored['atm_kinase_inhibitor'].skew()

In [None]:
y.skew()

# Save Dataset

In [None]:
## Save clean train and test dataset
#sig_data_final_train.to_csv('train_clean', index=False)
# Writes the transformed test features to a file named 'test_clean' (no file
# extension) in the working directory, without the index column.
# NOTE(review): the frame only holds columns kept by select_dtypes(numerics),
# so an id column is presumably missing from the saved file — confirm that
# downstream consumers do not need it.
sig_data_final_test.to_csv('test_clean', index=False)



# Work in Progress......