In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold 
from xgboost import XGBRegressor
from sklearn.metrics import log_loss
import seaborn as sns
from pandas_profiling import ProfileReport

In [None]:
# Read every CSV from the lish-moa input folder.
# Feature tables (one row per sample, keyed by 'sig_id').
train_f = pd.read_csv('../input/lish-moa/train_features.csv')
test_f = pd.read_csv('../input/lish-moa/test_features.csv')

# Label tables for the training samples, plus the train_drug table.
targ_score = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
targ_nscore = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
drug = pd.read_csv('../input/lish-moa/train_drug.csv')

# Submission template; copied later and filled with predictions.
sample = pd.read_csv("../input/lish-moa/sample_submission.csv")

In [None]:
# Attach the scored and then the non-scored targets to the training features.
# Left joins on 'sig_id' keep every feature row.
train = (
    train_f
    .merge(targ_score, how = 'left', on = 'sig_id')
    .merge(targ_nscore, how = 'left', on = 'sig_id')
)

# EDA

In [None]:
# Row-wise sum of the target columns, i.e. how many labels are set per
# training sample (assuming 0/1 targets), for scored and non-scored targets.
# NOTE: columns[1:] skips the first column of each target table; that is
# 'sig_id' only as long as this cell runs before the cell that removes it.
stargs_name = list(targ_score.columns[1:])
scored_targets = train[stargs_name].sum(axis = 1)
nscored_targets = train[list(targ_nscore.columns[1:])].sum(axis = 1)

fig,axes = plt.subplots(figsize = (32,8),ncols = 2)
# Pass the series explicitly as x=: seaborn >= 0.12 binds the first positional
# argument to `data`, not `x`, so the positional form no longer plots counts of
# the values as intended.
sns.countplot(x = scored_targets,ax = axes[0])
sns.countplot(x = nscored_targets,ax = axes[1])

for ax in axes:
    ax.tick_params(axis = 'x',labelsize = 20)
    ax.tick_params(axis = 'y',labelsize = 20)

axes[0].set_title('Training set unique scored per sample',size = 22 , pad = 22)
axes[1].set_title('Training set unique not scored per sample',size = 22 , pad = 22)
plt.show()

In [None]:
# Compare the distribution of the three treatment columns between the
# training set (left column of plots) and the test set (right column).
fig, axes = plt.subplots(figsize = (24,24),nrows = 3, ncols = 2)

datasets = [('training',train_f),('test',test_f)]
for row, feature in enumerate(['cp_type','cp_time','cp_dose']):
    for col, (label, frame) in enumerate(datasets):
        # x= keyword: seaborn >= 0.12 treats a positional argument as `data`.
        sns.countplot(x = frame[feature],ax = axes[row][col])
        axes[row][col].set_title(f'{label} Set {feature} Distribution',size = 20,pad = 15)

plt.show()

## Inhibitors are molecules that bind to an enzyme and decrease its activity.

## Agonists are chemicals that bind to a receptor and activate it to produce a biological response.

## Antagonists block the action of agonists.

In [None]:
# How many more rows the training feature table has than the test feature table.
len(train_f) - len(test_f)

In [None]:
# (rows, columns) of the test feature table.
test_f.shape

In [None]:
# (rows, columns) of the training feature table.
train_f.shape

In [None]:
# Column names of the training feature table.
train_f.columns

In [None]:
# Number of rows in the scored-targets table.
len(targ_score)

In [None]:
# Preview the first rows of the merged table (features + scored + non-scored targets).
train.head()

# Model

In [None]:
# Preview the first rows of the table loaded from train_drug.csv.
drug.head()

## **train_features.csv** - 
  ### Features for the training set. 
  
  Features **g-** signify **gene expression** data, and **c-** signify **cell viability** data.
  
  **cp_type** indicates samples treated with a compound (`trt_cp`) or with a control perturbation (`ctl_vehicle`); control perturbations have no MoAs.
    
  **cp_time and cp_dose** indicate treatment *duration* (24, 48, 72 hours) and *dose* (high or low).

## Gene Expression - 

Gene expression is the process by which information from a gene is used in the synthesis of a functional gene product. These products are often proteins, but in non-protein-coding genes such as transfer RNA or small nuclear RNA genes, the product is a functional RNA.

## Cell viability -

Cell viability assays use a variety of markers as indicators of metabolically active (living) cells

## Encode cp_type and cp_dose

In [None]:
def preprocess(df):
    """Return an encoded copy of a feature table, ready for modelling.

    The input frame is left untouched. In the returned copy:

    - 'cp_type' is mapped to 0 ('trt_cp') / 1 ('ctl_vehicle'),
    - 'cp_dose' is mapped to 0 ('D1') / 1 ('D2'),
    - the 'sig_id' identifier column is removed.

    Values that are missing from a mapping become NaN (``Series.map`` behaviour).

    Raises:
        KeyError: if 'cp_type', 'cp_dose' or 'sig_id' is not a column of ``df``.
    """
    df = df.copy()
    # Replace the whole column (df[col] = ...) instead of df.loc[:, col] = ...:
    # on pandas 2.x the .loc form writes into the existing object-dtype column
    # in place, so the encoded values would not end up with a numeric dtype.
    df['cp_type'] = df['cp_type'].map({'trt_cp':0,'ctl_vehicle':1})
    df['cp_dose'] = df['cp_dose'].map({'D1':0,'D2':1})
    return df.drop(columns = 'sig_id')

# Encode the train and test features with the same mapping. preprocess()
# works on a copy, so train_f / test_f keep their raw values.
# NOTE: this rebinds `train`, which earlier cells used for the merged
# features + targets table.
train = preprocess(train_f)
test = preprocess(test_f)
# Remove the id column from the labels so only target columns remain.
# errors='ignore' keeps this cell re-runnable; a plain `del` raises KeyError
# the second time it is executed.
targ_score = targ_score.drop(columns = 'sig_id', errors = 'ignore')

### cp_type encoding: trt_cp → 0, ctl_vehicle → 1
### cp_dose encoding: D1 → 0, D2 → 1



In [None]:
# (rows, columns) of the scored targets once 'sig_id' has been removed by the cell above.
targ_score.shape

In [None]:
def metric(y_true,y_pred):
    """Log loss of ``y_pred`` against ``y_true`` for a single binary target.

    ``y_pred`` is cast to float before scoring, and the label set is fixed to
    [0, 1] so the loss is defined even when ``y_true`` contains one class only.
    """
    single_loss = log_loss(y_true, y_pred.astype(float), labels = [0,1])
    # Mean of a one-element list: keeps the NumPy scalar return type.
    return np.mean([single_loss])

In [None]:
# Preview the raw test features (preprocess() encoded a copy, not this frame).
test_f.head()

In [None]:
# Train one XGBoost model per scored target with K-fold cross-validation.
# Every fold model also predicts the test set; the fold predictions are
# averaged into that target's column of the submission.
cols = targ_score.columns
submission = sample.copy()
submission.loc[:,cols] = 0  # start at zero so fold predictions can be accumulated

N_splits = 5
SEED = 42  # fixed seed: shuffled folds (and the reported loss) are reproducible
# The split depends only on the number of rows, so build it once and reuse the
# same folds for every target instead of reshuffling inside the loop.
folds = list(KFold(n_splits = N_splits, shuffle = True, random_state = SEED).split(train))

off_loss = 0  # running sum of each target's mean validation loss
for c, columns in enumerate(cols,1):
    y = targ_score[columns]
    total_loss = 0

    for fn,(trn_idx,val_idx) in enumerate(folds):
        # trn_idx / val_idx are positional row indices of this fold
        print('Fold :',fn+1)
        X_train,X_val = train.iloc[trn_idx],train.iloc[val_idx]
        y_train,y_val = y.iloc[trn_idx],y.iloc[val_idx]

        model = XGBRegressor(tree_method = 'gpu_hist',
                           min_child_weight = 1,
                           learning_rate = 0.015,
                           colsample_bytree = 0.65,
                           gamma = 3.69,
                           max_delta_step = 2.07,
                           max_depth = 10,
                           n_estimators = 207,
                           subsample = 1)

        model.fit(X_train,y_train)
        pred = model.predict(X_val)
        loss = metric(y_val,pred)
        total_loss += loss
        # Each fold contributes 1/N_splits of the final test prediction.
        predictions = model.predict(test)
        submission[columns] += predictions/N_splits

    off_loss += total_loss/N_splits #average loss
    print('Model '+str(c)+":Loss = "+str(total_loss/N_splits))

In [None]:
# Mean cross-validated log loss over all targets. off_loss accumulates one
# averaged loss per column in `cols`, so divide by the number of targets
# rather than a hard-coded 100.
off_loss/len(cols)

In [None]:
# Display the submission table holding the fold-averaged test predictions.
submission

In [None]:
# Rows whose cp_type was encoded as 1 (ctl_vehicle) get every target set to 0.
control_rows = test['cp_type'].eq(1)
submission.loc[control_rows, targ_score.columns] = 0

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv',index = False)