# Set Up

In [None]:
# linear algebra
import numpy as np
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
#Unix commands
import os

# import useful tools
from glob import glob
from PIL import Image
import cv2
import pydicom
import scipy.ndimage
from skimage import measure 
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from skimage.morphology import disk, opening, closing
from tqdm import tqdm
from os import listdir, mkdir
from sklearn.decomposition import PCA
from IPython.display import HTML
from PIL import Image

# import data visualization
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import plotly.express as px

from bokeh.plotting import figure
from bokeh.io import output_notebook, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, Panel
from bokeh.models.widgets import Tabs

# import data augmentation
import albumentations as albu

# import math module
import math

In [None]:
#Libraries
import pandas_profiling
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Short aliases for the colorama codes used to colorize printed text
from colorama import Fore, Back, Style
y_, r_, g_ = Fore.YELLOW, Fore.RED, Fore.GREEN
b_, m_ = Fore.BLUE, Fore.MAGENTA
sr_ = Style.RESET_ALL  # resets all styling

In [None]:
# Model evaluation and regression utilities
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# Loading data

In [None]:
# Locations of the competition CSV files, all derived from the dataset root
# so that moving the dataset only requires changing DATASET.
DATASET = '../input/lish-moa'
TEST_FTR_PATH = f'{DATASET}/test_features.csv'
TRAIN_FTR_PATH = f'{DATASET}/train_features.csv'
TRAIN_CSV_NON_PATH = f'{DATASET}/train_targets_nonscored.csv'
TRAIN_CSV_SCR_PATH = f'{DATASET}/train_targets_scored.csv'
# Alias of TEST_FTR_PATH, kept so existing references to this name still work;
# both names point at the same test features file.
TEST_DIR_CSV_PATH = TEST_FTR_PATH

In [None]:
# Read the feature tables and both target tables into DataFrames
train_ftr, test_ftr = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FTR_PATH, TEST_FTR_PATH)
)
train_csv_non, train_csv_scr = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV_NON_PATH, TRAIN_CSV_SCR_PATH)
)

In [None]:
# Stack train and test features into one frame. ignore_index=True gives every
# row a unique label instead of repeating the row labels of each input frame.
df = pd.concat([train_ftr, test_ftr], ignore_index=True)

In [None]:
# Read the sample submission file
sample = pd.read_csv('../input/lish-moa/sample_submission.csv')
# Preview its first rows, every cell drawn on the same background color
(
    sample
    .head(3)
    .style
    .applymap(lambda _cell: 'background-color:lightsteelblue')
)

# Have A Look

The [Code Requirements](https://www.kaggle.com/c/lish-moa/overview/code-requirements) state that internet access must be disabled, so let's turn off the Internet.

In [None]:
# Embed a YouTube video in the cell output through an HTML iframe.
# NOTE(review): the player presumably needs network access in the viewer's browser to load.
HTML('<iframe width="800" height="500" src="https://www.youtube.com/embed/UMxsZdVrA7A" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>')


# Checking data

In [None]:
# Report the size of the test features table (column count reduced by one)
print('Number of rows in test set: ', len(test_ftr))
print('Number of columns in test set: ', len(test_ftr.columns) - 1)

In [None]:
# Preview the first five test rows, every cell drawn on the same background color
(
    test_ftr
    .head(5)
    .style
    .applymap(lambda _cell: 'background-color:lightsteelblue')
)

In [None]:
# Report the size of the training features table (column count reduced by one)
print('Number of rows in training set: ', len(train_ftr))
print('Number of columns in training set: ', len(train_ftr.columns) - 1)

In [None]:
# Preview the first five training rows, every cell drawn on the same background color
(
    train_ftr
    .head(5)
    .style
    .applymap(lambda _cell: 'background-color:lightsteelblue')
)

In [None]:
# Report the shape of the training features in color. The trailing reset code
# stops the last color from leaking into any text printed afterwards.
print(f"{b_}Number of rows in train data: {r_}{train_ftr.shape[0]}\n{b_}Number of columns in train data: {r_}{train_ftr.shape[1]}{sr_}")

In [None]:
# Preview the first five rows of the non-scored targets, every cell drawn on the same background color
(
    train_csv_non
    .head(5)
    .style
    .applymap(lambda _cell: 'background-color:lightsteelblue')
)

In [None]:
# Report the shape of the non-scored targets table in color. The label names
# the table actually measured, and the trailing reset code stops the last
# color from leaking into any text printed afterwards.
print(f"{b_}Number of rows in train nonscored targets: {r_}{train_csv_non.shape[0]}\n{b_}Number of columns in train nonscored targets: {r_}{train_csv_non.shape[1]}{sr_}")

* How to handle the non-scored data is being discussed in [this thread](https://www.kaggle.com/c/lish-moa/discussion/180429).

In [None]:
# Preview the first five rows of the scored targets, every cell drawn on the same background color
(
    train_csv_scr
    .head(5)
    .style
    .applymap(lambda _cell: 'background-color:lightsteelblue')
)

* [sig_id](https://clue.io/connectopedia/glossary) is a CMap unique identification number assigned to each signature generated from L1000 data.

In [None]:
# Report the shape of the scored targets table in color. The label names the
# table actually measured, and the trailing reset code stops the last color
# from leaking into any text printed afterwards.
print(f"{b_}Number of rows in train scored targets: {r_}{train_csv_scr.shape[0]}\n{b_}Number of columns in train scored targets: {r_}{train_csv_scr.shape[1]}{sr_}")

In [None]:
# Per-column count of missing entries in the training features
train_ftr.isna().sum()

* Therefore, we can conclude that there are no missing values in the training features.

In [None]:
# Per-column count of missing entries in the non-scored training targets
train_csv_non.isna().sum()

* Therefore, we can conclude that there are no missing values in the non-scored training targets.

In [None]:
# Per-column count of missing entries in the scored training targets
train_csv_scr.isna().sum()

* Therefore, we can conclude that there are no missing values in the scored training targets.

In [None]:
# Print a summary of the combined train + test features frame
df.info()

# Checking data statistics

* Features g- signify gene expression data.

In [None]:
# Count the columns whose name carries the gene-expression prefix 'g-'
(
    train_ftr.columns
    .str.startswith('g-')
    .sum()
)

* Features c- signify cell viability data.

In [None]:
# Count the columns whose name carries the cell-viability prefix 'c-'
(
    train_ftr.columns
    .str.startswith('c-')
    .sum()
)

In [None]:
# Descriptive statistics of the training features, shown on a yellow background
(
    train_ftr
    .describe()
    .style
    .applymap(lambda _cell: 'background-color:yellow')
)

In [None]:
# Descriptive statistics of the test features, shown on a light green background
(
    test_ftr
    .describe()
    .style
    .applymap(lambda _cell: 'background-color:lightgreen')
)

In [None]:
# Mean, standard deviation and count of every numeric feature per treatment
# group. Only numeric, non-key columns are aggregated: 'mean' and 'std' are
# undefined for string columns, and recent pandas versions raise on them
# instead of silently skipping them.
group_keys = ['cp_type', 'cp_time', 'cp_dose']
numeric_cols = [
    col for col in train_ftr.select_dtypes(include='number').columns
    if col not in group_keys  # cp_time is numeric but is a grouping key
]
train_ftr.groupby(group_keys)[numeric_cols].agg(['mean', 'std', 'count'])

* cp_type indicates samples treated with a compound (`trt_cp`) or with a control perturbation (`ctl_vehicle`).

In [None]:
# Demo progress bar: advance in fixed steps until the declared total is reached
from tqdm import tqdm
import time

PROGRESS_TOTAL = 1000  # value at which the bar is complete
PROGRESS_STEP = 25     # amount added on every iteration

# The context manager closes the bar when the block exits, even on interruption
with tqdm(total=PROGRESS_TOTAL) as bar:
    # Add description
    bar.set_description('Progress rate')
    # total // step iterations, so the counter stops exactly at the total
    # instead of overshooting it
    for _ in range(PROGRESS_TOTAL // PROGRESS_STEP):
        # Set the progress
        bar.update(PROGRESS_STEP)
        time.sleep(1)

# Data Visualization

In [None]:
# Histograms for a sample of the 'c-' (cell viability) features on a 4x4 grid
plt.figure(figsize=(16, 16))
cols = [f'c-{idx}' for idx in (*range(1, 9), *range(92, 100))]
for position, col in enumerate(cols, start=1):
    plt.subplot(4, 4, position)
    plt.hist(train_ftr[col], bins=100, alpha=1, color='#00FFFF');
    plt.title(col)

In [None]:
# Histograms for a sample of the 'g-' (gene expression) features on a 4x4 grid
plt.figure(figsize=(16, 16))
cols = [f'g-{idx}' for idx in (*range(1, 9), *range(92, 100))]
for position, col in enumerate(cols, start=1):
    plt.subplot(4, 4, position)
    plt.hist(train_ftr[col], bins=100, alpha=1, color='#800080');
    plt.title(col)

In [None]:
# Pie chart of the cp_type distribution in the training data.
# Slice labels are taken from the counted values themselves, so a label can
# never end up on the wrong slice if the ordering of the counts changes.
cp_type_counts = train_ftr["cp_type"].value_counts()
plt.pie(cp_type_counts, labels=cp_type_counts.index, autopct="%.1f%%")
plt.title("Ratio of CPtypes of Training data")
plt.show()

* [Data Description](https://www.kaggle.com/c/lish-moa/data) says that cp_time and cp_dose indicate treatment duration (24, 48, 72 hours) and dose (high or low).
* [The cp_time column](https://www.kaggle.com/c/lish-moa/discussion/184005) indicates the amount of time elapsed between adding the drug and when the measurement was taken. 

In [None]:
# Distribution of the treatment duration (cp_time) in the training data
fig, ax = plt.subplots(figsize=(15, 5))
sns.distplot(train_ftr['cp_time'], color='blue', bins=10, ax=ax)
ax.set_title("Train: Treatment duration ", fontsize=15, weight='bold')
plt.show()

* The ratio of 24h to 48h to 72h isn't much different.

* [Data Description](https://www.kaggle.com/c/lish-moa/data) says that cp_time and cp_dose indicate treatment duration (24, 48, 72 hours) and dose (high or low).
* [The cp_dose column](https://www.kaggle.com/c/lish-moa/discussion/184005) indicates the dose level used in the experiment. Generally a higher dose will have a stronger effect.

In [None]:
# Pie chart of the cp_dose distribution in the training data.
# Slice labels are taken from the counted values themselves, so a label can
# never end up on the wrong slice if the ordering of the counts changes.
cp_dose_counts = train_ftr["cp_dose"].value_counts()
plt.pie(cp_dose_counts, labels=cp_dose_counts.index, autopct="%.1f%%")
plt.title("Ratio of CPdose")
plt.show()

* The ratio of D1 to D2 isn't much different.

In [None]:
# Demo progress bar: advance in fixed steps until the declared total is reached
PROGRESS_TOTAL = 1000  # value at which the bar is complete
PROGRESS_STEP = 25     # amount added on every iteration

# The context manager closes the bar when the block exits, even on interruption
with tqdm(total=PROGRESS_TOTAL) as bar:
    # Add description
    bar.set_description('Progress rate')
    # total // step iterations, so the counter stops exactly at the total
    # instead of overshooting it
    for _ in range(PROGRESS_TOTAL // PROGRESS_STEP):
        # Set the progress
        bar.update(PROGRESS_STEP)
        time.sleep(1)

In [None]:
# Count plots of cp_type alone, then of cp_time and cp_dose split by cp_type.
# The plotted Series is passed as x= rather than positionally: seaborn
# releases with keyword-only plotting arguments do not interpret a positional
# Series as the x variable.
plt.figure(figsize=(5,12))
plt.subplot(3,1,1)
splot = sns.countplot(x=train_ftr["cp_type"],color='#33FFCC')
# Write each bar's height just above the bar
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.1f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')
plt.title('cp_type')
plt.subplot(3,1,2)
sns.countplot(x=train_ftr['cp_time'],hue=train_ftr['cp_type'],color='#00FF00')
plt.title('cp_time and cp_type')
plt.subplot(3,1,3)
sns.countplot(x=train_ftr['cp_dose'],hue=train_ftr['cp_type'],color='#00FFFF')
plt.title('cp_dose and cp_type')
plt.tight_layout()

# Correlation analysis

In [None]:
# Calculate Pearson's r between the numeric feature columns using pandas.
# String columns are excluded explicitly: a correlation is undefined for them,
# and recent pandas versions raise on them instead of skipping them silently.
res = train_ftr.select_dtypes(include='number').corr()

# Show correlation matrix
print(res)

In [None]:
# Render the correlation matrix as a heat map with square cells
sns.heatmap(data=res, square=True)

# Acknowledgements

* [Mechanisms of Action (MoA) Prediction. EDA](https://www.kaggle.com/isaienkov/mechanisms-of-action-moa-prediction-eda)
* [Explorations of Action - MoA EDA](https://www.kaggle.com/headsortails/explorations-of-action-moa-eda)
* [MoA Prediction: Starter notebook](https://www.kaggle.com/ruchi798/moa-prediction-starter-notebook)
* [Drugs classification: Mechanisms of Action](https://www.kaggle.com/amiiiney/drugs-classification-mechanisms-of-action)
* [Mechanisms of Action (MoA) Prediction. EDA](https://www.kaggle.com/yutohisamatsu/mechanisms-of-action-moa-prediction-eda)
* [The MoA challenge- An analysis of the data](https://www.kaggle.com/bibhash123/the-moa-challenge-an-analysis-of-the-data)

# Your upvote will motivate me.

To be continued.....