In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly_express as px
import matplotlib.image as mpimg
from tabulate import tabulate
import missingno as msno 
from IPython.display import display_html
from PIL import Image
import gc
import cv2
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# All competition CSVs live in one directory; build each path from it.
DATA_DIR = '/kaggle/input/lish-moa'

train_features = pd.read_csv(f'{DATA_DIR}/train_features.csv')
train_targets_scored = pd.read_csv(f'{DATA_DIR}/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(f'{DATA_DIR}/train_targets_nonscored.csv')
test_features = pd.read_csv(f'{DATA_DIR}/test_features.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first five rows of the training features (head() defaults to 5).
train_features.head()

In [None]:
# Preview the first five rows of the test features (head() defaults to 5).
test_features.head()

In [None]:
# Preview the first five rows of the scored targets (head() defaults to 5).
train_targets_scored.head()

In [None]:

# Column sums of the scored targets (sig_id excluded), one total per target.
target_totals = train_targets_scored.drop('sig_id', axis=1).sum()

# Keep the 30 largest totals, then re-sort ascending for the horizontal bars.
top_targets = (
    target_totals
    .sort_values(ascending=False)
    .head(30)
    .sort_values()
)

ax = top_targets.plot(kind='barh', figsize=(15, 10))
ax.set_title('Top 30 Scored Targets in Train Set', fontsize=20)
plt.show()


In [None]:
#THE DATASET CONTAINS GENE EXPRESSION (g-) AND CELL VIABILITY (c-) FEATURE COLUMNS
# Split the feature columns by name prefix: 'g-' columns and 'c-' columns.
feature_names = train_features.columns
g_cols = [name for name in feature_names if name.startswith('g-')]
c_col = [name for name in feature_names if name.startswith('c-')]

In [None]:
# Report how many columns were collected for each prefix group.
for label, cols in (
    ('Number of gene columns:', g_cols),
    ('Number of cell columns:', c_col),
):
    print(label, len(cols))

In [None]:
# g- refers to the gene expression data
# c- refers to the cell viability data

In [None]:
#COUNTPLOT FOR CP_TYPE IN THE TRAIN DATASET
# Number of rows per cp_type value in the training set.
colors = ["#0101DF", "#DF0101"]
f, ax = plt.subplots(figsize=(12, 5))
# The column must be passed as `x=`: seaborn 0.11 deprecated passing it
# positionally, and later releases accept only `data` positionally, so
# countplot('cp_type', data=...) raises a TypeError there.
# `ax=ax` draws explicitly on the axes created above.
sns.countplot(x='cp_type', data=train_features, palette=colors, ax=ax)
ax.set_title('cp_type counts in train dataset')
plt.show()

In [None]:
#COUNTPLOT FOR CP_TYPE IN THE TEST DATASET
# Number of rows per cp_type value in the test set.
colors = ["#0101DF", "#DF0101"]
f, ax = plt.subplots(figsize=(12, 5))
# The column must be passed as `x=`: seaborn 0.11 deprecated passing it
# positionally, and later releases accept only `data` positionally, so
# countplot('cp_type', data=...) raises a TypeError there.
# `ax=ax` draws explicitly on the axes created above.
sns.countplot(x='cp_type', data=test_features, palette=colors, ax=ax)
ax.set_title('cp_type counts in test dataset')
plt.show()

In [None]:
from plotly.offline import init_notebook_mode,iplot

In [None]:
#COUNTPLOT FOR CP_TIME IN THE TRAIN DATASET REPRESENTED BY A PIE CHART

In [None]:
# Count the rows for each cp_time value in the training set and flatten the
# result into a two-column frame for plotly.
cp_time_counts = train_features['cp_time'].value_counts()
ds = cp_time_counts.reset_index()
ds.columns = ['cp_time', 'count']

# One pie slice per cp_time value, sized by its row count.
pie_options = dict(
    names="cp_time",
    values='count',
    title='cp_time for train dataset',
    width=600,
    height=500,
)
fig = px.pie(ds, **pie_options)
fig.show()

In [None]:
# Count the rows for each cp_time value in the test set and flatten the
# result into a two-column frame for plotly.
cp_time_counts = test_features['cp_time'].value_counts()
ds = cp_time_counts.reset_index()
ds.columns = ['cp_time', 'count']

# One pie slice per cp_time value, sized by its row count.
pie_options = dict(
    names="cp_time",
    values='count',
    title='cp_time for test dataset',
    width=600,
    height=500,
)
fig = px.pie(ds, **pie_options)
fig.show()

In [None]:

# Side-by-side horizontal bar charts of cp_dose value counts for the training
# set and the public test set.
#
# The figure is sized once, here. Previously plt.subplots asked for (15, 5)
# while every .plot() call also passed figsize=(15, 3) together with ax=...;
# pandas applies that figsize to the axes' parent figure, so the (15, 5)
# request was silently overridden. (15, 3) is the size that was actually
# rendered, so it is kept.
fig, axs = plt.subplots(1, 2, figsize=(15, 3))

panels = (
    (train_features, 'training set'),
    (test_features, 'public test set'),
)
for ax, (frame, panel_title) in zip(axs, panels):
    frame['cp_dose'].value_counts().plot(kind='barh', ax=ax)
    ax.set_title(panel_title, fontsize=15)
    ax.set_xlabel('number of samples')

fig.suptitle('treatment dose', fontsize=20)
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
# from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC 
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode the three categorical treatment columns. Each encoder is
# fitted on the training column and reused for the test column so that both
# frames share the same category -> integer mapping.
label_encode = ['cp_type', 'cp_time', 'cp_dose']
for col in label_encode:
    le = LabelEncoder()
    train_features[f'{col}_le'] = le.fit_transform(train_features[col])
    test_features[f'{col}_le'] = le.transform(test_features[col])

# Model inputs: all gene and cell columns plus the encoded categoricals.
FEATURES = g_cols + c_col + ['cp_type_le', 'cp_time_le', 'cp_dose_le']
# Model outputs: every scored target column except the row identifier.
TARGETS = [t for t in train_targets_scored.columns if t != 'sig_id']

# X and y are paired purely by row position, so both frames must list the
# same sig_id values in the same order; otherwise labels would be silently
# attached to the wrong samples.
assert train_features['sig_id'].equals(train_targets_scored['sig_id']), \
    'train_features and train_targets_scored rows are not aligned on sig_id'

X = train_features[FEATURES].values
X_test = test_features[FEATURES].values
y = train_targets_scored[TARGETS]

# Hold out 10% of the training rows for validation. The fixed seed makes the
# split (and everything computed from it) identical on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)
# Train and test rows stacked into one matrix (used to fit the scaler / PCA).
X_full = np.concatenate([X, X_test])

In [None]:
#APPLYING STANDARD SCALER

In [None]:
# Fit one scaler on the stacked train + test matrix, then standardise each
# split with that same fitted scaler.
scale = StandardScaler().fit(X_full)
X_train, X_val, X_test = (
    scale.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
#APPLYING PCA

In [None]:
# Project the standardised features onto 100 principal components.
pca = PCA(n_components=100, svd_solver='full')
# X_train / X_val / X_test have already been standardised by `scale`, but
# X_full still holds the raw values. Fitting PCA on raw X_full and then
# transforming standardised data would apply a mean and components learned in
# a different feature space, so X_full is standardised before fitting.
pca.fit(scale.transform(X_full))
X_train = pca.transform(X_train)
X_val = pca.transform(X_val)
X_test = pca.transform(X_test)

In [None]:
# Show the (rows, columns) shape of each split after dimensionality reduction.
print(*(split.shape for split in (X_train, X_val, X_test)))

In [None]:
import warnings
warnings.simplefilter("ignore")

In [None]:
# One binary SVC per target column (one-vs-rest over the multilabel targets).
# probability=True enables predict_proba; its internal calibration shuffles
# the data, so random_state is fixed to make the predicted probabilities
# reproducible from run to run.
clf = OneVsRestClassifier(SVC(probability=True, random_state=42))
clf.fit(X_train, y_train)
# Per-target probabilities on the rows the model was trained on.
pred_train = clf.predict_proba(X_train)

In [None]:
# Per-target probabilities for the validation split and for the test set.
pred_val, pred_test = (
    clf.predict_proba(features) for features in (X_val, X_test)
)

In [None]:
# One row per test sample: a probability column for every target, followed by
# the sample's sig_id as the last column.
test_ids = test_features['sig_id'].values
sub = pd.DataFrame(data=pred_test, columns=TARGETS)
sub['sig_id'] = test_ids

In [None]:
#WHY PCA ?
#1) DIMENSIONALITY REDUCTION
#2) VISUALISING CLASSES

In [None]:
#PLEASE UPVOTE AFTER READING. THANK YOU