# EDA  

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
import lightgbm as lgb
%matplotlib inline

from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature tables for the train and test splits.
train = pd.read_csv("../input/lish-moa/train_features.csv")
test = pd.read_csv("../input/lish-moa/test_features.csv")

# Target tables (scored and non-scored) and the submission template.
train_targets_scored = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
train_targets_nonscored = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
sub = pd.read_csv("../input/lish-moa/sample_submission.csv")

# Names of the feature columns, grouped by their 'g-' / 'c-' prefix.
GENES = list(filter(lambda name: name.startswith('g-'), train.columns))
CELLS = list(filter(lambda name: name.startswith('c-'), train.columns))

# StandardScalerとRank Gaussの違い

In [None]:
# Before any transformation: distribution of the first 'g-' feature.
fig, ax = plt.subplots()
ax.hist(train[GENES[0]], bins=50)
ax.set_title("before transform")
plt.show()

In [None]:
# RankGauss: map every 'g-' / 'c-' feature onto a normal distribution with a
# per-column QuantileTransformer fitted on train and applied to train and test.
# Reference: https://www.kaggle.com/kushal1506/moa-pytorch-0-01859-rankgauss-pca-nn
for col in GENES + CELLS:
    transformer = QuantileTransformer(
        n_quantiles=100,
        output_distribution="normal",
        random_state=0,
    )

    # scikit-learn expects 2-D input, so each column becomes an (n, 1) array.
    raw_vec = train[col].to_numpy().reshape(-1, 1)
    test_vec = test[col].to_numpy().reshape(-1, 1)

    # Fit on the training column only; the test column reuses that fit.
    train[col] = transformer.fit_transform(raw_vec).ravel()
    test[col] = transformer.transform(test_vec).ravel()

In [None]:
# Distribution of the first 'g-' feature after the RankGauss transform.
fig, ax = plt.subplots()
ax.hist(train[GENES[0]], bins=50)
ax.set_title("Rank Gauss transform")
plt.show()

In [None]:
# StandardScaler
# Re-read the feature tables so the scaler starts from the untransformed values.
train = pd.read_csv("../input/lish-moa/train_features.csv")
test = pd.read_csv("../input/lish-moa/test_features.csv")
for col in GENES + CELLS:
    transformer = StandardScaler()

    # scikit-learn expects 2-D input, so each column becomes an (n, 1) array.
    raw_vec = train[col].to_numpy().reshape(-1, 1)
    test_vec = test[col].to_numpy().reshape(-1, 1)

    # Fit on the training column only; the test column reuses that fit.
    train[col] = transformer.fit_transform(raw_vec).ravel()
    test[col] = transformer.transform(test_vec).ravel()

In [None]:
# Distribution of the first 'g-' feature after StandardScaler.
fig, ax = plt.subplots()
ax.hist(train[GENES[0]], bins=50)
ax.set_title("StandardScaler transform")
plt.show()

In [None]:
def label_encoding(train: pd.DataFrame, test: pd.DataFrame, encode_cols):
    """Label-encode ``encode_cols`` consistently across train and test.

    Both frames are stacked so that every encoder is fitted on the union of
    the values seen in train and test, then the stacked frame is split back
    at the original train length.

    Args:
        train: Training frame; not modified.
        test: Test frame; not modified.
        encode_cols: Iterable of column names to replace with integer codes.

    Returns:
        A ``(train, test)`` tuple of new frames, each indexed from 0. A column
        whose encoding fails is reported on stdout and left unchanged
        (best-effort behaviour).
    """
    n_train = len(train)
    combined = pd.concat([train, test], sort=False).reset_index(drop=True)
    for f in encode_cols:
        try:
            lbl = preprocessing.LabelEncoder()
            combined[f] = lbl.fit_transform(list(combined[f].values))
        except Exception as err:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate, and report the
            # reason instead of only the column name.
            print(f"label_encoding: could not encode column {f!r}: {err!r}")
    test = combined[n_train:].reset_index(drop=True)
    # Copy so that later writes to the returned train frame do not act on a
    # slice of ``combined`` (avoids SettingWithCopyWarning).
    train = combined[:n_train].copy()
    return train, test

In [None]:
# (rows, columns) of the training feature table.
train.shape

In [None]:
# First rows of the training feature table.
train.head()

In [None]:
# Work on a copy of the scored targets so the loaded table stays untouched.
labels = train_targets_scored.copy()

In [None]:
# Keep only the rows whose cp_type is "trt_cp", applying the same mask to the
# features and to the labels, and renumber both from 0.
res = train["cp_type"].eq("trt_cp")
train = train.loc[res].reset_index(drop=True)
labels = labels.loc[res].reset_index(drop=True)

In [None]:
# Number of label rows left after the cp_type filter.
len(labels)

In [None]:
# Count the label rows in which every target is 0. The first column is skipped
# (it is not a target). Vectorized instead of a per-row ``iloc`` loop, which
# was slow and reused the name ``i`` for both the row index and the cell value.
ans = int((labels.iloc[:, 1:] == 0).all(axis=1).sum())

print(ans)

In [None]:
# Summary statistics for the first three 'g-' and first three 'c-' features.
res = train.loc[:, ["g-0", "g-1", "g-2", "c-0", "c-1", "c-2"]]
res.describe()

In [None]:
# Distinct values taken by each of the three cp_* columns.
for cp_column in ("cp_type", "cp_time", "cp_dose"):
    print(train[cp_column].unique())

In [None]:
# First rows of the scored target table.
train_targets_scored.head()

In [None]:
# First rows of the sample submission.
sub.head()

In [None]:
# Targets of the ctl_vehicle samples.
# `train` was already filtered to trt_cp rows and re-indexed in an earlier
# cell, so pd.concat([train, train_targets_scored], axis=1) would pair features
# with the wrong target rows and contain no ctl_vehicle rows at all, making the
# check below vacuous. Re-read the unfiltered cp_type column instead; its rows
# are in the same order as train_targets_scored (the same positional alignment
# the trt_cp filter on `labels` relies on).
cp_type_all = pd.read_csv("../input/lish-moa/train_features.csv", usecols=["cp_type"])["cp_type"]
res = train_targets_scored[cp_type_all == "ctl_vehicle"]
res.describe()

In [None]:
# Confirm that no target is ever 1 when cp_type == ctl_vehicle:
# the total over every target column (sig_id excluded) should be 0.
al = sum(res[column].sum() for column in res.columns if column != "sig_id")
print(al)

In [None]:
# Targets of the trt_cp samples.
# `labels` holds exactly these rows: it was filtered with the same cp_type mask
# as `train`. Concatenating the already filtered, re-indexed `train` with the
# full target table along axis=1 would pair each sample with the wrong target
# row, so the statistics would describe the wrong rows.
res = labels[train_targets_scored.columns]
res.describe()

"cp_type"=="ctl_vehicle"であるものは正解ラベルに1が立っているものが1つもない(これが、"コントロール摂動は作用機序を持たない"の意味であると考えられる)
そもそも、陽性の正解ラベルを1つも持たないのであれば、これらの行をデータから弾いてしまうのも手である(例えば、"cp_type"=="ctl_vehicle"である時点でラベルが100%全て0になると言えるので、特徴量として学習させても無意味)。一方で、"cp_type"を隠して学習させることで、何らかの特徴を掴む可能性はありそう？

In [None]:
# Number of positive labels for every scored target (sig_id excluded),
# followed by the maximum, minimum and average of those counts.
ret = [
    train_targets_scored[column].sum()
    for column in train_targets_scored.columns
    if column != "sig_id"
]
print(*ret)
print(max(ret), min(ret), sum(ret) / len(ret))

In [None]:
# First rows of the test feature table.
test.head()

# Quantile Norm

In [None]:
# Small worked example: the rows labelled 0-4 of three 'g-' features.
df = train.loc[list(range(5)), ["g-0", "g-1", "g-2"]]

In [None]:
# Show the 5x3 sample frame.
df

In [None]:
# Keep track of the original order: tag every value with its row label.
# The columns are cast to object first so the "(row, value)" strings can be
# stored, and cells are written with .at instead of chained indexing
# (df2[col][i] = ...), which pandas documents as unreliable and which does
# nothing under Copy-on-Write.
df2 = df.astype(object)
for i in range(5):
    for col in ["g-0", "g-1", "g-2"]:
        df2.at[i, col] = str((i, df[col][i]))

In [None]:
# Show the frame of "(row, value)" strings.
df2

In [None]:
# Collect (row, value) pairs per column; they are later sorted in ascending
# order by their second element (the value).
lst = [
    [(i, df[column][i]) for i in range(5)]
    for column in ("g-0", "g-1", "g-2")
]

In [None]:
# Show the per-column lists of (row, value) pairs.
lst

In [None]:
from operator import itemgetter

# Order each column's (row, value) pairs by value, ascending.
for pairs in lst:
    pairs.sort(key=itemgetter(1))

In [None]:
# Show the pairs after sorting each column by value.
lst

In [None]:
# Lay the sorted (row, value) pairs out as a frame; each pair still carries the
# row it came from. Columns are cast to object so the strings can be stored,
# and cells are written with .at instead of chained indexing
# (df2[col][i] = ...), which does nothing under pandas Copy-on-Write.
df2 = df.astype(object)
for i in range(5):
    for j, col in enumerate(["g-0", "g-1", "g-2"]):
        df2.at[i, col] = str(lst[j][i])

In [None]:
# Show the frame of value-sorted "(row, value)" strings.
df2

In [None]:
# Mean of the i-th smallest value across the three columns. The previous code
# stored the plain sum under the name "mean"; quantile normalization replaces
# each rank with the mean, so divide by the number of columns. The column is
# assigned in one step rather than cell by cell through chained indexing.
df2["mean"] = [
    sum(lst[j][i][1] for j in range(3)) / 3
    for i in range(5)
]

In [None]:
# Show the sorted pairs together with the "mean" column.
df2

In [None]:
# Replace every value by the "mean" entry of its rank while keeping the row
# each value came from. The same pairs are collected in lst2 for the next step.
# The value columns are made object dtype so the strings can be stored, and
# cells are written with .at instead of chained indexing (df3[col][i] = ...),
# which does nothing under pandas Copy-on-Write.
value_cols = ["g-0", "g-1", "g-2"]
df3 = df2.astype({col: object for col in value_cols})
lst2 = [[] for _ in range(3)]
for i in range(5):
    rank_value = df2["mean"][i].round(4)
    for j, col in enumerate(value_cols):
        pair = (lst[j][i][0], rank_value)
        df3.at[i, col] = str(pair)
        lst2[j].append(pair)
df3

In [None]:
# Show the per-column lists of (row, rank value) pairs.
lst2


In [None]:
# Put every column back in row order: tuples sort by their first element,
# which is the row each value came from.
for i in range(3):
    lst2[i].sort()

# Columns are cast to object so the "(row, value)" strings can be stored, and
# cells are written with .at instead of chained indexing (df2[col][i] = ...),
# which does nothing under pandas Copy-on-Write.
df2 = df.astype(object)
for i in range(5):
    for j, col in enumerate(["g-0", "g-1", "g-2"]):
        df2.at[i, col] = str(lst2[j][i])

In [None]:
# Show the "(row, value)" strings back in row order.
df2

In [None]:
# Final result: keep only the replaced values, dropping the row tags.
# Whole columns are assigned at once instead of cell by cell through chained
# indexing (df2[col][i] = ...), which does nothing under pandas Copy-on-Write.
df2 = df.copy()
for j, col in enumerate(["g-0", "g-1", "g-2"]):
    df2[col] = [lst2[j][i][1] for i in range(5)]
df2