In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.decomposition import PCA
from collections import Counter
import sys
sys.path.append('../input/iterativestratification')

import matplotlib.pyplot as plt
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import plotly.express as px
plt.rcParams['figure.figsize']=(12,5)
import os
# Walk the '/kaggle/input' tree and print the full path of every file in it.
# NOTE(review): the loop variables (`dirname`, `filenames`, `filename`) stay
# defined at module level once the loop ends, holding whatever the last
# iterations assigned; confirm whether later code depends on them.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # One line of output per file: folder joined with the file name.
        print(os.path.join(dirname, filename))

def file_appender(x, root='/kaggle/input'):
    """Return the full path of the data file named `x` found under `root`.

    The directory tree is searched explicitly rather than joining `x` onto a
    loop variable left over from an earlier directory listing, so the result
    does not depend on which folder happened to be visited last.

    Parameters
    ----------
    x : str
        Bare file name, e.g. 'train_features.csv'.
    root : str, optional
        Top of the directory tree to search. Defaults to '/kaggle/input'.

    Returns
    -------
    str
        Path of the first file called `x` met while walking `root`
        (sub-folders are visited in sorted order so the choice is stable).

    Raises
    ------
    FileNotFoundError
        If no file called `x` exists anywhere under `root`.
    """
    for folder, subfolders, files in os.walk(root):
        # Sorting in place makes os.walk descend in a deterministic order.
        subfolders.sort()
        if x in files:
            return os.path.join(folder, x)
    raise FileNotFoundError("Could not find " + repr(x) + " under " + repr(root))
# Read the five CSV tables in a fixed order and bind each one to its own name.
(training_data,
 testing_data,
 labels_train,
 labels_extra,
 submission_sample) = [
    pd.read_csv(file_appender(csv_name))
    for csv_name in (
        'train_features.csv',
        'test_features.csv',
        'train_targets_scored.csv',
        'train_targets_nonscored.csv',
        'sample_submission.csv',
    )
]

# Feature column names of the training table, grouped by their prefix.
genes = list(filter(lambda col: col.startswith('g-'), training_data.columns))
cells = list(filter(lambda col: col.startswith('c-'), training_data.columns))



**Let's perform PCA on the cell (`c-`) features first, to see how much of the variance in the data is preserved by the first four principal components.**

In [None]:
# Reduce the cell feature columns to four principal components.
pca = PCA(n_components=4)
pca_result = pca.fit_transform(training_data[cells].values)

# Axis captions keyed by the component's position (as a string); each caption
# shows the percentage of variance that component explains.
labels = dict(
    (str(i), f"PC {i+1} ({var:.1f}%)")
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
)

# Pairwise scatter plots of the four components; the diagonal panels are hidden.
fig = px.scatter_matrix(
    pca_result,
    dimensions=range(4),
    labels=labels,
    title="TOTAL EXPLAINED VARIANCE CELLS: " + str(sum(pca.explained_variance_ratio_) * 100),
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
# Reduce the gene feature columns to four principal components.
pca = PCA(n_components=4)
pca_result = pca.fit_transform(training_data[genes].values)

# Axis captions keyed by the component's position (as a string); each caption
# shows the percentage of variance that component explains.
labels = dict(
    (str(i), f"PC {i+1} ({var:.1f}%)")
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
)

# Pairwise scatter plots of the four components; the diagonal panels are hidden.
fig = px.scatter_matrix(
    pca_result,
    dimensions=range(4),
    labels=labels,
    title="TOTAL EXPLAINED VARIANCE GENES: " + str(sum(pca.explained_variance_ratio_) * 100),
)
fig.update_traces(diagonal_visible=False)
fig.show()

How many labels does each training sample have?

In [None]:
# Row-wise sum over every label column (everything except 'sig_id'), then a
# tally of how often each row total occurs, most frequent first.
target_columns = [col for col in labels_train.columns if col != 'sig_id']
labels_per_sample = labels_train[target_columns].sum(axis=1)
res = np.array(Counter(labels_per_sample).most_common())

# Column 0 holds the row total, column 1 the number of rows with that total.
row_totals, sample_counts = res[:, 0], res[:, 1]
plt.bar(row_totals, sample_counts, align='center', alpha=0.5)
plt.xticks(row_totals)
plt.ylabel('no of sample')
plt.title('How many multi-labels')

In [None]:
# Column-wise sum of every label column (everything except 'sig_id').
most_common_label = labels_train[[col for col in labels_train.columns if col != 'sig_id']].sum(axis=0)
indexes = list(most_common_label.index)
values = list(most_common_label.values)

# Two-column table (label name, total), largest totals first.
data_dict = {'label': indexes, 'values': values}
most_common = pd.DataFrame(data_dict).sort_values(by='values', ascending=False)

# Bar chart of the first 50 rows of the sorted table.
top_rows = most_common.iloc[:50]
plt.figure(figsize=(20, 5))
plt.bar(top_rows['label'], top_rows['values'], align='center', alpha=0.5)
plt.xticks(top_rows['label'], rotation='vertical', fontsize=15)
plt.ylabel('Count')
plt.title('Most Popular label')


In [None]:
# Column-wise sum of every label column (everything except 'sig_id').
most_common_label = labels_train[[col for col in labels_train.columns if col != 'sig_id']].sum(axis=0)
indexes = list(most_common_label.index)
values = list(most_common_label.values)

# Two-column table (label name, total), smallest totals first.
data_dict = {'label': indexes, 'values': values}
plt.figure(figsize=(20, 5))
most_common = pd.DataFrame(data_dict).sort_values(by='values', ascending=True)

# Bar chart of the first 50 rows of the sorted table.
bottom_rows = most_common.iloc[:50]
plt.bar(bottom_rows['label'], bottom_rows['values'], align='center', alpha=0.5)
plt.xticks(bottom_rows['label'], rotation='vertical', fontsize=15)
plt.ylabel('Count')

plt.title('Most UNPopular label')

In [None]:
# Categorical columns that get expanded into indicator columns.
list_of_potentials = ['cp_type', 'cp_time', 'cp_dose']


def make_dummies(input_data):
    """Return `input_data` with indicator columns appended on the right.

    For each column named in `list_of_potentials`, the `pd.get_dummies`
    encoding of that column is concatenated after the original columns, in
    the order the names are listed. The original columns are kept and the
    input frame is not modified.
    """
    encoded_parts = [pd.get_dummies(input_data[column]) for column in list_of_potentials]
    return pd.concat([input_data, *encoded_parts], axis=1)
# Append the indicator columns to each table, then remove the raw categorical
# columns they were derived from.
training_data_for_model = make_dummies(training_data).drop(columns=list_of_potentials)
testing_data_for_model = make_dummies(testing_data).drop(columns=list_of_potentials)

In [None]:
# Column count of the training table as loaded (not the encoded `training_data_for_model`).
len(training_data.columns)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Model inputs: every column of the encoded training frame except 'sig_id'.
useful_columns = [col for col in training_data_for_model.columns if col != 'sig_id']
# Targets: every column of the label table except 'sig_id'.
columns_labels = [col for col in labels_train.columns if col != 'sig_id']
# One entry per target, starting out as an empty dict.
label_dict = {label: {} for label in columns_labels}

In [None]:
from sklearn.model_selection import KFold
# Plain arrays for the selected feature columns and for all target columns.
feature_frame = training_data_for_model[useful_columns]
target_frame = labels_train[columns_labels]
X, y = feature_frame.values, target_frame.values

# Five-fold splitter; every other KFold option is left at its default.
kf1 = KFold(n_splits=5)


In [None]:
def score_calculator(predicted, true_values, eps=1e-15):
    """Mean binary cross-entropy (log loss) over the first axis.

    Parameters
    ----------
    predicted : array-like
        Predicted probabilities, same shape as `true_values`.
    true_values : array-like
        Ground-truth values (0 or 1).
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` before the logarithm is
        taken, so a prediction of exactly 0 or 1 yields a large finite
        penalty instead of ``nan``/``inf``.

    Returns
    -------
    float or numpy.ndarray
        The negated sum over axis 0 of the per-element log-likelihood divided
        by the length of axis 0: a scalar for 1-D input, one value per column
        for 2-D input.
    """
    predicted = np.clip(np.asarray(predicted, dtype=float), eps, 1 - eps)
    true_values = np.asarray(true_values, dtype=float)
    a = (true_values * np.log(predicted)) + ((1 - true_values) * np.log(1 - predicted))
    # Divide by the number of rows first, then sum, giving the mean over axis 0.
    a = a / a.shape[0]
    return -1 * np.sum(a, axis=0)

In [None]:
print ("Starting now...")
# Probabilities are clipped to [PROBA_EPS, 1 - PROBA_EPS] before scoring so
# that the logarithms in the log loss never receive exactly 0 or 1.
PROBA_EPS = 1e-15
for train_index, test_index in kf1.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print ("X_train: "+str(X_train.shape)+"  Xtest: "+str(X_test.shape))
    print ("y_train: "+str(y_train.shape)+"  ytest: "+str(y_test.shape))
    predicted_r=[]   # hard class predictions, one array per fitted label
    predicted_p=[]   # probability of class 1, one array per fitted label
    actual=[]        # held-out true values, one array per fitted label

    for i,x in enumerate(columns_labels):
        print ("Now processing "+str(x)+" Which is "+str(i+1)+ "of "+str(len(columns_labels)))
        # Column i of the target matrix belongs to columns_labels[i]. Slicing
        # the fold arrays keeps the full matrix `y` (and the fold-level
        # y_train / y_test) intact instead of overwriting them per label.
        y_train_label = y_train[:, i]
        y_test_label = y_test[:, i]
        model=LogisticRegression(max_iter=1000)
        try:
            model.fit(X_train,y_train_label)
            # predict_proba has one column per entry of model.classes_;
            # pick the column that corresponds to class 1.
            positive_column = list(model.classes_).index(1)
            predicted_probability=model.predict_proba(X_test)[:, positive_column]
            predicted=model.predict(X_test)
        except Exception as e:
            # Best effort: report the failure and leave this label without a
            # fitted model; its label_dict entry keeps its previous value.
            print (e)
            continue
        predicted_r.append(predicted)
        predicted_p.append(predicted_probability)
        actual.append(y_test_label)
        label_dict[x]=model

    print ("TOTAL LOG LOSS")
    # Score the predicted probabilities; hard 0/1 predictions would put
    # log(0) into the loss.
    log_losses_manual=[
        score_calculator(np.clip(proba, PROBA_EPS, 1 - PROBA_EPS), truth)
        for proba, truth in zip(predicted_p, actual)
    ]
    total_logloss_manual=sum(log_losses_manual)

    print ("total logloss_manual: "+str(total_logloss_manual))
    # Only the first fold is trained and scored.
    break

In [None]:
# Feature matrix for the test rows, using the same columns as for training.
X_send=testing_data_for_model[useful_columns].values

# Maps each label name to its predictions for every test row.
Predicted_dic={}
for x in label_dict:
    model=label_dict[x]
    if not hasattr(model, 'predict_proba'):
        # The entry does not hold a fitted classifier (e.g. it is still the
        # empty placeholder), so fall back to 0 for every row.
        Predicted_dic[x]=[0]*len(X_send)
        print ("error predicting 0 on all")
        continue
    try:
        # Submit the probability of class 1 rather than a hard 0/1 label:
        # the notebook scores itself with log loss, which punishes confident
        # wrong answers heavily.
        positive_column = list(model.classes_).index(1)
        Predicted_dic[x]=model.predict_proba(X_send)[:, positive_column]
    except Exception as e:
        # Keep the best-effort fallback, but show what went wrong instead of
        # swallowing every error silently.
        print (e)
        Predicted_dic[x]=[0]*len(X_send)
        print ("error predicting 0 on all")

In [None]:
# Write each label's predictions into the submission column of the same name.
for label_name, label_predictions in Predicted_dic.items():
    submission_sample[label_name] = list(label_predictions)

In [None]:
# Save the filled-in submission table as 'submission.csv', without the row index.
submission_sample.to_csv('submission.csv', index=False)