# MOA Basic Fast.ai

 - Using learnings from chapter 01 of fastbook
 - Additional Resources:
  - https://docs.fast.ai/tabular.learner.html#TabularLearner.predict
  - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html
  - https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

# Imports

In [None]:
import os
from tqdm.notebook import tqdm
import pandas as pd
from fastai.vision.all import *
from fastai.tabular.all import *

# Define Features & Prep Data for Fast.ai

In [None]:
%%time

### add target values to training data
# load the training features and the scored targets
df = pd.read_csv("../input/lish-moa/train_features.csv")
print("training data shape: ", df.shape)
targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
print("target data shape: ", targets.shape)
# every column of the targets file except the sample id is a target
target_list = [col for col in targets.columns if col != 'sig_id']
print("total target categories: ",len(target_list))
# create a new variable called label with index value as label
def randargmax(b, **kw):
    """Return the argmax of ``b``, breaking ties uniformly at random.

    Parameters
    ----------
    b : array-like
        Values to search.
    **kw
        Forwarded to ``np.argmax`` (for example ``axis=1``).

    Returns
    -------
    The index of a maximal element, or an array of such indices when
    ``axis`` is given.

    Notes
    -----
    When ``axis`` is given, each slice is compared against its *own*
    maximum. Comparing against the global maximum would leave slices whose
    maximum is smaller with an empty mask, and they would always return 0.
    A slice whose entries are all equal (e.g. all zeros) gets a uniformly
    random index.
    """
    b = np.asarray(b)
    axis = kw.get('axis')
    if axis is None:
        peak = b.max()
    else:
        # keepdims lets the per-slice maximum broadcast back against b
        peak = b.max(axis=axis, keepdims=True)
    # Non-maximal entries get -1, which is below every draw from [0, 1),
    # so only maximal entries can win the argmax.
    scores = np.where(b == peak, np.random.random(b.shape), -1.0)
    return np.argmax(scores, **kw)
# label each sample with the index of its top target column (ties broken by randargmax)
targets['label'] = randargmax(targets[target_list].values, axis=1)
# turn the integer index into a string label of the form "class-<index>"
targets['label'] = "class-" + targets['label'].astype(str)
# attach the label column to the training features (index-aligned join)
df2 = targets[['label']].join(df)
print("merged data shape: ", df2.shape)


Targets with very few training samples mess things up, so for now we do not model them individually and instead fold them into a single class.

In [None]:
# the ten labels with the fewest samples
(
    df2.groupby('label')['sig_id']
    .count()
    .reset_index()
    .sort_values(by='sig_id')
    .head(10)
)

In [None]:
# HACK: fold the low-volume classes into a single class ('class-125')
low_classes = ['class-82', 'class-34', 'class-141', 'class-12', 'class-125']
df2['label'] = df2['label'].where(~df2['label'].isin(low_classes), 'class-125')

In [None]:
# number of distinct labels left after merging
df2['label'].nunique()

Save new training data to disk for Fast.ai

In [None]:
# write the labelled training table (without the index) for fastai to read back
df2.to_csv(path_or_buf="training_data.csv", index=False)
# preview the first rows
df2.head(n=5)

Save the names of continuous and categorical features for Fast.ai

In [None]:
# modeling feature names: two categorical columns, everything else
# (apart from the sample id) is treated as continuous
cat_features_list = ['cp_type','cp_dose']
con_features_list = [
    col for col in df.columns.tolist()
    if col not in ('sig_id', 'cp_type', 'cp_dose')
]

Get the indexes of the targets that are used in the model; we need them for the submission.

In [None]:
# numeric suffix of every label still present, in ascending numeric order
target_indexs = sorted(int(name.split('-')[-1]) for name in df2['label'].unique())
print(target_indexs[:5])
print(len(target_indexs))

# Modeling

In [None]:
path = "./training_data.csv"

# The first argument is the CSV to read. The `path=` keyword is the
# DataLoaders' working directory (where fastai writes models and exports),
# so it must be a directory rather than the CSV file itself.
dls = TabularDataLoaders.from_csv(path, path=".", y_names="label",
    cat_names = cat_features_list,
    cont_names = con_features_list,
    procs = [Categorify, FillMissing, Normalize]
)

In [None]:
# tabular model with fastai's default architecture, reporting accuracy
learn = tabular_learner(dls, metrics=[accuracy])

In [None]:
%%time

# train for four epochs with the one-cycle schedule
learn.fit_one_cycle(n_epoch=4)

# Submission

In [None]:
# submission template and the test features to score
sub = pd.read_csv("../input/lish-moa/sample_submission.csv")
test = pd.read_csv("../input/lish-moa/test_features.csv")

In [None]:
# Submission column positions, one per predicted class, in the SAME order as
# the model's output probabilities. fastai orders classes by the sorted label
# strings ('class-0', 'class-1', 'class-10', ...), not numerically, so the
# positions are read from the learner's vocabulary rather than from a
# numerically sorted list.
# The +1 offset is kept from the original code (presumably it skips the
# leading id column of the submission file -- confirm).
target_indexs_f = [int(str(name).split('-')[-1]) + 1 for name in learn.dls.vocab]
print(len(target_indexs_f))

In [None]:
# sanity check: how many submission columns will be filled per row
sub.iloc[0, target_indexs_f].shape[0]

In [None]:
%%time

# Score the whole test set in one batched pass instead of calling
# `learn.predict` once per row. `test_dl` applies the training preprocessing
# to the new rows and `get_preds` returns one row of class probabilities per
# test row, in the same order as `test`.
test_dl = learn.dls.test_dl(test)
all_probs, _ = learn.get_preds(dl=test_dl)
# Rows of `sub` are assumed to line up with rows of `test`, as the original
# per-row loop also assumed.
sub.iloc[:, target_indexs_f] = all_probs.numpy()
# Keep `probs` bound to a single row's class probabilities (the last row),
# as the per-row loop left it, so later cells that inspect `probs` still work.
probs = all_probs[-1]

In [None]:
# write the filled-in submission (without the index) and preview it
sub.to_csv(path_or_buf="submission.csv", index=False)
sub.head(n=5)

In [None]:
# number of class probabilities the model emits for one row
probs.shape[0]