<a href="https://colab.research.google.com/github/prithvijaunjale/Machine-Learning/blob/master/multivariate_classification/mvc_fastai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from fastai.tabular import *
import pandas as pd
import numpy as np
import os
from collections import Counter

In [None]:
!unzip doctor_train.csv.zip

Archive:  doctor_train.csv.zip
  inflating: doctor_train.csv        


In [None]:
# Current working directory; passed as `path` to the fastai TabularList calls in this notebook.
path = os.getcwd()

# Read the training CSV, remove the columns that are dropped before modelling,
# and discard any row that still contains a missing value.
dropped_columns = ['ID', 'Doctor_visits', 'day']
df = (
    pd.read_csv('doctor_train.csv')
      .drop(columns=dropped_columns)
      .dropna()
)

In [None]:
# Target column and the columns treated as categorical features.
dep_var = 'Y'
cat_names = ['Profession', 'Status', 'edu', 'communication', 'Month', 'side_effects']

# Continuous features: every numeric column that is neither categorical nor the target.
# The filter has to be a membership test (`not in cat_names`); a bare `not cat_names`
# is always False for a non-empty list and leaves this list empty.
# NOTE(review): non-numeric columns that are not listed in cat_names are excluded from
# both groups by the dtype check -- add them to cat_names if the model should use them.
cont_names = [
    col for col in df.columns
    if col not in cat_names
    and col != dep_var
    and pd.api.types.is_numeric_dtype(df[col])
]

# Preprocessing transforms handed to TabularList as `procs`.
procs = [FillMissing, Categorify, Normalize]

In [None]:
# Positional split with no shuffling: the last 20% of the rows become test_df,
# everything before them becomes train_df.
n_rows = len(df)
split_idx = n_rows - int(n_rows * 0.2)
train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

In [None]:
# Class counts of the target column in the training rows.
train_df.loc[:, 'Y'].value_counts()

no     18065
yes     2573
Name: Y, dtype: int64

In [None]:
# Switch for random oversampling of the minority class in the training rows.
OVERSAMPLE = False

if OVERSAMPLE:
    # imbalanced-learn is only required when the switch is on, so import it lazily.
    from imblearn.over_sampling import RandomOverSampler

    feature_cols = [col for col in train_df.columns if col != 'Y']
    # Round-tripping through `.values` and a fresh DataFrame loses the per-column
    # dtypes (mixed columns collapse to object), so record them and restore them below.
    feature_dtypes = train_df[feature_cols].dtypes.to_dict()

    X_train = train_df[feature_cols].values
    y_train = train_df['Y'].values

    oversample = RandomOverSampler(sampling_strategy='minority', random_state=0)
    X_over, y_over = oversample.fit_resample(X_train, y_train)

    train_df = pd.DataFrame(X_over, columns=feature_cols).astype(feature_dtypes)
    train_df['Y'] = y_over
    # NOTE(review): the validation indices computed later in this notebook take the tail
    # rows of train_df; confirm where the resampler places the duplicated rows before
    # trusting validation metrics when OVERSAMPLE is enabled.

In [None]:
# Class counts of the target column after the optional oversampling step.
train_df.loc[:, 'Y'].value_counts()

no     18065
yes     2573
Name: Y, dtype: int64

In [None]:
# Validation rows are the final 15% of train_df, addressed by position.
n_train = len(train_df)
valid_start = n_train - int(n_train * 0.15)
valid_idx = range(valid_start, n_train)

train_df.shape, valid_idx, test_df.shape

((20638, 15), range(17543, 20638), (5159, 15))

In [None]:
# Load doctor_test.csv and drop the same three columns that are removed from the training data.
test = pd.read_csv('doctor_test.csv').drop(columns=['ID', 'Doctor_visits', 'day'])
test.head()

Unnamed: 0,age,Profession,Status,edu,Irregular,Money,residence,prev_diagnosed,communication,Month,Time,last_visit,cured_in,side_effects
0,42.0,blue-collar,married,primary,no,188.0,yes,no,unknown,may,53,-1,0,unknown
1,37.0,management,married,tertiary,no,2283.0,no,no,cellular,aug,303,92,4,success
2,27.0,blue-collar,married,secondary,no,1341.0,yes,yes,unknown,may,254,-1,0,unknown
3,56.0,admin.,single,secondary,no,2815.0,yes,no,cellular,sep,212,-1,0,unknown
4,32.0,self-employed,single,secondary,no,123.0,yes,no,unknown,may,278,-1,0,unknown


In [None]:
# Item lists for the training rows (with preprocessing) and for the rows loaded from doctor_test.csv.
train_list = TabularList.from_df(train_df, path=path, cat_names=cat_names,
                                 cont_names=cont_names, procs=procs)
test_list = TabularList.from_df(test, path=path, cat_names=cat_names, cont_names=cont_names)

# Split off the validation rows, attach labels from the target column,
# add the test items, and wrap everything in a DataBunch.
data = (
    train_list
    .split_by_idx(valid_idx)
    .label_from_df(cols=dep_var)
    .add_test(test_list)
    .databunch()
)

In [None]:
# Display five rows from a batch of the DataBunch to check which columns reach the model.
data.show_batch(rows=5)

Profession,Status,edu,communication,Month,side_effects,target
technician,married,secondary,telephone,jun,unknown,yes
services,married,secondary,unknown,sep,unknown,no
services,divorced,secondary,unknown,may,unknown,no
technician,married,tertiary,cellular,jan,failure,no
entrepreneur,single,tertiary,cellular,jan,failure,no


In [None]:
# Tabular learner with two hidden layers, reporting accuracy, trained for a fixed
# number of epochs at a fixed learning rate.
hidden_sizes = [200, 100]
n_epochs, learning_rate = 10, 1e-2

learn = tabular_learner(data, layers=hidden_sizes, metrics=accuracy)
learn.fit(n_epochs, learning_rate)

epoch,train_loss,valid_loss,accuracy,time
0,0.331915,0.326155,0.884653,00:03
1,0.324771,0.323387,0.889176,00:03
2,0.32633,0.323332,0.885622,00:03
3,0.316403,0.317339,0.882391,00:03
4,0.311375,0.318585,0.886914,00:03
5,0.307118,0.315017,0.886914,00:03
6,0.324547,0.31499,0.887884,00:03
7,0.314525,0.315043,0.885945,00:03
8,0.304511,0.31516,0.882714,00:03
9,0.316794,0.314309,0.883037,00:03


In [None]:
from sklearn.metrics import confusion_matrix

# Evaluate on the held-out labelled rows (test_df). The test set attached when the
# DataBunch was built comes from doctor_test.csv, so test_df is attached explicitly
# here; otherwise the predictions and the labels would come from different datasets.
learn.data.add_test(TabularList.from_df(test_df, path=path, cat_names=cat_names, cont_names=cont_names))
preds, y = learn.get_preds(ds_type=DatasetType.Test)

# Predicted class index for each row.
y_pred = preds.argmax(dim=1).tolist()

# Encode the true labels with the learner's own class order rather than a hard-coded mapping.
class_to_idx = {label: idx for idx, label in enumerate(learn.data.classes)}
y_true = test_df['Y'].map(class_to_idx).tolist()

confusion_matrix(y_true, y_pred)

array([[4464,   88],
       [ 491,  116]])

In [None]:
# Build the submission from the trained learner's predictions on `test`
# (the doctor_test.csv rows loaded earlier in this notebook).
# The test set is attached explicitly so this cell does not depend on whichever
# test set the learner's DataBunch currently holds.
learn.data.add_test(TabularList.from_df(test, path=path, cat_names=cat_names, cont_names=cont_names))
test_probs = learn.get_preds(ds_type=DatasetType.Test)[0]

# Label a row 'yes' when the predicted probability of the 'yes' class is at least 0.5.
yes_idx = learn.data.classes.index('yes')
is_yes = (test_probs[:, yes_idx] >= 0.5).tolist()

submission = pd.DataFrame({
    'ID': range(len(test)),
    'Y': ['yes' if flag else 'no' for flag in is_yes],
})
submission.head(50)