In [None]:
# Import Libs
import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load dataset
# Read the training CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/doctor-fees/doctor_fees.csv')
# Preview the first rows as the cell output.
df.head()

In [None]:
# Data Engineering
# Derive model features from the raw text columns.
# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all punctuation in place. The raw-string
# prefix avoids the invalid "\w" escape-sequence warning.
# highest_qual: first whitespace token of Qualification after stripping punctuation.
df['highest_qual'] = [tokens[0] for tokens in df.Qualification.str.replace(r'[^\w\s]', '', regex=True).str.split()]
# exp_yr: first token of Experience, parsed as an integer.
df['exp_yr'] = [int(tokens[0]) for tokens in df.Experience.str.replace(r'[^\w\s]', '', regex=True).str.split()]
# region: last comma-separated piece of Place; a missing Place (NaN, a float) becomes 'Unknown'.
df['region'] = ['Unknown' if isinstance(parts, float) else parts[-1] for parts in df.Place.str.split(pat=',')]
# rte: the text before '%' in Rating as a float; a missing Rating stays missing.
df['rte'] = [None if isinstance(parts, float) else float(parts[0]) for parts in df.Rating.str.split('%')]

In [None]:
# Assign the average observed rating wherever the rating is missing
mean_rte = df['rte'].mean(skipna=True)
df['rte'] = df['rte'].fillna(mean_rte)

In [None]:
# Keep only the engineered features and the target column.
# `.copy()` makes the result an independent frame, so the column assignments
# performed later do not raise pandas' SettingWithCopyWarning.
df = df[['highest_qual', 'region', 'Profile', 'exp_yr', 'rte', 'Fees']].copy()
df.head()

In [None]:
# Summary statistics of the target column
df['Fees'].describe()

In [None]:
# List the columns currently in the frame
df.columns

In [None]:
# Separate categorical from continuous columns
cat_cols = ['highest_qual', 'region', 'Profile']  # cast to category dtype and encoded as integer codes below
cont_cols = ['exp_yr', 'rte']  # numeric inputs, stacked into a float tensor below
y_col = ['Fees']  # this column contains the labels

In [None]:
# Cast every categorical feature column to pandas' category dtype in one pass.
df = df.astype({name: 'category' for name in cat_cols})

In [None]:
# Show the dtype of every column
df.dtypes

In [None]:
# Integer codes of each categorical column, in the same order as cat_cols
highest_qual, region, Profile = (
    df[name].cat.codes.values for name in ('highest_qual', 'region', 'Profile')
)

# One row per record, one column per categorical feature
cats = np.stack((highest_qual, region, Profile), axis=1)

cats[:5]

In [None]:
# Turn the integer category codes into a 64-bit integer tensor.
# torch.tensor() copies its input, which is appropriate here because
# `cats` is still a NumPy array at this point rather than an existing tensor.
cats = torch.tensor(cats, dtype=torch.long)

cats[:5]

In [None]:
# Build a float32 tensor of the continuous features, one column per entry in cont_cols
conts = torch.tensor(
    np.stack([df[name].values for name in cont_cols], axis=1),
    dtype=torch.float32,
)
conts[:5]

In [None]:
# Labels as a float32 tensor reshaped to a single column
label_values = df[y_col].values
y = torch.tensor(label_values, dtype=torch.float32).reshape(-1, 1)

y[:5]

In [None]:
# Print the shape of each input tensor and of the label tensor
for tensor in (cats, conts, y):
    print(tensor.shape)

In [None]:
# One (n_categories, embedding_dim) pair per categorical column;
# embedding_dim is half the category count, rounded up, and capped at 50.
cat_szs = [len(df[name].cat.categories) for name in cat_cols]
emb_szs = [(n_cat, min(50, (n_cat + 1) // 2)) for n_cat in cat_szs]
emb_szs

In [None]:
# Define a TabularModel
class TabularModel(nn.Module):
    """Feed-forward network for mixed categorical / continuous tabular inputs.

    Each categorical column gets its own embedding table. The embedded
    categoricals (after dropout) are concatenated with the batch-normalised
    continuous features and passed through a stack of
    Linear -> ReLU -> BatchNorm1d -> Dropout blocks, followed by a final
    Linear layer.

    Args:
        emb_szs: iterable of (num_embeddings, embedding_dim) pairs, one per
            categorical column, in the column order used by ``x_cat``.
        n_cont: number of continuous input features.
        out_sz: number of output units.
        layers: widths of the hidden layers, in order. May be empty, in which
            case the network is a single Linear layer on the concatenated inputs.
        p: dropout probability used for the embeddings and after every hidden block.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the concatenated [embeddings | continuous] input vector.
        n_emb = sum(nf for _, nf in emb_szs)
        n_in = n_emb + n_cont

        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width
        # `n_in` is the width feeding the output layer: the last hidden width, or
        # the raw input width when `layers` is empty. Indexing `layers[-1]` here
        # would raise IndexError for an empty list.
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run a forward pass.

        Args:
            x_cat: integer tensor indexed as ``x_cat[:, i]`` for the i-th
                categorical column; values are embedding indices.
            x_cont: tensor of continuous features passed to ``BatchNorm1d(n_cont)``.

        Returns:
            Output of the final Linear layer for each input row.
        """
        # Look up each categorical column in its own embedding table.
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)]
        x = self.emb_drop(torch.cat(embedded, 1))

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

In [None]:
# Seed PyTorch's RNG before constructing the network, then build it with
# one output unit, hidden layers of 200 and 100 units, and dropout probability 0.4.
torch.manual_seed(33)
model = TabularModel(
    emb_szs=emb_szs,
    n_cont=conts.shape[1],
    out_sz=1,
    layers=[200, 100],
    p=0.4,
)

In [None]:
# Display the module structure of the network
model

In [None]:
# Loss function and optimiser: mean-squared error (its square root is taken
# wherever the loss is computed, giving RMSE) and Adam over all model parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Perform test/train splits
# NOTE(review): only the first `batch_size` rows (half of the frame, in file order)
# are used; rows from `batch_size` onward are neither trained nor tested on,
# and nothing is shuffled. Confirm this is intended.
batch_size = len(df) // 2
test_size = int(batch_size * .1)
split_at = batch_size - test_size  # first row of the held-out slice

cat_train, cat_test = cats[:split_at], cats[split_at:batch_size]
con_train, con_test = conts[:split_at], conts[split_at:batch_size]
y_train, y_test = y[:split_at], y[split_at:batch_size]

In [None]:
# Number of training rows, then number of test rows
print(len(cat_train), len(cat_test), sep='\n')

In [None]:
# train the model
import time

start_time = time.time()

epochs = 2000
losses = []

# Make sure dropout and batch-norm are in training mode, in case the model
# was left in eval mode by an earlier run of another cell.
model.train()

for i in range(1, epochs + 1):
    y_pred = model(cat_train, con_train)
    loss = torch.sqrt(criterion(y_pred, y_train)) # RMSE
    # Record a plain Python float. Appending the tensor itself would keep every
    # epoch's autograd graph alive and yields values that cannot be plotted
    # directly (they require grad).
    losses.append(loss.item())

    # a neat trick to save screen space:
    if i%100 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:
# plot the loss function
# float() accepts plain numbers as well as one-element tensors (including ones
# that require grad), so the curve can be drawn however the losses were stored.
loss_values = [float(value) for value in losses]
# Derive the x-axis from the recorded values so the two sequences always match in length.
plt.plot(range(len(loss_values)), loss_values)
plt.ylabel('RMSE Loss')
plt.xlabel('epoch');

In [None]:
# TO EVALUATE THE ENTIRE TEST SET
# Switch to evaluation mode first: otherwise dropout keeps randomly zeroing
# activations and batch-norm normalises with the test batch's own statistics,
# which makes the reported RMSE noisy and not comparable between runs.
model.eval()
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = torch.sqrt(criterion(y_val, y_test))
print(f'RMSE: {loss:.8f}')

In [None]:
# Side-by-side table of predictions, actual values and absolute differences
# for the first rows of the test set.
print(f'{"PREDICTED":>12} {"ACTUAL":>8} {"DIFF":>8}')
# Show at most 50 rows; the cap on the test-set length avoids an IndexError
# when the held-out slice is smaller than that.
for i in range(min(50, len(y_test))):
    predicted = y_val[i].item()
    actual = y_test[i].item()
    diff = np.abs(predicted - actual)
    print(f'{i+1:2}. {predicted:8.4f} {actual:8.4f} {diff:8.4f}')

In [None]:
# Make predictions
# Load dataset
df_eval = pd.read_csv('../input/doctor-fees/doctor_fees_eval.csv')
# Data Engineering (same derivations as for the training frame).
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default; the raw string avoids the invalid "\w" escape warning.
df_eval['highest_qual'] = [tokens[0] for tokens in df_eval.Qualification.str.replace(r'[^\w\s]', '', regex=True).str.split()]
df_eval['exp_yr'] = [int(tokens[0]) for tokens in df_eval.Experience.str.replace(r'[^\w\s]', '', regex=True).str.split()]
df_eval['region'] = ['Unknown' if isinstance(parts, float) else parts[-1] for parts in df_eval.Place.str.split(pat=',')]
df_eval['rte'] = [None if isinstance(parts, float) else float(parts[0]) for parts in df_eval.Rating.str.split('%')]
# Assign the training-set average rating where the rating is missing
df_eval['rte'] = df_eval['rte'].fillna(mean_rte)
# `.copy()` gives an independent frame so the assignments below do not raise
# pandas' SettingWithCopyWarning.
df_eval = df_eval[['highest_qual', 'region', 'Profile', 'exp_yr', 'rte']].copy()
# Convert our three categorical columns to category dtypes, re-using the
# categories of the training frame. The model's embedding rows are indexed by
# the training-time codes, so letting the evaluation file define its own
# categories would assign different (possibly out-of-range) codes to the same label.
for cat in cat_cols:
    train_categories = df[cat].cat.categories
    # Labels never seen in training have no embedding row; fall back to the
    # most frequent training label so every code stays a valid index.
    fallback = df[cat].mode()[0]
    known = df_eval[cat].isin(train_categories)
    df_eval[cat] = pd.Categorical(df_eval[cat].where(known, fallback), categories=train_categories)
df_eval.head()

In [None]:
# CREATE CAT AND CONT TENSORS
xcats = np.stack([df_eval[col].cat.codes.values for col in cat_cols], 1)
xcats = torch.tensor(xcats, dtype=torch.int64)
xconts = np.stack([df_eval[col].values for col in cont_cols], 1)
xconts = torch.tensor(xconts, dtype=torch.float)
# PASS NEW DATA THROUGH THE MODEL WITHOUT PERFORMING A BACKPROP
# Evaluation mode disables dropout and makes batch-norm use its running
# statistics; without it the predictions are randomly perturbed and depend on
# the composition of the evaluation batch.
model.eval()
with torch.no_grad():
    z = model(xcats, xconts)

In [None]:
# Write the predictions to submission.csv as a single integer 'Fees' column.
# int() truncates each prediction toward zero.
fees = [int(value) for value in z.reshape(-1).tolist()]
df_submit = pd.DataFrame({'Fees': fees})
df_submit.to_csv('submission.csv', index=None)