![550,500](https://images.pexels.com/photos/149387/pexels-photo-149387.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=800)

Normally, for a tabular dataset I would use decision trees, random forests, or XGBoost for a classification problem. However, for this problem I decided to build a neural network model using PyTorch Lightning, because I wanted to get out of my comfort zone while making this project. In case you want an explanation of the model-building process, I'd recommend checking out the fast.ai course; they do a better job of explaining it than I do.

Also, if you want to see the data visualised, here's my [EDA](https://www.kaggle.com/aristotle609/eda-on-hr-dataset), since I won't be doing any EDA in this notebook.

# Dependencies

In [None]:
#importing all the required dependencies
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from datetime import datetime
import pytorch_lightning as pl

# Preprocessing Data

In [None]:
# Load the HR dataset, report its dimensions and preview the first rows
data_path = "../input/hr-analytics-and-job-prediction/HR_comma_sep.csv"
df = pd.read_csv(data_path)
print("Shape:", df.shape)
df.head()

In [None]:
# Target column, followed by the feature columns grouped by how they are fed
# to the model (categorical -> integer codes, numerical -> float values)
label = "left"
cat_cols = [
    'Work_accident',
    'number_project',
    'promotion_last_5years',
    'Department',
    'salary',
]
num_cols = [
    'satisfaction_level',
    'last_evaluation',
    'average_montly_hours',
    'time_spend_company',
]
n_cat_cols = len(cat_cols)
n_num_cols = len(num_cols)
print("Num of Categorical columns : ", n_cat_cols)
print("Number of numerical columns : ", n_num_cols)

In [None]:
# Hold out a stratified test set first, then carve a stratified validation set
# out of the remaining rows (val_size is a fraction of what is left after the
# test split, not of the full dataset)
test_size = 0.1
val_size = 0.3
random_state = 42

df_remaining, df_test = train_test_split(
    df,
    test_size=test_size,
    random_state=random_state,
    stratify=df[label],
)
df_train, df_val = train_test_split(
    df_remaining,
    test_size=val_size,
    random_state=random_state,
    stratify=df_remaining[label],
)

for caption, frame in (
    ("Shape:", df),
    ("Shape of train:", df_train),
    ("Shape of test:", df_test),
    ("Shape of validation:", df_val),
):
    print(caption, frame.shape)

# Numericalizing Categorical Columns

In [None]:
# For every categorical column, build a lookup from category value to its
# integer code (codes follow the order of pandas' `cat.categories`).
# The categories are taken from the full dataframe, before any split.
cat_code_dict = {
    col: {
        value: idx
        for idx, value in enumerate(df[col].astype('category').cat.categories)
    }
    for col in cat_cols
}
cat_code_dict

In [None]:
# NOTE: no feature scaling is applied here; this assumes the numerical columns
# are already on comparable scales -- TODO confirm against the raw data.
def preprocess(df,cat_code_dict,num_cols,cat_cols,label_col):
    """
    Return a copy of ``df`` whose columns have model-ready dtypes.

    df : DataFrame to convert (it is not modified; a new frame is returned),
    cat_code_dict : maps each categorical column name to a
                    {category value: integer code} dictionary,
    num_cols : the numerical columns, cast to float32,
    cat_cols : the categorical columns, replaced by their int64 codes,
    label_col : the target column, cast to int64

    Raises ValueError when a categorical column contains a value that has no
    code in ``cat_code_dict`` (including missing values).
    """
    # astype with a column->dtype mapping returns a new frame, so the caller's
    # dataframe is never touched and the float32 dtype is applied explicitly.
    df = df.astype({col: np.float32 for col in num_cols})

    for col in cat_cols:
        codes = df[col].map(cat_code_dict[col])
        # map() yields NaN for values without a code; report them by name
        # instead of failing later with an opaque NaN-to-integer cast error.
        missing = codes.isna()
        if missing.any():
            unknown = df.loc[missing, col].unique().tolist()
            raise ValueError(
                f"Column {col!r} contains values with no code in cat_code_dict: {unknown}"
            )
        df[col] = codes.astype(np.int64)

    # Cast the label once, outside the loop, so it is also converted when
    # there are no categorical columns at all.
    df[label_col] = df[label_col].astype(np.int64)
    return df

In [None]:
# Preview: run the preprocessing on the full dataframe; the result is only
# displayed as the cell output and is not stored anywhere
preprocess(
    df,
    cat_code_dict=cat_code_dict,
    num_cols=num_cols,
    cat_cols=cat_cols,
    label_col=label,
)

In [None]:
# Apply the same preprocessing to each split, then show the three results
df_train, df_test, df_val = (
    preprocess(frame, cat_code_dict, num_cols, cat_cols, label)
    for frame in (df_train, df_test, df_val)
)
display(df_train, df_test, df_val)


In [None]:
# Map-style dataset that serves one dataframe row per index
class TabularDataset(Dataset):
    """
    Serves (numerical values, categorical values, label) for each row of a dataframe.

    The column values are copied into NumPy arrays once, at construction, so
    fetching an item is a plain array lookup rather than a DataFrame column
    selection repeated for every sample. Changes made to ``df`` after the
    dataset has been created are therefore not picked up.
    """
    def __init__(self,df,num_cols,cat_cols,label):
        """
        df: Dataframe passed,
        num_cols : Numerical Columns (list of column names),
        cat_cols : Categorical_columns (list of column names),
        label : target column (single column name)
        """
        self.df = df
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        self.label = label
        # copy=True gives writable arrays that do not share memory with the
        # frame. Columns of differing dtypes are converted by pandas to one
        # common dtype per array.
        self._num_values = df[num_cols].to_numpy(copy=True)
        self._cat_values = df[cat_cols].to_numpy(copy=True)
        self._label_values = df[label].to_numpy(copy=True)

    def __len__(self):
        """Number of rows captured at construction time."""
        return len(self._label_values)

    def __getitem__(self,idx):
        """Return (numerical array, categorical array, label) for position ``idx``."""
        return self._num_values[idx], self._cat_values[idx], self._label_values[idx]

In [None]:
# Sanity check: wrap the training frame in the dataset and pull one shuffled
# single-row batch through a DataLoader
dataset = TabularDataset(df_train, num_cols=num_cols, cat_cols=cat_cols, label=label)
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)
first_batch = next(iter(dataloader))
first_batch

# Pytorch Lightning

In [None]:
# PyTorch Lightning DataModule bundling the train/validation/test dataframes
# together with the DataLoaders built from them
class TabularDatsetModule(pl.LightningDataModule):
    """
    Wraps the three dataframes in TabularDataset objects and hands out one
    DataLoader per split.
    """
    def __init__(self,df_train,df_test,df_val,num_cols,cat_cols,label,test_batch = 64,train_batch = 64,val_batch = 64):
        """
        df_train : Train Dataframe
        df_test:test DataFrame
        df_val : Validation Dataframe
        num_cols : Numerical Columns
        cat_cols : Categorical Columns
        label : target column
        test_batch : batch size of the test DataLoader
        train_batch : batch size of the train DataLoader
        val_batch : batch size of the validation DataLoader
        """
        super().__init__()
        self.train = df_train
        self.test = df_test
        self.val = df_val
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        self.label = label
        self.test_batch = test_batch
        self.train_batch = train_batch
        self.val_batch = val_batch

    def setup(self,stage = None):
        """
        Build one TabularDataset per split.

        stage : accepted for the Lightning hook signature; it is not used,
                all three datasets are built regardless of its value.
        """
        # NOTE: despite their names these attributes hold datasets, not
        # DataLoaders; the names are kept so existing references keep working.
        self.train_loader = TabularDataset(self.train,self.num_cols,self.cat_cols,self.label)
        self.test_loader = TabularDataset(self.test,self.num_cols,self.cat_cols,self.label)
        self.val_loader = TabularDataset(self.val,self.num_cols,self.cat_cols,self.label)

    # These return the data to the neural network --->
    def train_dataloader(self):
        """DataLoader over the training dataset, reshuffled on every pass."""
        return DataLoader(self.train_loader,batch_size = self.train_batch,shuffle = True)

    def test_dataloader(self):
        """DataLoader over the test dataset, in a fixed order."""
        # Evaluation does not benefit from shuffling; a fixed order keeps
        # runs comparable.
        return DataLoader(self.test_loader,batch_size = self.test_batch,shuffle = False)

    def val_dataloader(self):
        """DataLoader over the validation dataset, in a fixed order."""
        return DataLoader(self.val_loader,batch_size = self.val_batch,shuffle = False)
    

In [None]:
#Making the neural network
class TabularNet(pl.LightningModule):
    """
    Feed-forward classifier for tabular data.

    Each categorical column gets its own embedding layer. The embeddings are
    concatenated with the numerical inputs and passed through three linear
    layers with ReLU activations in between. All steps use cross-entropy loss.
    """
    def __init__(self,num_cols,cat_cols,embedding_size_dict,n_classes,embedding_dim_dict = None):
        """
        num_cols: A list of the numerical columns
        cat_cols: A list of cat_cols,
        embedding_size_dict :  A dictionary of the columns and the number of categories,
        n_classes = number of classes,
        embedding_dim_dict: A dictionary of the columns and the dimensions of the output embedding;
                            when None, every column uses half its number of categories (at least 1)
        """
        super().__init__()
        # Keep our own copy so forward() does not depend on a module-level
        # variable that happens to be called `cat_cols`.
        self.cat_cols = list(cat_cols)
        if embedding_dim_dict is None:
            embedding_dim_dict = {
                col: max(1, embedding_size_dict[col] // 2) for col in self.cat_cols
            }
        self.embeddings , total_embeddings_dim = self._create_embedding_layers(self.cat_cols,embedding_size_dict,embedding_dim_dict)
        in_features = len(num_cols) + total_embeddings_dim
        self.layers = nn.Sequential(
            nn.Linear(in_features,128),
            nn.ReLU(),
            nn.Linear(128,100),
            nn.ReLU(),
            nn.Linear(100,n_classes),
        )

    @staticmethod
    def _create_embedding_layers(cat_cols,embedding_size_dict,embedding_dim_dict):
        """
        Build one nn.Embedding per categorical column.

        Returns (nn.ModuleDict keyed by column name, sum of all embedding dimensions).
        """
        total_embedding_dim = 0
        embeddings = {}
        for col in cat_cols:
            embedding_size = embedding_size_dict[col]
            embedding_dim = embedding_dim_dict[col]
            total_embedding_dim += embedding_dim
            embeddings[col] = nn.Embedding(embedding_size,embedding_dim)

        return nn.ModuleDict(embeddings),total_embedding_dim

    def forward(self,num_tensor,cat_tensor):
        """
        num_tensor : numerical inputs, concatenated as-is along dim 1,
        cat_tensor : categorical codes; column i is looked up in the embedding
                     of the i-th entry of the `cat_cols` given at construction
        """
        # Numerical inputs come first, then the embeddings in column order.
        features = [num_tensor]
        for i,col in enumerate(self.cat_cols):
            features.append(self.embeddings[col](cat_tensor[:,i]))
        all_outputs = torch.cat(features,dim = 1)
        # squeeze only has an effect when the last dimension is 1 (n_classes == 1)
        final_output = self.layers(all_outputs).squeeze(dim=-1)
        return final_output

    def configure_optimizers(self):
        """Adam over all parameters with a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def _shared_step(self,batch):
        """Unpack a (numerical, categorical, target) batch and return its cross-entropy loss."""
        num_tensor,cat_tensor,target = batch
        y_pred = self(num_tensor,cat_tensor)
        return F.cross_entropy(y_pred,target)

    def training_step(self,batch,batch_idx):
        """Loss for one training batch."""
        return self._shared_step(batch)

    def validation_step(self,batch,batch_idx):
        """Loss for one validation batch, logged as "val_loss"."""
        # Lightning looks for a hook named `validation_step`; under the old
        # name `val_step` the validation loop was never run.
        loss = self._shared_step(batch)
        self.log("val_loss",loss)
        return loss

    # Backward-compatible alias for code that still calls the old name.
    val_step = validation_step

    def test_step(self,batch,batch_idx):
        """Loss for one test batch, logged under the key "Loss:"."""
        loss = self._shared_step(batch)
        self.log("Loss:",loss)
        return loss

In [None]:
# Determine the size of each embedding: one entry per category on the input
# side, and half that number (integer division) as the embedding dimension.
# Check out the fast.ai course to get a better understanding of this approach.
n_classes = 2
embedding_size_dict = {}
embedding_dim_dict = {}
for col, codes in cat_code_dict.items():
    n_categories = len(codes)
    embedding_size_dict[col] = n_categories
    embedding_dim_dict[col] = n_categories // 2
embedding_dim_dict

In [None]:
# Data module wrapping the three preprocessed splits
tabular_data_module = TabularDatsetModule(
    df_train=df_train,
    df_test=df_test,
    df_val=df_val,
    num_cols=num_cols,
    cat_cols=cat_cols,
    label=label,
)

# Build the network; leaving it as the last expression prints the architecture
tabular_model = TabularNet(
    num_cols=num_cols,
    cat_cols=cat_cols,
    embedding_size_dict=embedding_size_dict,
    n_classes=n_classes,
    embedding_dim_dict=embedding_dim_dict,
)
tabular_model

In [None]:
%%time
trainer = pl.Trainer(max_epochs=1000)#chose the epochs wisely on a kaggle server 1 epoch takes 17s 
trainer.fit(tabular_model, tabular_data_module)

In [None]:
# Run the test loop. No model or datamodule is passed here, so this presumably
# relies on the ones attached by the preceding trainer.fit call -- confirm
# which weights are evaluated for the installed Lightning version.
trainer.test()

Good Enough