# Otto group product classification challenge

In [17]:
# read in the train and test data files 
import pandas as pd
import torch
import numpy
from torch import nn, optim
train = pd.read_csv("D://Pytorch//data//Otto Group//train.csv")
tobescored = pd.read_csv("D://Pytorch//data//Otto Group//test.csv")
train.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [2]:
# drop the id 
id_col = 'id'
train_data = train.drop(id_col, axis=1)
# convert target string to numeric
new= train_data["target"].str.split("_", n = 1, expand = True)
#drop the target column
train_data.drop(columns =["target"], inplace = True) 
train_data["target"] = new[1]
train_data["target"]=train_data["target"].astype('float64')

# get split of different classes in target 
train_data.target.value_counts()

2.0    16122
6.0    14135
8.0     8464
3.0     8004
9.0     4955
7.0     2839
5.0     2739
4.0     2691
1.0     1929
Name: target, dtype: int64

In [3]:
# split train into validation and training
from sklearn.model_selection import train_test_split
y = train_data.target
X = train_data.drop('target', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [4]:
# now using pytorch make dataset class for ottogroup
from torch.utils.data import Dataset, DataLoader

class ottogroup(Dataset):
    def __init__(self,data_x,data_y = None):
        self.x = data_x
        self.y = data_y
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self,index):
        data = self.x.iloc[index]
        data = torch.from_numpy(numpy.asarray(data))
        if self.y is not None:
            return data, torch.tensor(self.y.iloc[index])
        else:
            return data
        
train_dataset = ottogroup(X_train,y_train)
valid_dataset = ottogroup(X_test,y_test)

In [6]:
# create a dataloader object to feed into the 
training_loader = DataLoader(dataset = train_dataset,
                     shuffle=True,
                     batch_size= 64)
validation_loader = DataLoader(dataset=valid_dataset,
                       shuffle = False,
                       batch_size = 64)


In [7]:
# define the nn model 
class model(nn.Module):
    def __init__(self):
        super(model,self).__init__()
        self.l1 = nn.Linear(93,64)
        self.l2 = nn.Linear(64,32)
        self.l3 = nn.Linear(32,16)
        self.l4 = nn.Linear(16,9)
        
    def forward(self,x):
        o1 = nn.functional.relu(self.l1(x))
        o2 = nn.functional.relu(self.l2(o1))
        o3 = nn.functional.relu(self.l3(o2))
        o4 = nn.functional.relu(self.l4(o3))
        return(o4)

In [8]:
# Call model object, loss criterion and optimizer

otto_model = model()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params= otto_model.parameters(),
                     lr = 0.01,
                     momentum= 0.5)


In [9]:
# define the training function 
def train(loader,epoch):
    otto_model.train()
    for i , (inp,target) in enumerate(loader):
        inp,target = torch.autograd.Variable(inp).float(),torch.autograd.Variable(target).type(torch.LongTensor)
        optimizer.zero_grad()
        pred = otto_model(inp)
        loss = criterion(pred,target-1)
        loss.backward()
        optimizer.step()
        if i % 200 == 0 :
            print("Train epoch: {} | Batch status: {}/{} ({:.6f}%) | Error: {:.6f} ".format(
            epoch,i*len(inp),len(loader.dataset),
                100.*i / len(loader),loss.item()
            ))

In [10]:
def test(epoch,loader):
    otto_model.eval()
    correct = 0
    total_error = 0
    for data,target in loader:
        data,target = torch.autograd.Variable(data).float(), torch.autograd.Variable(target).type(torch.LongTensor)
        out = otto_model(data)
        # sum up the total loss 
        loss = criterion(out,target-1)
        total_error = total_error + loss.item()
        # get index of the max_log_probability 
        pred = torch.max(out.data,1)[1]
        correct = correct + pred.eq((target-1).data.view_as(pred)).cpu().sum()
        
    total_error = total_error/len(loader.dataset)
    
    print(f'===========================\nTest set: Average loss: {total_error:.4f}, Accuracy: {correct}/{len(loader.dataset)}'
          f'({100. * correct / len(loader.dataset):.0f}%)')

In [77]:
# train model to check performance
for epoch in range(10):
    train(epoch = epoch,loader = training_loader)
    test(epoch = epoch,loader = validation_loader)

Train epoch: 0 | Batch status: 0/49502 (0.000000%) | Error: 0.669132 
Train epoch: 0 | Batch status: 12800/49502 (0.404024%) | Error: 0.604915 
Train epoch: 0 | Batch status: 25600/49502 (0.808048%) | Error: 0.637512 
Train epoch: 0 | Batch status: 38400/49502 (1.212072%) | Error: 0.520309 
Test set: Average loss: 0.0111, Accuracy: 9785/12376(79%)
Train epoch: 1 | Batch status: 0/49502 (0.000000%) | Error: 0.548592 
Train epoch: 1 | Batch status: 12800/49502 (0.404024%) | Error: 0.462640 
Train epoch: 1 | Batch status: 25600/49502 (0.808048%) | Error: 0.604782 
Train epoch: 1 | Batch status: 38400/49502 (1.212072%) | Error: 0.488063 
Test set: Average loss: 0.0113, Accuracy: 9723/12376(79%)
Train epoch: 2 | Batch status: 0/49502 (0.000000%) | Error: 0.607159 
Train epoch: 2 | Batch status: 12800/49502 (0.404024%) | Error: 0.623868 
Train epoch: 2 | Batch status: 25600/49502 (0.808048%) | Error: 0.506175 
Train epoch: 2 | Batch status: 38400/49502 (1.212072%) | Error: 0.438252 
Test set

In [11]:
# train with the entire dataset 
y = train_data.target
x = train_data.drop('target', axis=1)

# make the data_loader
full_dataset = ottogroup(x,y)
full_dataloader = DataLoader(dataset=full_dataset,
                             batch_size=64,shuffle=True)

for i in range(10):
    train(epoch = i , loader = full_dataloader)


Train epoch: 0 | Batch status: 0/61878 (0.000000%) | Error: 2.223327 
Train epoch: 0 | Batch status: 12800/61878 (20.682523%) | Error: 1.982678 
Train epoch: 0 | Batch status: 25600/61878 (41.365047%) | Error: 1.472324 
Train epoch: 0 | Batch status: 38400/61878 (62.047570%) | Error: 1.531376 
Train epoch: 0 | Batch status: 51200/61878 (82.730093%) | Error: 1.416657 
Train epoch: 1 | Batch status: 0/61878 (0.000000%) | Error: 1.387109 
Train epoch: 1 | Batch status: 12800/61878 (20.682523%) | Error: 1.292746 
Train epoch: 1 | Batch status: 25600/61878 (41.365047%) | Error: 1.249649 
Train epoch: 1 | Batch status: 38400/61878 (62.047570%) | Error: 1.309188 
Train epoch: 1 | Batch status: 51200/61878 (82.730093%) | Error: 1.146706 
Train epoch: 2 | Batch status: 0/61878 (0.000000%) | Error: 0.876760 
Train epoch: 2 | Batch status: 12800/61878 (20.682523%) | Error: 0.676456 
Train epoch: 2 | Batch status: 25600/61878 (41.365047%) | Error: 0.639554 
Train epoch: 2 | Batch status: 38400/618

In [209]:
# make predictions for the test dataset 
# drop the id 
id_col = tobescored['id']
tobescored_torch = torch.from_numpy(tobescored.values)
colnames = ['id','Class_1','Class_2','Class_3','Class_4','Class_5',
             'Class_6','Class_7','Class_8','Class_9']
result = pd.DataFrame(0,columns=colnames,index=numpy.arange(tobescored.shape[0]))
result['id'] = tobescored['id']
result1 = result.copy() 

In [221]:
# make predictions 
result1 = result 
for i in range(tobescored.shape[0]):
    output = otto_model(tobescored_torch[i,1:].float())
    pred = numpy.around(torch.nn.functional.softmax(output,dim = -1).data.numpy(),decimals = 1)
    prod_type = torch.nn.functional.softmax(output,dim = -1).data.numpy().argmax()
    result.iat[i,prod_type+1] = 1
    result1.loc[i,1:] = pred
    if(i % 10000 == 0 ):
        print(i)
    
    

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000


In [222]:
result1.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,1,0.0,0.4,0.4,0.2,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.6,0.0
2,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4,0.0,0.4,0.6,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4


In [223]:
result.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
0,1,0.0,0.4,0.4,0.2,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.6,0.0
2,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4,0.0,0.4,0.6,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4


In [224]:
result.to_csv (r'D:\Pytorch\data\Otto Group\final_result.csv', index = None, header=True)
result1.to_csv (r'D:\Pytorch\data\Otto Group\final_result_prob.csv', index = None, header=True)