In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
#base dataset
data = pd.read_csv("data-final.csv")
#remove entries at IP addresses from which multiple responses were sent
data = data[data['IPC'] == 1].dropna()
#remove country-less entries
data = data[data['country'] != 'NONE']
#responses only
R = data.loc[:,'EXT1':'OPN10']
R = R.astype('int')
for col in R.columns:
    R = R[R[col] != 0]
    data = data[data[col] != 0]
#segments
EXT = R.loc[:,'EXT1':'EXT10']
EST = R.loc[:,'EST1':'EST10']
AGR = R.loc[:,'AGR1':'AGR10']
CSN = R.loc[:,'CSN1':'CSN10']
OPN = R.loc[:,'OPN1':'OPN10']
print(R.shape)

(594453, 50)


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
batch_size = 32
input_dim = 49
output_dim = 5

class Data(Dataset):
    def __init__(self,X_train,y_train):
        self.X = torch.from_numpy(X_train.astype(np.float32))
        self.y = torch.from_numpy(y_train).type(torch.LongTensor)
        self.len = self.X.shape[0]
    
    def __getitem__(self, index):
        return self.X[index],self.y[index]
    
    def __len__(self):
        return self.len

class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.l1 = nn.Linear(input_dim,50)
        self.l2 = nn.Linear(50,50)
        self.l3 = nn.Linear(50,output_dim)
    
    def forward(self,x):
        x = torch.sigmoid(self.l1(x))
        x = torch.sigmoid(self.l2(x))
        x = torch.softmax(self.l3(x),1)
        return x

for target in ['EXT7','EST1','AGR8','CSN3','OPN7']:#R.columns
    print(f"predicting {target}")
    preds = R.copy()
    targ = preds.loc[:,target]
    preds = preds.drop(columns=target)
    X,y = preds.to_numpy(),targ.to_numpy()
    for i in range(len(y)):
        y[i] = y[i]-1
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=0)

    traindata = Data(X_train,y_train)
    testdata = Data(X_test,y_test)
    trainloader = DataLoader(traindata,batch_size=batch_size,shuffle=True)
    testloader = DataLoader(testdata,batch_size=batch_size,shuffle=True)
    clf = Net()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(clf.parameters(),lr=3e-3)

    epochs = 5
    for epoch in range(epochs):
        running_loss = 0.0
        for i,data in enumerate(trainloader,0):
            inputs,labels = data
            optimizer.zero_grad()
            outputs=clf(inputs)
            loss=criterion(outputs,labels)
            loss.backward()
            optimizer.step()
            running_loss+=loss.item()
        print(f'[{epoch + 1}] Training loss: {running_loss/len(X_train):.5f}')
    
    correct,total = 0,0
    with torch.no_grad():
        for data in testloader:
            inputs,labels=data
            outputs=clf(inputs)
            _,predicted = torch.max(outputs.data,1)
            total+=labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'Accuracy of the network on the {len(testdata)} test data: {100* correct // total:.2f} %')


predicting EXT7
[1] Training loss: 0.04974
[2] Training loss: 0.04958
[3] Training loss: 0.04919
[4] Training loss: 0.04773
[5] Training loss: 0.04655
Accuracy of the network on the 89168 test data: 41.00 %
predicting EST1
[1] Training loss: 0.04953
[2] Training loss: 0.04940
[3] Training loss: 0.04934
[4] Training loss: 0.04917
[5] Training loss: 0.04858
Accuracy of the network on the 89168 test data: 34.00 %
predicting AGR8
[1] Training loss: 0.04681
[2] Training loss: 0.04661
[3] Training loss: 0.04659
[4] Training loss: 0.04658
[5] Training loss: 0.04657
Accuracy of the network on the 89168 test data: 40.00 %
predicting CSN3
[1] Training loss: 0.04606
[2] Training loss: 0.04578
[3] Training loss: 0.04573
[4] Training loss: 0.04564
[5] Training loss: 0.04544
Accuracy of the network on the 89168 test data: 45.00 %
predicting OPN7
[1] Training loss: 0.04569
[2] Training loss: 0.04539
[3] Training loss: 0.04527
[4] Training loss: 0.04495
[5] Training loss: 0.04431
Accuracy of the netwo

In [None]:
# results paste
# predicting EXT7
# [1] Training loss: 0.04974
# [2] Training loss: 0.04958
# [3] Training loss: 0.04919
# [4] Training loss: 0.04773
# [5] Training loss: 0.04655
# Accuracy of the network on the 89168 test data: 41.00 %
# predicting EST1
# [1] Training loss: 0.04953
# [2] Training loss: 0.04940
# [3] Training loss: 0.04934
# [4] Training loss: 0.04917
# [5] Training loss: 0.04858
# Accuracy of the network on the 89168 test data: 34.00 %
# predicting AGR8
# [1] Training loss: 0.04681
# [2] Training loss: 0.04661
# [3] Training loss: 0.04659
# [4] Training loss: 0.04658
# [5] Training loss: 0.04657
# Accuracy of the network on the 89168 test data: 40.00 %
# predicting CSN3
# [1] Training loss: 0.04606
# [2] Training loss: 0.04578
# [3] Training loss: 0.04573
# [4] Training loss: 0.04564
# [5] Training loss: 0.04544
# Accuracy of the network on the 89168 test data: 45.00 %
# predicting OPN7
# [1] Training loss: 0.04569
# [2] Training loss: 0.04539
# [3] Training loss: 0.04527
# [4] Training loss: 0.04495
# [5] Training loss: 0.04431
# Accuracy of the network on the 89168 test data: 49.00 %