In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import sklearn
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
from torch import optim
import sklearn
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
import pandas as pd
import glob
import os

# Set the directory path
dir_path = r'Dataset'

# Get a list of all JSON files in the specified directory and its subdirectories
json_files = glob.glob(os.path.join(dir_path, '**', '*.json'), recursive=True)

# Create an empty dataframe to store the merged data
merged_df = pd.DataFrame()

# Loop through each file and read it into a pandas dataframe
for f in json_files:
    # Read the json file into a dataframe
    df = pd.read_json(f, orient='records', lines=True)[["pos", "spd"]]

    # Extract the label from the file prefix
    filename = os.path.basename(f)

    if filename.startswith('traceJSON'):
        label = filename.split('-')[3][1:]
    
        # Add the label column to the dataframe
        if int(label) == 0:
            df['label'] = 0
        elif int(label) >=1 and int(label) <= 9:
            df['label'] = 1
        else:
            df['label'] = 2
        

        # Append the dataframe to the merged dataframe
        merged_df = pd.concat([merged_df, df], ignore_index=True)

# Now merged_df contains all the data from the JSON files, with label column added

In [3]:
merged_df

Unnamed: 0,pos,spd,label
0,"[873.6579894237055, 515.0419704516012, 0.0]","[-0.5615534138496401, 0.34935102148086505, 0.0]",0
1,"[872.1848544954322, 515.9474424294032, 0.0]","[-2.125938318465096, 1.322478270765143, 0.0]",0
2,"[869.1128639211374, 517.7971405872365, 0.0]","[-3.5899683398356963, 2.233296724389207, 0.0]",0
3,"[864.6579538884271, 520.5743407739925, 0.0]","[-5.312629533686103, 3.304274438501985, 0.0]",0
4,"[860.0207470696706, 525.7162975093687, 0.0]","[-5.34186098834915, 5.817769455952666, 0.0]",0
...,...,...,...
5711,"[325.90687464156844, 730.1637061673084, 0.0]","[8.1339354760141, 7.116579088873951, 0.0]",0
5712,"[337.2546973533207, 729.634766863171, 0.0]","[12.196912885390818, -4.544761986234479, 0.0]",0
5713,"[345.51983998911123, 719.5846003226957, 0.0]","[6.73889447260984, -9.893360827534599, 0.0]",0
5714,"[350.82649247285724, 711.7160476513725, 0.0]","[4.224393707179231, -6.201659069760074, 0.0]",0


In [4]:
split = pd.DataFrame(merged_df['pos'].to_list(), columns = ['posX', 'posY','posZ'])
split = split.drop('posZ',axis=1)
merged_df = pd.concat([merged_df, split], axis=1) 
merged_df = merged_df.drop('pos',axis=1)

In [5]:
split = pd.DataFrame(merged_df['spd'].to_list(), columns = ['spdX', 'spdY','spdZ'])
split = split.drop('spdZ',axis=1)
merged_df = pd.concat([merged_df, split], axis=1) 
merged_df = merged_df.drop('spd',axis=1)

In [6]:
from sklearn.preprocessing import StandardScaler

#scale all the columns except the label column
columns_to_scale = merged_df.columns.drop('label')
merged_df[columns_to_scale] = StandardScaler().fit_transform(merged_df[columns_to_scale])

In [7]:
merged_df

Unnamed: 0,label,posX,posY,spdX,spdY
0,0,1.002547,-0.295349,-0.022623,0.035314
1,0,0.998287,-0.292121,-0.195437,0.141427
2,0,0.989403,-0.285526,-0.357166,0.240747
3,0,0.976520,-0.275624,-0.547465,0.357530
4,0,0.963110,-0.257291,-0.550694,0.631611
...,...,...,...,...,...
5711,0,-0.581491,0.471636,0.937950,0.773238
5712,0,-0.548675,0.469750,1.386778,-0.498359
5713,0,-0.524773,0.433918,0.783842,-1.081591
5714,0,-0.509426,0.405863,0.506071,-0.679033


In [8]:
df_new = pd.DataFrame()
df_new = pd.concat([df_new, merged_df.loc[merged_df['label']==0].sample(frac=0.3)], ignore_index=True)
df_new = pd.concat([df_new, merged_df.loc[merged_df['label']==1]], ignore_index=True)
df_new = pd.concat([df_new, merged_df.loc[merged_df['label']==2]], ignore_index=True)
df_new.loc[df_new['label']==2]

Unnamed: 0,label,posX,posY,spdX,spdY
1819,2,0.771836,0.555587,-0.185285,-0.058431
1820,2,0.764458,0.554298,-0.384405,-0.076939
1821,2,0.751292,0.557101,-0.580195,0.103151
1822,2,0.733394,0.567206,-0.724216,0.413355
1823,2,0.716500,0.591541,-0.596702,0.874907
...,...,...,...,...,...
2689,2,-1.138951,1.351121,-0.042162,1.084084
2690,2,-1.141381,1.386592,-0.042296,1.085854
2691,2,-1.143821,1.422045,-0.041797,1.079225
2692,2,-1.145615,1.456448,-0.029910,0.920895


In [9]:
class NN(nn.Module):
    def __init__(self,input_size,num_classes):
        super(NN,self).__init__()
        self.fc1 = nn.Linear(input_size,50)
        self.fc2 = nn.Linear(50,30)
        self.fc3 = nn.Linear(30,20)
        self.fc4 = nn.Linear(20,num_classes)
    
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [10]:
#set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
#load data
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe[['posX', 'posY', 'spdX', 'spdY']].values
        self.targets = dataframe['label'].values

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        data = torch.tensor(self.data[index], dtype=torch.float)
        target = torch.tensor(self.targets[index], dtype=torch.long)
        return data, target

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
merged_df['label'] = le.fit_transform(merged_df['label'])

In [13]:
# split dataframe into train and test sets
train_df, test_df = train_test_split(df_new, test_size=0.2, random_state=42)

In [14]:
#hyperparameters
input_size = 4
num_classes = 20
learning_rate = 0.001
batch_size = 64
num_epochs = 200

In [15]:
# create dataset and dataloader
train_dataset = CustomDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [16]:
#initialse network
model = NN(input_size=input_size,num_classes=num_classes).to(device)

In [17]:
#loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr = learning_rate)

In [18]:
#train network
for epoch in range(num_epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        if batch_idx % 200 == 0:
          print('In batch: ', batch_idx, 'In epoch:', epoch)
        data = data.to(device=device)
        target = target.to(device=device)
        #get to correct shape
        data = data.reshape(data.shape[0],-1)
        #forward
        scores = model(data)
        loss = criterion(scores,target)
        #backward
        optimizer.zero_grad()
        loss.backward()
        #gradient descent
        optimizer.step()

In batch:  0 In epoch: 0
In batch:  0 In epoch: 1
In batch:  0 In epoch: 2
In batch:  0 In epoch: 3
In batch:  0 In epoch: 4
In batch:  0 In epoch: 5
In batch:  0 In epoch: 6
In batch:  0 In epoch: 7
In batch:  0 In epoch: 8
In batch:  0 In epoch: 9
In batch:  0 In epoch: 10
In batch:  0 In epoch: 11
In batch:  0 In epoch: 12
In batch:  0 In epoch: 13
In batch:  0 In epoch: 14
In batch:  0 In epoch: 15
In batch:  0 In epoch: 16
In batch:  0 In epoch: 17
In batch:  0 In epoch: 18
In batch:  0 In epoch: 19
In batch:  0 In epoch: 20
In batch:  0 In epoch: 21
In batch:  0 In epoch: 22
In batch:  0 In epoch: 23
In batch:  0 In epoch: 24
In batch:  0 In epoch: 25
In batch:  0 In epoch: 26
In batch:  0 In epoch: 27
In batch:  0 In epoch: 28
In batch:  0 In epoch: 29
In batch:  0 In epoch: 30
In batch:  0 In epoch: 31
In batch:  0 In epoch: 32
In batch:  0 In epoch: 33
In batch:  0 In epoch: 34
In batch:  0 In epoch: 35
In batch:  0 In epoch: 36
In batch:  0 In epoch: 37
In batch:  0 In epoch:

In [19]:
#check accuracy
def check_accuracy(loader,model):
    num_correct = 0
    num_samples = 0
    y_list = []
    predictions_list = []
    model.eval()
    
    with torch.no_grad():
        for x,y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            x = x.reshape(x.shape[0],-1)
#             print(y.shape)
            y_list.extend(y.tolist())
            
            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
            predictions_list.extend(predictions.tolist())
        
        print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')
    
#     print(len(y_list))
#     print(len(predictions_list))
    print(classification_report(y_list,predictions_list))
    print("Confusion Matrix")
    print(confusion_matrix(y_list,predictions_list))
    model.train()

In [20]:
check_accuracy(train_loader,model)

Got 1593 / 2155 with accuracy 73.92
              precision    recall  f1-score   support

           0       0.77      0.73      0.75      1025
           1       0.73      0.72      0.73       421
           2       0.70      0.77      0.73       709

    accuracy                           0.74      2155
   macro avg       0.74      0.74      0.74      2155
weighted avg       0.74      0.74      0.74      2155

Confusion Matrix
[[746  80 199]
 [ 82 304  35]
 [135  31 543]]


In [21]:
check_accuracy(test_loader,model)

Got 352 / 539 with accuracy 65.31
              precision    recall  f1-score   support

           0       0.72      0.65      0.68       270
           1       0.60      0.63      0.61       103
           2       0.60      0.67      0.64       166

    accuracy                           0.65       539
   macro avg       0.64      0.65      0.64       539
weighted avg       0.66      0.65      0.65       539

Confusion Matrix
[[175  34  61]
 [ 25  65  13]
 [ 44  10 112]]


In [22]:
torch.save(model.state_dict(), 'saved_models\curr_model')