In [149]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import LabelEncoder
# from torch_sparse import coalesce
#from pytorch_sparse import to_torch_sparse_tensor
import scipy.sparse as sp
from torch_geometric.utils import from_scipy_sparse_matrix


Reading the Dataset

In [150]:
data = pd.read_csv('preprocessed.csv')

Defining a function to tokenize the service description and remove the stop words

In [151]:
def makeTokens(f):
    tkns_BySlash = str(f.encode('utf-8')).split('/')
    total_Tokens = []

    for i in tkns_BySlash:
        tokens = str(i).split('-')
        tkns_ByDot = []
        for j in range(0, len(tokens)):
            temp_Tokens = str(tokens[j]).split('.')
            tkns_ByDot = tkns_ByDot + temp_Tokens
        total_Tokens = total_Tokens + tokens + tkns_ByDot
    total_Tokens = list(set(total_Tokens))

    if 'com' in total_Tokens:
        total_Tokens.remove('com')

    return total_Tokens

In [152]:
desc_list = data['Service Description']
y = data['Primary Category']

Creating the TfidfVectorizer

In [153]:
vectorizer = TfidfVectorizer(tokenizer=makeTokens)
X = vectorizer.fit_transform(desc_list)



Spliting the training and testing sets

In [154]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Creating the data to tensors

In [155]:
x_train = torch.tensor(X_train.toarray(), dtype=torch.float32)
x_test = torch.tensor(X_test.toarray(), dtype=torch.float32)

In [156]:
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(y_train), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(y_test), dtype=torch.long)

In [157]:
num_nodes = x_train.shape[0]
edge_index = torch.tensor([[i, j] for i in range(num_nodes) for j in range(num_nodes) if i != j], dtype=torch.long).t()

In [158]:
adj_matrix = X_train * X_train.T
adj_matrix = adj_matrix.tocoo()

# Create the edge index from the COO matrix
edge_index = torch.tensor([adj_matrix.row, adj_matrix.col], dtype=torch.long)

# Create the torch_geometric Data object
train_data = Data(x=x_train, edge_index=edge_index, y=y_train)

Defining the GCN Model

In [159]:
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [160]:
vocabulary_size = x_train.shape[1]
hidden_channels = 16
num_classes = len(label_encoder.classes_)
model = GCN(in_channels=vocabulary_size, hidden_channels=hidden_channels, out_channels=num_classes)

Optimizer and loss function

In [161]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

Training the model

In [162]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(train_data.x, train_data.edge_index)
    loss = criterion(out, train_data.y)
    loss.backward()
    optimizer.step()

In [163]:
num_epochs = 20
for epoch in range(num_epochs):
    train()

In [180]:
def evaluate():
    model.eval()
    num_test_nodes = x_test.shape[0]
    edge_index_eval = torch.tensor([[i, j] for i in range(num_nodes, num_nodes + num_test_nodes) for j in range(num_nodes)], dtype=torch.long).t()
    out = model(torch.cat([x_train, x_test], dim=0), torch.cat([train_data.edge_index, edge_index_eval], dim=1))
    _, predicted = torch.max(out[-num_test_nodes:], dim=1)
    correct = (predicted == y_test).sum().item()
    total = y_test.size(0)
    accuracy = correct / total
    return accuracy


In [181]:
accuracy = evaluate()
print("Accuracy: {:.4f}".format(accuracy))

Accuracy: 0.2486
