## Installing and Importing the required libraries

In [1]:
! pip install pandas
! pip install torch
! pip install gensim
! pip install nltk
! pip install scikit-learn
! pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import json
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer  # Import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

[nltk_data] Downloading package punkt to /home/msaxena4/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/msaxena4/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/msaxena4/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2023-09-17 16:44:25.251069: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Reading in the data from JSON files and converting them to pandas DataFrames

In [3]:
def convertJsontoDataframe(path):
    """Load a JSON object of ``{event_report: event_id}`` pairs into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of a JSON file whose top-level value is an object whose keys
        are the report texts and whose values are the event ids.

    Returns
    -------
    pandas.DataFrame
        One row per key/value pair, in file order, with the columns
        ``event_report`` (the key) and ``event_id`` (the value). An empty JSON
        object yields an empty frame that still has both columns.
    """
    # JSON text is UTF-8 by specification, so do not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Naming the columns explicitly keeps the schema stable even when `data` is empty;
    # inferring columns from a list of row dicts would produce a frame with no columns at all.
    return pd.DataFrame(list(data.items()), columns=["event_report", "event_id"])

In [4]:
# Load both splits from the working directory; each becomes a two-column frame
# (event_report, event_id). The shapes are printed as a quick sanity check.
train_data = convertJsontoDataframe("train.json")
test_data = convertJsontoDataframe("test.json")
print(train_data.shape, test_data.shape)

(5629, 2) (2831, 2)


In [20]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5629 entries, 0 to 5628
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   event_report  5629 non-null   object
 1   event_id      5629 non-null   object
dtypes: object(2)
memory usage: 88.1+ KB


In [21]:
train_data.head()

Unnamed: 0,event_report,event_id
0,net lived lofty expectation month basketball l...,16
1,duck time allowing goal game season instance s...,23
2,invezz wednesday november hewlett packard nyse...,24
3,nov photo provided george loegering large spin...,143
4,actress jennifer love hewitt birth baby girl n...,11


## Data Preprocessing

In [5]:
def preprocessing(df):
    """
    Clean a collection of raw text documents.

    Every document goes through the following steps, in this order:
    1. Lowercasing
    2. Tokenisation (``word_tokenize``)
    3. Stopword removal (NLTK English stopword list)
    4. Punctuation and special-character removal (tokens failing ``str.isalnum`` are dropped)
    5. Lemmatization (``WordNetLemmatizer``)
    6. Reconstruction into a single space-separated string

    Parameters
    ----------
    df : iterable of str
        The raw texts, e.g. a pandas Series of reports.

    Returns
    -------
    pandas.Series
        The cleaned texts in input order. The Series is built from a plain list,
        so it has a default integer index rather than the index of the input.
    """
    # These two objects are the same for every document; build them once instead of
    # re-reading the stopword corpus and re-creating the lemmatizer on each iteration.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    preprocessed_data = []
    for text in df:
        tokens = word_tokenize(text.lower())    # Lowercase, then tokenize

        # Drop stopwords and non-alphanumeric tokens, lemmatizing whatever survives.
        tokens = [
            lemmatizer.lemmatize(word)
            for word in tokens
            if word not in stop_words and word.isalnum()
        ]

        preprocessed_data.append(' '.join(tokens))   # Join tokens back into a string

    return pd.Series(preprocessed_data)

In [6]:
# Preprocess train and test data.
# The returned Series are positionally aligned with the rows of train_data / test_data
# (both sides use a default 0..n-1 index), which later cells rely on when matching
# embeddings to event ids by position.
X_train = preprocessing(train_data['event_report'])
X_test = preprocessing(test_data['event_report'])

In [22]:
X_train.head()

0    net lived lofty expectation month basketball l...
1    duck time allowing goal game season instance s...
2    invezz wednesday november hewlett packard nyse...
3    nov photo provided george loegering large spin...
4    actress jennifer love hewitt birth baby girl n...
dtype: object

In [23]:
X_train.shape, X_test.shape

((5629,), (2831,))

In [24]:
X_train[0]

'net lived lofty expectation month basketball long shot team utilized expected bench player averaging minute'

## Implementing the Nearest Neighbours Search Algorithm
Note that the code is slightly modified for each of the models, since the embeddings generated by each must be handled in a different manner. This applies especially to BERT, which uses label-encoded event_id values, setting it apart from the other two models.

In [7]:
def _event_labels(y):
    """Return event ids as a 1-D object array, given a frame with an 'event_id' column or a 1-D sequence."""
    if isinstance(y, pd.DataFrame):
        return y['event_id'].to_numpy(dtype=object)
    return np.asarray(y, dtype=object)


def _row_count(embeddings):
    """Number of samples in a dense array, sparse matrix, or plain list of vectors."""
    shape = getattr(embeddings, 'shape', None)
    return shape[0] if shape is not None else len(embeddings)


def KNearestNeighbours(k, train_embeddings, test_embeddings, y_train, y_test, model):
    """Evaluate cosine k-nearest-neighbour retrieval and print top-1/3/5 accuracy.

    A test point counts as a top-n hit when its event id appears among the event ids
    of its n nearest training points (n is capped by ``k``, the number of neighbours
    retrieved).

    Parameters
    ----------
    k : int
        Number of neighbours to retrieve per test point.
    train_embeddings, test_embeddings : array-like or sparse matrix, shape (n_samples, n_features)
        One row per document. Sparse TF-IDF matrices and dense arrays are both
        accepted and are queried in a single batched call.
    y_train, y_test : pandas.DataFrame or 1-D array-like
        Event ids aligned row-for-row with the embeddings. A DataFrame must have an
        ``event_id`` column; anything else is treated as a 1-D sequence of labels.
    model : str
        Name of the embedding model ("TF-IDF", "Word2Vec", "BERT"). Kept for
        backward compatibility with existing calls; the evaluation no longer
        branches on it.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of embedding rows.
    """
    train_labels = _event_labels(y_train)
    test_labels = _event_labels(y_test)

    # Labels come from the arguments (not from notebook globals), so make sure they
    # really line up with the embeddings before indexing one with the other.
    if len(train_labels) != _row_count(train_embeddings):
        raise ValueError("y_train must contain one label per row of train_embeddings")
    if len(test_labels) != _row_count(test_embeddings):
        raise ValueError("y_test must contain one label per row of test_embeddings")

    # Initialize k-Nearest Neighbors with cosine distance and index the training set
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(train_embeddings)

    # One batched query replaces the per-point loop; `indices` has shape (n_test, k)
    # with each row ordered from nearest to farthest neighbour.
    _, indices = knn.kneighbors(test_embeddings)

    # Object-dtype comparison falls back to Python `==` per element, matching the
    # semantics of a list membership test on the raw labels.
    neighbor_labels = train_labels[indices]
    matches = np.asarray(neighbor_labels == test_labels[:, None], dtype=bool)

    # Fraction of test points whose label occurs within the first n neighbours
    accuracy_top_1 = np.mean(matches[:, :1].any(axis=1))
    accuracy_top_3 = np.mean(matches[:, :3].any(axis=1))
    accuracy_top_5 = np.mean(matches[:, :5].any(axis=1))

    print("Top-1 Accuracy:", accuracy_top_1)
    print("Top-3 Accuracy:", accuracy_top_3)
    print("Top-5 Accuracy:", accuracy_top_5)

## Word Embeddings

### i) Baseline Approach: TF-IDF

In [8]:
# TF-IDF vectorization: the vocabulary and IDF weights are learned from the training
# texts only; the test texts are projected with that fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
train_embeddings = tfidf_vectorizer.fit_transform(X_train)
test_embeddings = tfidf_vectorizer.transform(X_test)

k = 5
# (Optional) wrap the two lines below in `for k in range(1, 11):` to sweep over k.
print(f'\nk = {k}')
# The frames that carry the event_id column are passed in the label positions.
KNearestNeighbours(k, train_embeddings, test_embeddings, train_data, test_data, "TF-IDF")


k = 5
Top-1 Accuracy: 0.7972447898269163
Top-3 Accuracy: 0.9085128929706817
Top-5 Accuracy: 0.9318262098198516


### ii) Proposed Approach: Word2Vec

Steps:
1. Tokenize the preprocessed text
2. Get embeddings for train and test data. Implement a get_sentence_embedding(sentence, model) function for this
3. Convert the embeddings to array to use in the Nearest Neighbours Algorithm

In [9]:
# Define the get_sentence_embedding function
def get_sentence_embedding(sentence, model):
    """Average the word vectors of the in-vocabulary tokens of ``sentence``.

    Parameters
    ----------
    sentence : iterable of str
        The tokens of one document.
    model : object exposing ``wv`` and ``vector_size``
        ``model.wv`` must support ``token in model.wv`` and ``model.wv[token]``;
        ``model.vector_size`` gives the embedding length.

    Returns
    -------
    numpy.ndarray
        A vector of length ``model.vector_size``. It is all zeros when none of
        the tokens is found in ``model.wv``.
    """
    # Collect the vectors of the tokens the model knows, preserving token order.
    known_vectors = [model.wv[token] for token in sentence if token in model.wv]

    # Accumulate into a zero vector, then divide by the count to obtain the mean.
    averaged = np.zeros(model.vector_size)
    for vector in known_vectors:
        averaged += vector

    if known_vectors:
        averaged /= len(known_vectors)

    return averaged

In [25]:
# Split each preprocessed document back into a list of tokens; Word2Vec is trained on
# token lists rather than on whole strings.
tokenized_train_data = [word_tokenize(text) for text in X_train]
tokenized_test_data = [word_tokenize(text) for text in X_test]

In [28]:
len(tokenized_train_data), len(tokenized_test_data)

(5629, 2831)

In [29]:
tokenized_train_data[0]

['net',
 'lived',
 'lofty',
 'expectation',
 'month',
 'basketball',
 'long',
 'shot',
 'team',
 'utilized',
 'expected',
 'bench',
 'player',
 'averaging',
 'minute']

In [30]:
# Fix the RNG seed so vector initialisation is repeatable between runs.
# NOTE(review): per the gensim documentation, a fully deterministic run additionally needs
# workers=1 and a fixed PYTHONHASHSEED; the default multi-threaded training is kept here.
W2V_SEED = 42

# Skip-gram (sg=1) model trained on the training split only.
model = Word2Vec(sentences = tokenized_train_data, vector_size=300, window=25, min_count=2, sg=1, seed=W2V_SEED)
model.save("word2vec.model")

# Load the Word2Vec model back from disk; `model` now refers to the persisted copy.
model = Word2Vec.load("word2vec.model")

In [31]:
# Turn every already-tokenized document into one vector by averaging its word vectors.
# Test documents are embedded with the model trained on the training split.
train_embeddings = [get_sentence_embedding(sentence, model) for sentence in tokenized_train_data]
test_embeddings = [get_sentence_embedding(sentence, model) for sentence in tokenized_test_data]


In [34]:
len(train_embeddings)

5629

In [35]:
# Stack the per-document vectors into 2-D arrays (one row per document) for the
# nearest-neighbour search.
train_embeddings = np.array(train_embeddings)
test_embeddings = np.array(test_embeddings)

In [36]:
# Evaluate the Word2Vec sentence embeddings with the same k as the TF-IDF baseline.
k = 5
KNearestNeighbours(k, train_embeddings, test_embeddings, train_data, test_data, "Word2Vec")

Top-1 Accuracy: 0.8339809254680325
Top-3 Accuracy: 0.9092193571176262
Top-5 Accuracy: 0.9254680324973508


### iii) Proposed Approach: BERT

Steps: 
1. Load BERT tokenizer and model
2. Prepare the preprocessed data by tokenising it
3. Perform label encoding on the "event_id" column
4. Get embeddings for the input_ids from tokenised data. Implement a function get_embeddings(data_tokenized) for this.
5. Convert these embeddings to array form and reshape them to the shape needed to run the Nearest Neighbours Algorithm

In [11]:
# Load BERT tokenizer and model.
# NOTE: this rebinds the global name `model`, which held the Word2Vec model in the
# previous section; re-running the Word2Vec embedding cell after this one would pass
# the BERT model to get_sentence_embedding.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [12]:
# Tokenize and preprocess your data
def data_preparation(data):
    """Encode each text separately with the notebook-level BERT ``tokenizer``.

    Parameters
    ----------
    data : iterable of str
        The texts to encode.

    Returns
    -------
    list
        One tokenizer output per text, in input order. Each text is encoded on its
        own (truncated to at most 128 tokens) and returned as PyTorch tensors.
    """
    encoded_texts = []
    for text in data:
        encoded = tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=128,
        )
        encoded_texts.append(encoded)
    return encoded_texts

In [13]:
# Encode the preprocessed texts of both splits for BERT (one encoding per document).
training_data_tokenized = data_preparation(X_train)
testing_data_tokenized = data_preparation(X_test)

In [39]:
training_data_tokenized[0]

{'input_ids': tensor([[  101,  5658,  2973, 19459,  2100, 17626,  3204,  3455,  2146,  2915,
          2136, 12550,  3517,  6847,  2447, 14985,  3371,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [14]:
# Fit the encoder ONCE and reuse that single mapping for both splits. Calling
# fit_transform separately on the test labels would build a second, independent
# mapping, so the same event_id could receive different integer codes in y_train
# and y_test whenever the two splits do not contain exactly the same set of ids.
# Fitting on the union of ids also keeps transform() from failing on an id that
# appears in only one split.
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([train_data['event_id'], test_data['event_id']], ignore_index=True))
y_train = label_encoder.transform(train_data['event_id'])
y_test = label_encoder.transform(test_data['event_id'])

In [15]:
# Generate embeddings for training and testing data
def get_embeddings(data_tokenized):
    """Mean-pool the last hidden state of the notebook-level ``model`` for each encoding.

    Parameters
    ----------
    data_tokenized : iterable
        Tokenizer outputs; each one is unpacked as keyword arguments to ``model``.

    Returns
    -------
    list of numpy.ndarray
        One array per encoding: ``last_hidden_state`` averaged over dimension 1.
        For a single-text encoding this is presumably of shape (1, hidden_size).
    """
    pooled_vectors = []
    # Inference only: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        for encoded in data_tokenized:
            hidden_states = model(**encoded).last_hidden_state
            pooled_vectors.append(hidden_states.mean(dim=1).numpy())
    return pooled_vectors

In [16]:
# Run BERT over every encoded document of both splits; each document is passed
# through the model individually.
training_embeddings = get_embeddings(training_data_tokenized)
testing_embeddings = get_embeddings(testing_data_tokenized)

In [44]:
len(training_embeddings)

5629

In [45]:
# Stack the per-document arrays into a single ndarray per split.
training_embeddings_array = np.array(training_embeddings)
testing_embeddings_array = np.array(testing_embeddings) 

In [42]:
training_embeddings_array.shape

(5629, 768)

In [18]:
# Collapse to 2-D (n_samples, embedding_dim) for the nearest-neighbour search; each
# per-document embedding presumably carries a leading batch axis of size 1.
training_embeddings_array = training_embeddings_array.reshape(training_embeddings_array.shape[0], training_embeddings_array.shape[-1])
testing_embeddings_array = testing_embeddings_array.reshape(testing_embeddings_array.shape[0], testing_embeddings_array.shape[-1])


In [19]:
# Evaluate the BERT embeddings; here the label arguments are the integer-encoded
# event ids rather than the raw data frames.
k = 5
KNearestNeighbours(k, training_embeddings_array, testing_embeddings_array, y_train, y_test, "BERT")

Top-1 Accuracy: 0.7997174143412222
Top-3 Accuracy: 0.9007417873542918
Top-5 Accuracy: 0.9272341928647121
