In [2]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report
import random

In [3]:
import nltk

# Downloading NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Creating a TF-IDF Model Class
class TfIdfModel:
    
    # TODO. Add getter setter for all variables
    # TODO. Add update Data and Recalculate TF-IDF Functionality
    # TODO. Add everything else in the Class
    
    # Constructor
    def __init__(self, docs_data):
        
        self.number_of_docs = len(docs_data)
        self.document_collection = docs_data
        self.tokenized_collection = dict()
        self.vocabulary = set()
        self.inverted_index = dict()
        self.document_frequency = dict()
        self.term_document_frequency = dict()
        self.max_freq_in_doc = dict()
        self.tf_idf_scores = dict()
        
        # Tokenize the documents to build the tokenized collection
        print("Tokenizing the documents...")
        self.tokenize_documents()
        print("Tokenization complete!")
        
        # Create the vocabulary
        print("Creating the vocabulary...")
        self.create_vocabulary()
        print("Vocabulary created!")
        
        # Create the inverted index
        print("Creating the inverted index...")
        self.create_inverted_index()
        print("Inverted index created!")
        
                
        # Calculate the document frequency
        print("Calculating the document frequency...")
        for term in self.vocabulary:
            self.document_frequency[term] = len(self.inverted_index[term])
        print("Document frequency calculated!")
        
        
        # Calculate the term document frequency
        print("Calculating the term document frequency...")
        for word in self.vocabulary:
            self.term_document_frequency[word] = dict()
            
            for doc in self.inverted_index[word]:
                self.term_document_frequency[word][doc[0]] = doc[1]
                       
            for doc in set(list(self.document_collection.keys())) - set(self.term_document_frequency[word].keys()):
                self.term_document_frequency[word][doc] = 0
        print("Term document frequency calculated!")
        
        
        # Calculate the max frequency in a document
        print("Calculating the max frequency in a document...")
        for doc_id in self.document_collection:
            self.max_freq_in_doc[doc_id] = 0
            for word in self.tokenized_collection[doc_id]:
                if self.term_document_frequency[word][doc_id] > self.max_freq_in_doc[doc_id]:
                    self.max_freq_in_doc[doc_id] = self.term_document_frequency[word][doc_id]
        print("Max frequency in a document calculated!")                    

        
        # Calculate the TF-IDF Scores
        print("Calculating the TF-IDF scores...")
        self.calculate_tf_idf_scores()
        print("TF-IDF scores calculated!")
        
    
    # Function to get a list of all documents
    # Input: None
    # Output: List of all documents : List[Str]
    def get_document_list(self):
        return list(self.document_collection.keys())
    
    def get_document_content(self, doc_id):
        try:
            return self.document_collection[doc_id]
        except KeyError:
            raise Exception("KeyError: Document ID not found")
    
    
    
    # Function to get the tokens of a document
    # Input: Document ID : Str
    # Output: List of tokens : List[Str]
    def get_document_tokens(self, doc_id):
        try:
            return self.tokenized_collection[doc_id]
        except KeyError:
            raise Exception("KeyError: Document ID not found")
    
    
    # Function to extract keywords from a document
    # Input: Document ID : Str
    # Output: List of keywords : List[Str]
    def extract_keywords(self, text):
        tokens = nltk.word_tokenize(text)
        
        # Remove stop words
        tokens = [word for word in tokens if word not in nltk.corpus.stopwords.words('english')]
        
        # Stem the words
        tokens = [nltk.PorterStemmer().stem(word) for word in tokens]
        
        # Remove Punctuation
        tokens = [word for word in tokens if word.isalpha()]
        return tokens


    # Function to tokenize the documents
    # Input: None
    # Output: None (Updates the tokenized collection dictionary)
    def tokenize_documents(self):
        for doc_id in self.document_collection:
            
            # Tokenize the document
            tokens = self.extract_keywords(self.get_document_content(doc_id))
            
            # Add the tokens to the collection
            self.tokenized_collection[doc_id] = tokens
    
    
    # Function to Create the vocabulary
    # Input: None
    # Output: None (Updates the vocabulary set)
    def create_vocabulary(self):
        for doc_id in self.tokenized_collection:
            self.vocabulary.update(self.tokenized_collection[doc_id])
    
    
    # Function to get the vocabulary
    # Input: None
    # Output: List of vocabulary : List[Str]
    def get_vocabulary(self):
        return list(self.vocabulary)
    
    
    # Function to create the inverted index
    # Input: None
    # Output: None (Updates the inverted index dictionary)
    def create_inverted_index(self):
        
        for word in self.vocabulary:
            self.inverted_index[word] = []

        for doc_id in self.tokenized_collection:

            word_frequency = dict()
            for word in self.tokenized_collection[doc_id]:
                if word in word_frequency:
                    word_frequency[word] += 1
                else:
                    word_frequency[word] = 1
            
            for word in word_frequency:
                self.inverted_index[word].append((doc_id, word_frequency[word]))
            
    # Get the inverted index
    # Input: None
    # Output: Inverted index dictionary : Dict[Str, List[Tuple[Str, Int]]]
    def get_inverted_index(self):
        return self.inverted_index
    
    
    # Get the postings list of a word
    # Input: Word : Str
    # Output: Postings list : List[Tuple[Str, Int]]
    def get_postings_list(self, word):
        try:
            return self.inverted_index[word]
        except KeyError:
            raise Exception("KeyError: Word not found")
    
    
    # Calculate the Tf-IDf scores of the documents
    # Input: None
    # Output: None (Updates the tf-idf scores dictionary)
    def calculate_tf_idf_scores(self):
        for doc in self.document_collection:
            self.tf_idf_scores[doc] = {}
            for word in self.tokenized_collection[doc]:
                self.tf_idf_scores[doc][word] = 0.5 + 0.5*(self.term_document_frequency[word][doc]/self.max_freq_in_doc[doc]) * np.log(self.number_of_docs/self.document_frequency[word] + 1)
    
    # Get Tf-Idf Dataframe
    # Input: None
    # Output: Tf-Idf Dataframe : Dataframe
    def get_tf_idf_dataframe(self):
        return pd.DataFrame.from_dict(tf_idf_model.tf_idf_scores)

[nltk_data] Downloading package punkt to C:\Users\shubham
[nltk_data]     gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\shubham
[nltk_data]     gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\shubham
[nltk_data]     gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


READING THE DATA FROM THE DATASET

In [None]:
# Path variables
docs_data = {}
DocumentsPath = './alldocs/'
Total_Docs = len(os.listdir(DocumentsPath))
print("# of Documents:", Total_Docs)
output_path = "./output.txt"
queries_path = "./query.txt"

# of Documents: 6377


In [None]:
with open(output_path, 'r') as f:
    
    # Read the file and store it in a list
    try:
        output_data = f.read().splitlines()
    except:
        print("Error in reading file:", output_path)

In [None]:
# Indexing Data

inv_classes_dict = {}
for line in output_data:
    line_list = line.split(" ")
    query_id = line_list[0]
    doc_id = line_list[1]
    if doc_id in inv_classes_dict.keys():
        inv_classes_dict[doc_id] = None
    else:
        inv_classes_dict[doc_id] = query_id

classes_dict = {}
for doc_id in inv_classes_dict.keys():
    query_id = inv_classes_dict[doc_id]
    if query_id is not None:
        if query_id not in classes_dict.keys():
            classes_dict[query_id] = []
        else :
            classes_dict[query_id].append(doc_id)
classes_dict.keys()

dict_keys(['701', '702', '703', '704', '705', '706', '707', '708', '709', '710', '711', '712', '713', '714', '715', '716', '717', '718', '719', '720', '721', '722', '723', '724', '725', '726', '727', '728', '729', '730', '731', '732', '733', '734', '735', '736', '737', '738', '739', '740', '741', '742', '743', '744', '745', '746', '747', '748', '749', '750', '751', '752', '753', '754', '755', '756', '757', '758', '759', '760', '761', '762', '763', '764', '765', '766', '767', '768', '769', '770', '771', '772', '773', '774', '775', '776', '777', '778', '779', '780', '781', '782', '783', '784', '785', '786', '787', '788', '789', '790', '791', '792', '793', '794', '795', '796', '797', '798', '799', '800', '801', '802', '803', '804', '805', '806', '807', '808', '809', '810', '811', '812', '813', '814', '815', '816', '817', '818', '819', '820', '821', '822', '823', '824', '825', '826', '827', '828', '829', '830', '831', '832', '833', '834', '835', '836', '837', '838', '839', '840', '841', '8

In [None]:
# Split data into 70:30 ratio per class for train and test respectively
train_set = []
test_set = []
for query_id in classes_dict.keys():
    l = len(classes_dict[query_id])
    temp_list = classes_dict[query_id].copy()
    random.shuffle(temp_list)
    train_set.extend(temp_list[:int(.7*l)])
    test_set.extend(temp_list[int(.7*l):])
len(train_set), len(test_set)

(5050, 2246)

In [None]:
# Read all docs into a dataframe

Files_Not_Read = []

Number_of_Docs_To_Read = 100

DOCS = os.listdir(DocumentsPath)[:Number_of_Docs_To_Read]

# Iterate over all files in the directory
for file in DOCS:
    
    # Read the file
    with open(DocumentsPath + file, 'r') as f:
        
        # Read the file and store it in a list
        try:
            docs_data[file] = f.read()
        except:
            print("Error in reading file:", file)
            Files_Not_Read.append(file)

In [None]:
Total_Docs = len(docs_data)
print("# of Documents Read:", Total_Docs)
print(Files_Not_Read)

# of Documents Read: 100
[]


In [None]:
# Create a TF-IDF Model
tf_idf_model = TfIdfModel(docs_data)

print(tf_idf_model.number_of_docs)
print(tf_idf_model.get_document_list())

Tokenizing the documents...
Tokenization complete!
Creating the vocabulary...
Vocabulary created!
Creating the inverted index...
Inverted index created!
Calculating the document frequency...
Document frequency calculated!
Calculating the term document frequency...
Term document frequency calculated!
Calculating the max frequency in a document...
Max frequency in a document calculated!
Calculating the TF-IDF scores...
TF-IDF scores calculated!
100
['GX000-01-10544170', 'GX000-09-2703409', 'GX000-10-4524900', 'GX000-14-10770491', 'GX000-14-16748010', 'GX000-14-5445022', 'GX000-16-0145015', 'GX000-16-4063715', 'GX000-21-2440731', 'GX000-22-12322384', 'GX000-26-8535353', 'GX000-29-8328421', 'GX000-31-2605300', 'GX000-33-0298602', 'GX000-34-9679963', 'GX000-36-2289681', 'GX000-37-9159393', 'GX000-38-10952535', 'GX000-39-14470745', 'GX000-43-1993135', 'GX000-43-4226578', 'GX000-46-4667412', 'GX000-46-8625035', 'GX000-47-11266472', 'GX000-47-16664622', 'GX000-48-10208090', 'GX000-49-12224349'

In [None]:
print("Vocabulary Size:", len(tf_idf_model.get_vocabulary()))

Vocabulary Size: 16226


In [None]:
# Print tokens of first document
print("Tokens of first document:", tf_idf_model.get_document_tokens(tf_idf_model.get_document_list()[0]))

Tokens of first document: ['link', 'nation', 'cancer', 'institut', 'center', 'cancer', 'research', 'link', 'contact', 'ccr', 'link', 'ccr', 'homepag', 'link', 'nci', 'home', 'link', 'nih', 'home', 'search', 'compar', 'oncolog', 'program', 'home', 'introduct', 'specif', 'aim', 'time', 'for', 'implement', 'background', 'the', 'ccr', 'compar', 'oncolog', 'program', 'compliment', 'number', 'new', 'ccr', 'initi', 'design', 'improv', 'translat', 'process', 'program', 'announc', 'compar', 'oncolog', 'refer', 'studi', 'natur', 'develop', 'cancer', 'anim', 'model', 'human', 'diseas', 'a', 'signific', 'group', 'natur', 'occur', 'cancer', 'develop', 'pet', 'anim', 'primarili', 'cat', 'dog', 'these', 'larg', 'anim', 'cancer', 'share', 'mani', 'featur', 'human', 'cancer', 'includ', 'tumor', 'histolog', 'genet', 'respons', 'convent', 'therapi', 'biolog', 'behavior', 'exampl', 'model', 'includ', 'osteosarcoma', 'lymphoma', 'breast', 'cancer', 'head', 'neck', 'carcinoma', 'prostat', 'cancer', 'soft', 

In [None]:
print("Inverted Index Size:", len(tf_idf_model.get_inverted_index()))
print("Posting list of clinic:", tf_idf_model.get_postings_list("clinic"))

Inverted Index Size: 16226
Posting list of clinic: [('GX000-01-10544170', 9), ('GX000-14-10770491', 1), ('GX000-33-0298602', 64), ('GX000-39-14470745', 8), ('GX000-47-11266472', 1), ('GX000-53-0788254', 1), ('GX000-55-3026780', 1), ('GX000-76-13395038', 1), ('GX000-80-10421013', 6), ('GX000-93-5908540', 5), ('GX001-06-8404221', 6), ('GX001-10-2816967', 4), ('GX001-10-9697910', 2), ('GX001-12-8380564', 14), ('GX001-16-14126351', 23), ('GX001-19-3646687', 2), ('GX001-21-2455320', 1), ('GX001-28-1213107', 1), ('GX001-30-5435939', 4), ('GX001-37-11210821', 52), ('GX001-37-2825497', 3)]


In [None]:
print("Term Document Frequency of clinic:", tf_idf_model.term_document_frequency["clinic"])
print("Document Frequency of clinic:", tf_idf_model.document_frequency['clinic'])
print("Maximum Frequency in Document with ID GX000-14-10770491:", tf_idf_model.max_freq_in_doc['GX000-14-10770491'])

Term Document Frequency of clinic: {'GX000-01-10544170': 9, 'GX000-14-10770491': 1, 'GX000-33-0298602': 64, 'GX000-39-14470745': 8, 'GX000-47-11266472': 1, 'GX000-53-0788254': 1, 'GX000-55-3026780': 1, 'GX000-76-13395038': 1, 'GX000-80-10421013': 6, 'GX000-93-5908540': 5, 'GX001-06-8404221': 6, 'GX001-10-2816967': 4, 'GX001-10-9697910': 2, 'GX001-12-8380564': 14, 'GX001-16-14126351': 23, 'GX001-19-3646687': 2, 'GX001-21-2455320': 1, 'GX001-28-1213107': 1, 'GX001-30-5435939': 4, 'GX001-37-11210821': 52, 'GX001-37-2825497': 3, 'GX000-85-1129126': 0, 'GX001-41-4669647': 0, 'GX000-76-2231612': 0, 'GX000-90-7867636': 0, 'GX000-21-2440731': 0, 'GX000-46-4667412': 0, 'GX000-70-12794819': 0, 'GX000-09-2703409': 0, 'GX001-10-4196524': 0, 'GX001-21-4314384': 0, 'GX001-28-5148808': 0, 'GX000-63-3474016': 0, 'GX000-16-0145015': 0, 'GX000-38-10952535': 0, 'GX000-62-7241305': 0, 'GX001-35-0432171': 0, 'GX000-10-4524900': 0, 'GX000-66-9648548': 0, 'GX000-77-6847828': 0, 'GX001-37-1534868': 0, 'GX001-

In [None]:
# Print the TF-IDF of document with ID GX000-14-10770491
print("TF-IDF of GX000-14-10770491:", tf_idf_model.tf_idf_scores['GX000-14-10770491'])

TF-IDF of GX000-14-10770491: {'doj': 0.5333916235451629, 'logo': 0.5162074600785497, 'comput': 0.6943963173394017, 'crime': 0.5473693471456056, 'intellectu': 0.5951250704211603, 'properti': 0.5530171774562005, 'section': 0.514567011261574, 'ccip': 0.5757696869307322, 'document': 0.515455069154418, 'star': 0.5230349008007742, 'rule': 0.5162074600785497, 'year': 0.5501646600784528, 'hialeah': 0.6371567081182904, 'florida': 0.6468079412958136, 'man': 1.46448029537077, 'sentenc': 1.1182279459979205, 'crimin': 0.6718610420617886, 'copyright': 0.9236936697235043, 'infring': 0.7536668544564272, 'novemb': 0.5706895699416006, 'attorney': 0.5416688248657687, 'gener': 0.5198834931630316, 'ashcroft': 0.5457189027060968, 'announc': 0.5170504310324817, 'oper': 0.5798349391986288, 'cyber': 0.537884843465366, 'sweep': 0.5317083568070534, 'five': 0.515455069154418, 'men': 0.5921396032030969, 'charg': 0.8453300903823064, 'new': 0.6067721625395353, 'hampshir': 0.5557650063441482, 'softwar': 1.36014322921

In [None]:
# Create a dataframe to store the TF-IDF with the document id as the index
tf_idf_df = tf_idf_model.get_tf_idf_dataframe()

# Fill NaN values with 0
tf_idf_df = tf_idf_df.fillna(0)
tf_idf_df.head()

Unnamed: 0,GX000-01-10544170,GX000-09-2703409,GX000-10-4524900,GX000-14-10770491,GX000-14-16748010,GX000-14-5445022,GX000-16-0145015,GX000-16-4063715,GX000-21-2440731,GX000-22-12322384,...,GX001-37-11210821,GX001-37-1534868,GX001-37-16595181,GX001-37-2825497,GX001-39-3118899,GX001-41-4669647,GX001-43-0085597,GX001-43-11803200,GX001-44-13913188,GX001-45-9859222
link,0.642413,0.506953,0.530517,0.0,0.507423,0.527465,0.545776,0.525465,0.504629,0.508451,...,0.522269,0.0,0.518726,0.0,0.507847,0.0,0.0,0.517531,0.0,0.0
nation,0.530294,0.510354,0.0,0.519022,0.522106,0.0,0.534081,0.570418,0.0,0.509438,...,0.549739,0.0,0.5883,0.500764,0.590557,0.508098,0.0,0.504351,0.545441,0.526062
cancer,1.46448,0.0,0.0,0.0,0.0,0.0,0.0,0.506387,0.0,0.0,...,0.584718,0.0,0.0,0.505408,0.506889,0.0,0.0,0.0,0.0,0.0
institut,0.560303,0.0,0.560303,0.0,0.0,0.0,0.0,0.510783,0.501525,0.0,...,0.595344,0.0,0.604845,0.501014,0.519383,0.521494,0.0,0.0,0.548242,0.511972
center,0.536327,0.531039,0.0,0.511405,0.506627,0.0,0.513623,0.55846,0.502755,0.503772,...,0.576213,0.512575,0.527864,0.0,0.514012,0.0,0.0,0.0,0.521796,0.510818


In [None]:
# Store the DataFrame to disk
tf_idf_df.to_csv('./TF_IDF_Q3_Part1.csv')

In [None]:
# Implement Learning by Prototypes Classifier where class prototype is the mean of the training examples of that class 
# To classify a test image, find the prototype with the shortest distance.
from collections import OrderedDict

class Rocchio:

    # Constructor Function for our class
    def __init__(self):
        
        # class prototypes
        self.class_prototype = OrderedDict()  
        
        # label frequency array
        self.class_count = OrderedDict()  
        
        # labels
        self.class_name = []  

    # Training Function for our class
    def fit(self, X, y):

        # Loop over all the input feature vectors
        for i in range(len(y)):

            if y[i] not in self.class_name:
                
                # Encountering a new label for the first time
                self.class_name.append(y[i])
                self.class_prototype[y[i]] = np.zeros(len(X[0]))
                self.class_count[y[i]] = 0
            
            # Steps for the mean calculation for this class's prototype
            self.class_prototype[y[i]] += X[i]
            self.class_count[y[i]] += 1

        # Looping over the prototype dictionary to calculate mean
        for key in self.class_prototype:
            self.class_prototype[key] = self.class_prototype[key]/self.class_count[key]
    
    # Function to predict labels for multiple feature vectors based on Eucledian Distance
    def predict_E(self, X):
        
        # Predictions array
        pred = []

        # Looping over all feature vectors
        for i in range(len(X)):

            # Distance array
            dist = []
            
            # Calculating the distance of our feature vector from all the prototypes
            for key in self.class_prototype:
                dist.append(np.linalg.norm(X[i] - self.class_prototype[key]))

            # Taking the label of the prototype at the minimum distance
            pred.append(self.class_name[dist.index(min(dist))])
        
        # Returning the predictions array
        return np.array(pred)

In [None]:
# Implement the KNN algorithm classifier without using the sklearn library
class KNNClassifier:

  # Constructor Function for our class
    def __init__(self, k = 5):

        # by default k = 5
        self.k = k  
        
        # class prototypes
        self.X_train = None 
        
        # labels
        self.y_train = None 
        self.isFirst = True
        self.pre_dis = dict()
    

    # Training Function 
    def fit(self, X, y):

        # make sure that X is a 2D array and Y is a 1D array else raise an error 
        if len(X.shape) != 2 and len(y.shape) != 1:
            raise Exception("Error: X must be a 2D array")

        self.X_train = X
        self.y_train = y
    
    # Optimized predict function for HyperTuning using a fixed validation set
    def hyper_tuning_predict(self, X, method='U'):
      
        # Method parameter decides if we want to do a weighted (W) prediction or an unweighted (U) prediction
        # By default, we do an unweighted prediction
        if len(X.shape) != 2:
            raise Exception("Error: X must be a 2D array")
        
        if str(method).lower() != 'u' and str(method).lower() != 'w':
            raise Exception("Error: Please select a valid method : (U/W)")
        
        y_pred = []

        if str(method).lower() == 'u':
          
            for i in range(len(X)):
              
                # predicting the labels of all the feature vector one by one
                label = self.hyper_tuning_predict_one(X[i], i)
                y_pred.append(label)
                
        else:
          
            for i in range(len(X)):
              
                # predicting the labels of all the feature vector one by one
                label = self.hyper_tuning_predict_one_Weighted(X[i], i)
                y_pred.append(label)
                
        self.isFirst = False
        return y_pred

    # Unweighted Single Prediction Method for HyperTuning
    def hyper_tuning_predict_one(self, row, j):

        if self.isFirst:
          
            for i in range(len(self.X_train)):
              
                dist = self.distance(row, self.X_train[i])
                
                if self.pre_dis.get(j) != None:
                    self.pre_dis[j].append((dist, self.y_train[i]))
                    
                else:
                    self.pre_dis[j] = [(dist, self.y_train[i])]

            # Sorting the distance array 
            self.pre_dis[j].sort()

        neighbors = self.pre_dis[j][:self.k]

        weights = dict()
        freq = dict()
        maxfreq = 0
        
        # Calculating the Weights
        for i in range(len(neighbors)):
          
            label = neighbors[i][1]
            
            if weights.get(label) is not None:
                weights[label] += neighbors[i][0]
                freq[label] += 1
                
            else:
                weights[label] = neighbors[i][0]
                freq[label] = 1
            
            maxfreq = max(maxfreq, freq[label])
        
        minWeight = -1
        answer_label = 0

        # Finding the label with minimum weight.
        # Weight of a label is basically the sum of all distances from datapoints of that label
        # This significantly decreases the chances of a Tie.
        for key, val in weights.items():
            if (minWeight == -1 or minWeight > val) and freq[key] == maxfreq:
                answer_label = key
                minWeight = val

        return answer_label
    
    # Weighted Single Prediction method for hyperTuning
    def hyper_tuning_predict_one_Weighted(self, row, j):
      
        if self.isFirst:
          
            for i in range(len(self.X_train)):
              
              dist = self.distance(row, self.X_train[i])
              
              if self.pre_dis.get(j) != None:
                  self.pre_dis[j].append((dist, self.y_train[i]))
                  
              else:
                  self.pre_dis[j] = [(dist, self.y_train[i])]
            
            # Sorting the distance array 
            self.pre_dis[j].sort()

        neighbors = self.pre_dis[j][:self.k]

        # Weight of neighbors
        weights = dict()

        # Calculating the Weights
        for i in range(len(neighbors)):
          
            label = neighbors[i][1]
            
            if weights.get(label) is not None:
                weights[label] += neighbors[i][0]
              
            else:
                weights[label] = neighbors[i][0]
        
        minWeight = -1
        answer_label = 0

        # Finding the label with minimum weight.
        # Weight of a label is basically the some of all distances from datapoints of that label
        # This significantly decreases the chances of a Tie.
        for key, val in weights.items():
            if minWeight == -1 or minWeight > val:
                answer_label = key
                minWeight = val

        return answer_label

    # Function to predict the labels of array of Image
    def predict(self, X, method='U'):
      
        # Method parameter decides if we want to do a weighted (W) prediction or an unweighted (U) prediction
        # By default, we do an unweighted prediction
        if str(method).lower() not in ['u', 'w']:
            raise Exception("Error: Please select a valid method : (U/W)")

        # make sure that X is a 2D array else return an error
        if len(X.shape) != 2:
            raise Exception("Error: X must be a 2D array")

        # Predictions array
        y_pred = []

        if str(method).lower() == 'u':
          
          for row in X:
            
              # predicting the labels of all the feature vector one by one
              label = self.predict_one(row)
              y_pred.append(label)
              
        else:
          
          for row in X:
            
              # predicting the labels of all the feature vector one by one
              label = self.predict_one_Weighted(row)
              y_pred.append(label)
        
        return y_pred
    
    # Predict the label of a single Image
    def predict_one(self, row):
        
        # Distance array
        distances = []

        # Looping over the feature vector
        for i in range(len(self.X_train)):
            dist = self.distance(row, self.X_train[i])
            distances.append((dist, self.y_train[i]))
        
        # Sorting the distance array 
        distances.sort()

        # Extracting first k neighbors
        neighbors = distances[:self.k]

        weights = dict()
        freq = dict()
        maxfreq = 0
        
        # Calculating the Weights
        for i in range(len(neighbors)):
          
            label = neighbors[i][1]
            
            if weights.get(label) is not None:
              
                weights[label] += neighbors[i][0]
                freq[label] += 1
              
            else:
              
                weights[label] = neighbors[i][0]
                freq[label] = 1
            
            maxfreq = max(maxfreq, freq[label])
        
        minWeight = -1
        answer_label = 0

        # Finding the label with minimum weight.
        # Weight of a label is basically the sum of all distances from datapoints of that label
        # This significantly decreases the chances of a Tie.
        for key, val in weights.items():
            if (minWeight == -1 or minWeight > val) and freq[key] == maxfreq:
                answer_label = key
                minWeight = val

        return answer_label
    
    # Precit the label of a feature vector based on weighted KNN
    def predict_one_Weighted(self, row):
        
        # Distance array
        distances = []

        # Looping over the feature vector
        for i in range(len(self.X_train)):
            dist = self.distance(row, self.X_train[i])
            distances.append((dist, self.y_train[i]))
        
        # Sorting the distance array 
        distances.sort()

        # Extracting first k neighbors
        neighbors = distances[:self.k]

        # Weight of neighbors
        weights = dict()
        
        # Calculating the Weights
        for i in range(len(neighbors)):
          
            label = neighbors[i][1]
            
            if weights.get(label) is not None:
                weights[label] += neighbors[i][0]
                
            else:
                weights[label] = neighbors[i][0]
        
        minWeight = -1
        answer_label = 0

        # Finding the label with minimum weight.
        # Weight of a label is basically the sum of all distances from datapoints of that label
        # This significantly decreases the chances of a Tie.
        for key, val in weights.items():
            if minWeight == -1 or minWeight > val:
                answer_label = key
                minWeight = val

        return answer_label
    
    # Function to find Euclidian distance between two feature vector
    def distance(self, a, b):
        return np.linalg.norm(a - b)

In [None]:
xtrain = []
ytrain = []
for doc in train_set:
    if doc in tf_idf_df.keys():
        xtrain.append(tf_idf_df[doc])
        ytrain.append(inv_classes_dict[doc])
        
xtest = []
ytest = []
for doc in test_set:
    if doc in tf_idf_df.keys():
        xtest.append(tf_idf_df[doc])
        ytest.append(inv_classes_dict[doc])

In [None]:
rocchio = Rocchio()
rocchio.fit(xtrain, ytrain)
prediction = rocchio.predict_E(xtest)

report = classification_report(prediction, ytest, zero_division=True)
print(report)

              precision    recall  f1-score   support

         701       0.00      1.00      0.00         0
         704       0.00      1.00      0.00         0
         711       0.00      1.00      0.00         0
         718       0.00      1.00      0.00         0
         724       0.00      1.00      0.00         0
         725       1.00      0.00      0.00         1
         729       0.00      1.00      0.00         0
         733       0.00      1.00      0.00         0
         736       0.00      1.00      0.00         0
         738       0.00      1.00      0.00         0
         740       1.00      0.11      0.20         9
         743       0.00      1.00      0.00         0
         744       1.00      0.50      0.67         4
         748       0.00      1.00      0.00         0
         749       0.00      1.00      0.00         0
         758       0.00      1.00      0.00         0
         765       0.00      1.00      0.00         0
         772       1.00    

In [None]:
knn_1 = KNNClassifier(1)
knn_1.fit(np.array(xtrain), np.array(ytrain))
prediction = knn_1.predict(np.array(xtest))
report = classification_report(prediction, ytest, zero_division=True)
print(report)

              precision    recall  f1-score   support

         701       0.00      1.00      0.00         0
         704       0.00      1.00      0.00         0
         711       0.00      1.00      0.00         0
         718       0.00      1.00      0.00         0
         724       0.00      1.00      0.00         0
         725       1.00      0.00      0.00         1
         728       1.00      0.00      0.00         1
         729       0.00      1.00      0.00         0
         733       0.00      1.00      0.00         0
         736       0.00      1.00      0.00         0
         738       0.00      1.00      0.00         0
         740       0.00      0.00      0.00         2
         742       1.00      0.00      0.00         1
         743       0.00      1.00      0.00         0
         744       0.50      1.00      0.67         1
         748       1.00      0.25      0.40         4
         749       0.00      1.00      0.00         0
         758       0.00    

In [None]:
knn_3 = KNNClassifier(3)
knn_3.fit(np.array(xtrain), np.array(ytrain))
prediction = knn_3.predict(np.array(xtest))
report = classification_report(prediction, ytest, zero_division=True)
print(report)

              precision    recall  f1-score   support

         701       0.00      1.00      0.00         0
         704       0.00      1.00      0.00         0
         711       0.00      1.00      0.00         0
         718       0.00      1.00      0.00         0
         724       0.00      1.00      0.00         0
         725       1.00      0.00      0.00         1
         728       1.00      0.00      0.00         1
         729       0.00      1.00      0.00         0
         733       0.00      1.00      0.00         0
         736       0.00      1.00      0.00         0
         738       0.00      1.00      0.00         0
         740       0.00      0.00      0.00         2
         742       1.00      0.00      0.00         1
         743       0.00      1.00      0.00         0
         744       0.50      1.00      0.67         1
         748       1.00      0.25      0.40         4
         749       0.00      1.00      0.00         0
         758       0.00    

In [None]:
knn_5 = KNNClassifier(5)
knn_5.fit(np.array(xtrain), np.array(ytrain))
prediction = knn_5.predict(np.array(xtest))
report = classification_report(prediction, ytest, zero_division=True)
print(report)

              precision    recall  f1-score   support

         701       0.00      1.00      0.00         0
         704       0.00      1.00      0.00         0
         711       0.00      1.00      0.00         0
         718       0.00      1.00      0.00         0
         724       0.00      1.00      0.00         0
         725       1.00      0.00      0.00         1
         729       0.00      1.00      0.00         0
         733       0.00      1.00      0.00         0
         736       0.00      1.00      0.00         0
         738       0.00      1.00      0.00         0
         740       0.00      0.00      0.00         2
         742       1.00      0.00      0.00         1
         743       0.00      1.00      0.00         0
         744       0.50      1.00      0.67         1
         748       0.00      0.00      0.00         2
         749       0.00      1.00      0.00         0
         758       0.00      1.00      0.00         0
         765       0.00    