In [1]:
# Mount Google Drive inside the Colab VM so the dataset under
# /content/drive/MyDrive is reachable as ordinary files.
# force_remount=True re-mounts even if a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%%capture
!pip install rank-bm25

In [3]:
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_documents, preprocess_string

In [4]:
# Fetch the NLTK data packages used by this notebook.
# 'stopwords' backs stopwords.words('english') in preprocess_text;
# 'wordnet' is presumably what WordNetLemmatizer reads - TODO confirm;
# 'punkt' is presumably for word_tokenize, which is imported but not
# called in the cells shown here.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Load the ILDC_multi CSV from the mounted Drive folder into a DataFrame.
# The path only resolves after the drive.mount cell has been run.
csv_file_path = "/content/drive/MyDrive/IR_PROJECT_DATASET/ILDC_multi.csv"
df = pd.read_csv(csv_file_path)

In [19]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,text,label,split,name
0,"Uday Umesh Lalit, J. These appeals arise out ...",0,train,2020_1.txt
1,"Indira Banerjee, J. These appeals are against...",0,train,2020_2.txt
2,TABLE OF CONTENTS Introduction A Contentions B...,0,train,2020_3.txt
3,"Dinesh Maheshwari, J. Introductory with brief...",0,train,2020_4.txt
4,"Dinesh Maheshwari, J. Preliminary By way of t...",0,train,2020_5.txt


In [6]:
# Show the column labels available in the loaded DataFrame.
df.columns

Index(['text', 'label', 'split', 'name'], dtype='object')

In [7]:
# Lazily-built (tokenizer, stop-word set, lemmatizer) triple shared by all
# preprocess_text calls; None until the first call.
_PREPROCESS_RESOURCES = None


def _get_preprocess_resources():
    """Build the tokenizer, stop-word set and lemmatizer once and reuse them."""
    global _PREPROCESS_RESOURCES
    if _PREPROCESS_RESOURCES is None:
        _PREPROCESS_RESOURCES = (
            RegexpTokenizer(r'\w+'),
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Turn raw text into a list of normalized tokens.

    The text is lower-cased, split into runs of word characters (``\\w+``),
    filtered against NLTK's English stop-word list, and each remaining
    token is passed through ``WordNetLemmatizer.lemmatize``.

    The tokenizer, stop-word set and lemmatizer are created on the first
    call and cached, so applying this function to every row of a large
    corpus does not re-read the stop-word corpus for each document.

    NOTE(review): the cells that build the index and run queries call
    gensim's ``preprocess_string`` rather than this function - confirm
    which tokenizer is intended before switching one side only.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: Lemmatized, lower-cased tokens with stop words removed,
        in their original order.
    """
    tokenizer, stop_words, lemmatizer = _get_preprocess_resources()
    return [
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(text.lower())
        if token not in stop_words
    ]

In [8]:
# Tokenize every document with gensim's preprocess_string; missing texts
# are replaced by '' first so each row yields a (possibly empty) token list.
df_tokens = df['text'].fillna('').apply(preprocess_string)

In [9]:
# Persist the per-document token lists to Drive so they can be reloaded
# without repeating the preprocessing step.
tokens_path = "/content/drive/MyDrive/IR_PROJECT_DATASET/tokens.pkl"
with open(tokens_path, mode='wb') as token_file:
    pickle.dump(df_tokens, token_file)

In [10]:
# Reload the cached token lists from Drive.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so
# only load a tokens.pkl that this notebook (or a trusted source) wrote.
tokens_path = "/content/drive/MyDrive/IR_PROJECT_DATASET/tokens.pkl"
with open(tokens_path, mode='rb') as token_file:
    df_tokens = pickle.load(token_file)

In [11]:
# Build the BM25 index over the tokenized corpus; document i in the index
# is the i-th entry of df_tokens.
bm25_index = BM25Okapi(list(df_tokens))

In [15]:
def search(search_string, num_results=10):
    """Return the positions of the best BM25 matches for a query.

    The query is tokenized with gensim's ``preprocess_string`` and scored
    against the module-level ``bm25_index``.

    Args:
        search_string: Free-text query.
        num_results: Maximum number of positions to return. ``None``
            returns the full ranking. Zero or a negative value returns
            an empty result.

    Returns:
        numpy.ndarray: Integer positions into the corpus that
        ``bm25_index`` was built from, ordered from highest to lowest
        score. Empty when the query yields no tokens after preprocessing
        (e.g. an empty or stop-word-only query), because with nothing to
        match the ranking would be arbitrary.
    """
    search_tokens = preprocess_string(search_string)
    if not search_tokens:
        return np.array([], dtype=np.intp)
    # A negative slice bound would silently drop results from the tail
    # instead of limiting the count, so treat it like "no results".
    if num_results is not None and num_results <= 0:
        return np.array([], dtype=np.intp)
    scores = bm25_index.get_scores(search_tokens)
    # argsort is ascending; reverse it so the best score comes first.
    return np.argsort(scores)[::-1][:num_results]

In [32]:
# Run a sample query; the result holds row positions ordered from the
# highest-scoring document downwards.
indexes = search('snatch')
indexes

array([24793, 18491, 23986,  9391,   878, 26639, 10824, 11607, 12743,
       23254])

In [35]:
# `search` returns positions produced by np.argsort over the BM25 scores,
# i.e. row numbers in corpus order, not index labels. Select rows by
# position with .iloc so the lookup stays correct even if `df` no longer
# has a default 0..n-1 index; then keep only the columns to report.
res_df = df.iloc[indexes][['text', 'name']]

In [36]:
# Display the text and file name of the retrieved documents.
res_df

Unnamed: 0,text,name
24793,"M. Ahmadi, J. Special leave granted. Heard co...",1991_268.txt
18491,CRIMINAL APPELLATE JURISDICTION Criminal Appea...,1962_431.txt
23986,"Ratnavel Pandian, J. This appeal by special l...",1990_642.txt
9391,"CRIMINAL APPEAL NO. 927 OF 2006 P. MATHUR, J....",2007_930.txt
878,"BANUMATHI, J. Leave granted. The appellant ha...",2019_71.txt
26639,"Jayachandra Reddy, J. These two appeals arise...",1993_308.txt
10824,WITH CRIMINAL APPEAL NO. 74 OF 2006 Arising o...,2006_29.txt
11607,WITH CRIMINAL APPEAL NO. 578 OF 2004 SLP C...,2004_378.txt
12743,"ARIJIT PASAYAT, J. Appellant faced trial for ...",2003_1132.txt
23254,CRIMINAL APPELLATE JURISDICTION Criminal Appea...,1983_146.txt


In [37]:
# Write the retrieved documents to top_10.csv (relative to the current
# working directory); index=False leaves the row index out of the file.
res_df.to_csv('top_10.csv', index=False)