Name: Rishi Chhabra
Subject : Text Mining and Infomation Retrieval
Date: 10/24/2024


Loading the Datasets

In [14]:

! git clone https://github.com/guanqun-yang/cs589assignment1.git
! unzip "cs589assignment1/dataset/*.zip" -d cs589assignment1/dataset
! mv cs589assignment1/dataset/java .
! mv cs589assignment1/dataset/python .
! mv cs589assignment1/dataset/javascript .


Cloning into 'cs589assignment1'...
remote: Enumerating objects: 23, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 23 (delta 0), reused 2 (delta 0), pack-reused 21 (from 1)[K
Receiving objects: 100% (23/23), 109.22 MiB | 14.03 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Archive:  cs589assignment1/dataset/javascript.zip
   creating: cs589assignment1/dataset/javascript/
  inflating: cs589assignment1/dataset/javascript/javascript_test_qid.txt  
  inflating: cs589assignment1/dataset/javascript/javascript_qid2all.txt  
  inflating: cs589assignment1/dataset/javascript/javascript_cosidf.txt  

Archive:  cs589assignment1/dataset/java.zip
   creating: cs589assignment1/dataset/java/
  inflating: cs589assignment1/dataset/java/java_test_qid.txt  
  inflating: cs589assignment1/dataset/java/java_cosidf.txt  
  inflating: cs589assignment1/dataset/java/java_qid2all.txt  

Archive:  cs589assignment1/dataset/python.zip
   cre

Commandline Commands for Running Kibana and Elastic Search

In [15]:
# Using kibana-7.9.0 and elastic search-7.9.0
# (base) rishichhabra@MacBookAir Assignment-2 % cd kibana-7.9.0-darwin-x86_64/bin
# ./kibana      
# (base) rishichhabra@MacBookAir Assignment-2 % cd elasticsearch-7.9.0/bin       
# ./elasticsearch

In [16]:
# DLM - Dirichlet Language Model configuration
DLM = {
    "analysis": {
        "analyzer": {
            "my_analyzer": {
                "tokenizer": "whitespace",
                "filter": [
                    "lowercase",
                    "porter_stem"
                ]
            }
        }
    },
    "index": {
        "similarity": {
            "default": {
                "type": "LMDirichlet"
            }
        }
    }
}

# BM25 configuration
BM25 = {
    "analysis": {
        "analyzer": {
            "my_analyzer": {
                "tokenizer": "whitespace",
                "filter": [
                    "lowercase",
                    "porter_stem"
                ]
            }
        }
    },
    "index": {
        "similarity": {
            "default": {
                "type": "BM25"
            }
        }
    }
}

# TF-IDF configuration
TFIDF = {
    "number_of_shards": 1,
    "analysis": {
        "analyzer": {
            "my_analyzer": {
                "tokenizer": "whitespace",
                "filter": [
                    "lowercase",
                    "porter_stem"
                ]
            }
        }
    },
    "index": {
        "similarity": {
            "scripted_tfidf": {
                "type": "scripted",
                "weight_script": {
                    "source": "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;"
                },
                "script": {
                    "source": "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;"
                }
            }
        }
    }
}

map = {
    "properties": {
        "title": {
            "type": "text",
            "analyzer": "my_analyzer"
        },
        "body": {
            "type": "text",
            "analyzer": "my_analyzer"
        },
        "answer": {
            "type": "text",
            "analyzer": "my_analyzer"
        }
    }
}

# Base URL for Elasticsearch
baseURL = "http://localhost:9200/"
# Languages to be indexed
langs = ["python", "java", "javascript"]
indices = {"dlm": DLM, "bm25": BM25, "tfidf": TFIDF}

# Dirichlet Language Model (DLM)
# This model estimates the likelihood of a query being generated from a document.
# It uses Dirichlet smoothing to adjust for zero-frequency terms in the document.
# Smoothing ensures that unseen words get a non-zero probability.

# BM25 (Best Matching 25)
# BM25 is a probabilistic ranking function that ranks documents based on the term frequency (TF),
# inverse document frequency (IDF), and document length normalization.
# It’s a common algorithm for document retrieval, balancing relevance by penalizing overuse of terms.

# TF-IDF (Term Frequency-Inverse Document Frequency)
# TF-IDF weighs terms based on their frequency in a document relative to their frequency across all documents.
# It assigns higher importance to rare terms. This version uses scripted fields for flexibility in custom implementations.


In [17]:
import requests
response = requests.delete(baseURL+"_all")
print(response.json())
for lang in langs:
    for name, settings in indices.items():
        url = baseURL+lang+"_"+name
        data = {"settings": settings, "mappings": map}
        response = requests.put(url, json=data)
        print(response.json())

{'acknowledged': True}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'python_dlm'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'python_bm25'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'python_tfidf'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'java_dlm'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'java_bm25'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'java_tfidf'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'javascript_dlm'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'javascript_bm25'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'javascript_tfidf'}


In [18]:
import pandas as pd
import json
import math
from tqdm.autonotebook import tqdm
import os

qids_missing_data = {}
langs = ["python", "java", "javascript"]

for lang in tqdm(langs):
    qidmap = pd.read_csv(f"{lang}/{lang}_qid2all.txt", sep="\t", header=None)
    qidmap.columns = ["id", "title", "body", "answer"]
    qids_missing_data[lang] = qidmap[qidmap.isnull().any(axis=1)]["id"].tolist()
    qidmap.dropna(inplace=True)
    half = len(qidmap) // 2
    with open(f"{lang}1.json", "w") as ljson:
        json_lines = []
        for row in tqdm(qidmap.iloc[:half].itertuples(index=False), leave=False):
            json_lines.append(json.dumps({"index": {"_id": row.id}}) + "\n")
            json_lines.append(json.dumps({col: getattr(row, col) for col in qidmap.columns if col != "id"}) + "\n")
        ljson.writelines(json_lines)
    with open(f"{lang}2.json", "w") as ljson:
        json_lines = []
        for row in tqdm(qidmap.iloc[half:].itertuples(index=False), leave=False):
            json_lines.append(json.dumps({"index": {"_id": row.id}}) + "\n")
            json_lines.append(json.dumps({col: getattr(row, col) for col in qidmap.columns if col != "id"}) + "\n")
        ljson.writelines(json_lines)


# Initialize empty data structures and define the programming languages.
# For each language:
# Read the data from a file (qid2all.txt).
# Find any rows with missing data and store the corresponding IDs.
# Remove rows with missing data.
# Split the data into two halves.
# Creating two JSON files for Elasticsearch bulk indexing:
# Each JSON file contains an indexing command followed by the document's content for each row, structured for bulk operations.

  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Using curl to Bulk indexes

In [19]:
for lang in tqdm(langs):
    for index in tqdm(indices.keys(), leave=False):
        for i in range(1,3):
            url = baseURL+lang+"_"+index+"/_doc/_bulk"
            file = "@" + lang + str(i) + ".json"
            !curl -s -H "Content-Type: application/json" -XPOST {url} --data-binary {file} >> curl.log
print("Run Success!!!!!!")

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Run Success!!!!!!


Ranking Function for computing ratings for each language

In [20]:
import os
for lang in tqdm(langs):
    for index in tqdm(indices.keys(), leave=False):
        os.makedirs(lang+"_"+index+"_ratings/", exist_ok=True)
        cosidf = pd.read_csv(lang+"/"+lang+"_cosidf.txt", sep="\t", header=0)
        for qid1 in tqdm(cosidf.qid1.unique(), leave=False):
            if qid1 in qids_missing_data[lang]:
                continue
            ratings = []
            qid2s = cosidf.loc[cosidf.qid1 == qid1]
            for qid2 in qid2s.itertuples():
                if qid2 in qids_missing_data[lang]:
                    continue
                ratings.append({"_index": lang+"_"+index, "_id":str(qid2.qid2), "rating": int(qid2.label)})
            with open(lang+"_"+index+"_ratings/"+str(qid1)+".json","w") as rf:
                rf.seek(0)
                rf.truncate(0)
                json.dump(ratings, rf, indent=1)
    

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

NCDG@10 function

In [21]:
from elasticsearch import Elasticsearch
import numpy as np

def ranking(qid1, qid1_title, ratings):
    _search = {
        "requests": [
            {
                "id": str(qid1),
                "request": {
                    "query": {
                        "bool": {
                            "must_not": {
                                "match": {
                                    "_id": qid1
                                }
                            },
                            "should": [
                                {
                                    "match" : {
                                        "title": {
                                            "query": qid1_title,
                                            "boost": 3.0,
                                            "analyzer": "my_analyzer"
                                }}},
                                {
                                    "match" : {
                                        "body": {
                                            "query": qid1_title,
                                            "boost": 3.0,
                                            "analyzer": "my_analyzer"
                                }}},
                                {
                                    "match" : {
                                        "answer": {
                                            "query": qid1_title,
                                            "boost": 3.0,
                                            "analyzer": "my_analyzer"
                                }}}
                            ]}}
                },
                "ratings": ratings
            }
        ],
        "metric":{
            "dcg":{
               "k": 10,
               "normalize": True 
            }
        }
    }
    return _search

def ndcg(index, lang):
    cosidf = pd.read_csv(lang+"/"+lang+"_cosidf.txt", sep="\t", header=0)
    es = Elasticsearch()
    ndcg_list=[]
    for qid1 in tqdm(cosidf.qid1.unique(),leave=False):
        if qid1 in qids_missing_data[lang]:
            continue
        qid1_title = es.get(index=lang+"_"+index, doc_type="_doc", id=qid1)["_source"]["title"]
        with open(lang+"_"+index+"_ratings/"+str(qid1)+".json","r") as rf:
            ratings = json.load(rf)
            _search = ranking(qid1, qid1_title, ratings)
            result = es.rank_eval(index=lang+"_"+index, body=_search)
            ndcg = result["metric_score"]
            ndcg_list.append(ndcg)
    return np.mean(np.array(ndcg_list))

Report Generation

In [22]:
with open("report.txt", "w+") as report:
    report.seek(0)
    report.truncate(0)
    report.write("Name: Rishi Chhabra\n")
    report.write("Date: 10/24/2024\n")
    for lang in tqdm(langs):
        report.write("Lang: "+lang+"\n")
        for index in tqdm(indices.keys(), leave=False):
            report.write("\t"+index+": NDCG@10="+str(ndcg(index, lang))+"\n")

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]