Name: Aidan Fischer

Date: 10/18/2023

I pledge my honor that I have abided by the Stevens Honor System

This notebook is written to be repeatable: generated data and files are reset before being regenerated.

In [6]:
# Install the Elasticsearch Python client, pinned to version 7.9.1, into the kernel's environment
%pip install elasticsearch==7.9.1

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Define indices

def _index_settings(similarity, **extra):
    """Build the ``settings`` body for one index.

    Every index shares the same custom analyzer (whitespace tokenizer, then
    lowercasing and Porter stemming) and differs only in its similarity.

    Args:
        similarity: Similarity definition, e.g. ``{"type": "BM25"}``. It is
            registered under the name ``"default"`` so that it applies to every
            field that does not name a similarity in its mapping; the shared
            mapping below names none.
        **extra: Additional top-level settings such as ``number_of_shards``.

    Returns:
        A new settings dict ready to be sent as the ``settings`` of an index.
    """
    return {
        **extra,
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "whitespace",
                    "filter": ["lowercase", "porter_stem"],
                }
            }
        },
        "index": {"similarity": {"default": similarity}},
    }


# Language model with Dirichlet smoothing.
DLM = _index_settings({"type": "LMDirichlet"})

# Okapi BM25.
BM25 = _index_settings({"type": "BM25"})

# Scripted TF-IDF. It must be registered as "default" like the other two:
# under any other name it is only used by fields whose mapping references it,
# and the shared mapping never does, so the index would not score with it.
TFIDF = _index_settings(
    {
        "type": "scripted",
        "weight_script": {
            "source": "double idf = Math.log((field.docCount+1.0)/(term.docFreq+1.0)) + 1.0; return query.boost * idf;"
        },
        "script": {
            "source": "double tf = Math.sqrt(doc.freq); double norm = 1/Math.sqrt(doc.length); return weight * tf * norm;"
        },
    },
    number_of_shards=1,
)

# Field mapping shared by every index: all three text fields use the custom analyzer.
# NOTE: this name shadows the built-in map(); it is kept because the index-creation
# cell refers to it by this name.
map = {
    "properties": {
        field: {"type": "text", "analyzer": "my_analyzer"}
        for field in ("title", "body", "answer")
    }
}

# Base URL that index names are appended to.
baseURL = "http://localhost:9200/"

# Dataset languages; each one gets an index per similarity.
langs = ["python","java","javascript"]

# Index-name suffix -> settings for that similarity.
indices = {"dlm": DLM, "bm25": BM25, "tfidf": TFIDF}

In [8]:
import requests

# Use requests to delete this notebook's existing indices and put fresh indices on the server

# Delete only the indices this notebook creates. Deleting "_all" would also
# destroy every unrelated index that happens to live on the same server.
index_names = [lang+"_"+name for lang in langs for name in indices]
response = requests.delete(
    baseURL+",".join(index_names),
    # On a first run none of the indices exist yet; do not treat that as an error.
    params={"ignore_unavailable": "true"},
)
print(response.json())
response.raise_for_status()

for lang in langs:
    for name, settings in indices.items():
        url = baseURL+lang+"_"+name
        data = {"settings": settings, "mappings": map}
        response = requests.put(url, json=data)
        print(response.json())
        # Stop on a failed create (the error body is printed above). Otherwise the
        # later bulk load would auto-create the index with default settings and
        # the evaluation would silently run against the wrong configuration.
        response.raise_for_status()

{'acknowledged': True}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'python_dlm'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'python_bm25'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'python_tfidf'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'java_dlm'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'java_bm25'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'java_tfidf'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'javascript_dlm'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'javascript_bm25'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'javascript_tfidf'}


In [9]:
import pandas as pd
import json
import math
import numpy as np
from tqdm.autonotebook import tqdm, trange
# Create JSON files from dataset

def _write_bulk_file(path, rows, fields):
    """Write ``rows`` to ``path`` as alternating bulk action and document lines.

    For each row, one line holds ``{"index": {"_id": <row id>}}`` and the next
    holds the row's ``fields`` as a JSON object.

    Args:
        path: Output file path; an existing file is overwritten.
        rows: DataFrame with an ``id`` column plus every column in ``fields``.
        fields: Column names to include in each document.
    """
    # Mode "w" truncates on open, so a re-run always starts from an empty file.
    with open(path, "w") as bulk_file:
        for row in tqdm(rows.itertuples(index=False), leave=False):
            bulk_file.write(json.dumps({"index": {"_id": row.id}}) + "\n")
            bulk_file.write(json.dumps({col: getattr(row, col) for col in fields}) + "\n")


# lang -> ids of rows dropped because at least one column was missing.
qids_missing_data = {}

for lang in tqdm(langs):
    qidmap = pd.read_csv(lang+"/"+lang+"_qid2all.txt", sep="\t", header=None)
    qidmap.columns = ["id", "title", "body", "answer"]

    # Record which ids are dropped so later cells can skip them.
    incomplete = qidmap.isnull().any(axis=1)
    qids_missing_data[lang] = qidmap.loc[incomplete, "id"].tolist()

    # Build a new frame instead of mutating qidmap in place.
    qidmap_complete = qidmap.loc[~incomplete]
    doc_fields = [col for col in qidmap_complete.columns if col != "id"]

    # Split the documents across two files, <lang>1.json and <lang>2.json.
    half = len(qidmap_complete) // 2
    _write_bulk_file(lang+"1.json", qidmap_complete.iloc[:half], doc_fields)
    _write_bulk_file(lang+"2.json", qidmap_complete.iloc[half:], doc_fields)

  from tqdm.autonotebook import tqdm, trange


  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [10]:
# Use curl to bulk index.
for lang in tqdm(langs):
    for index in tqdm(indices.keys(), leave=False):
        for i in trange(1,3, leave=False):
            url = baseURL+lang+"_"+index+"/_doc/_bulk"
            file = "@" + lang + str(i) + ".json"
            !curl -s -H "Content-Type: application/json" -XPOST {url} --data-binary {file} >> curl.log
print("Done!")

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Done!


In [12]:
# Compute ratings for each language

import os

for lang in tqdm(langs):
    # The labelled pairs depend only on the language, so read them once here
    # rather than once per index.
    cosidf = pd.read_csv(lang+"/"+lang+"_cosidf.txt", sep="\t", header=0)
    # Ids that were dropped from the indexed data because of missing columns.
    missing = set(qids_missing_data[lang])
    for index in tqdm(indices.keys(), leave=False):
        ratings_dir = lang+"_"+index+"_ratings/"
        os.makedirs(ratings_dir, exist_ok=True)
        for qid1 in tqdm(cosidf.qid1.unique(), leave=False):
            # Skip query ids that were dropped due to missing columns.
            if qid1 in missing:
                continue
            pairs = cosidf.loc[cosidf.qid1 == qid1]
            # One rating per candidate document. The membership test must use the
            # candidate's id (pair.qid2): testing the whole row tuple never matches,
            # so dropped documents would still be rated.
            ratings = [
                {"_index": lang+"_"+index, "_id": str(pair.qid2), "rating": int(pair.label)}
                for pair in pairs.itertuples()
                if pair.qid2 not in missing
            ]
            # Mode "w" truncates on open, so re-runs overwrite the previous file.
            with open(ratings_dir+str(qid1)+".json", "w") as rf:
                json.dump(ratings, rf, indent=1)
    

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

In [13]:
# NDCG@10 function
from elasticsearch import Elasticsearch
import numpy as np

def ranking(qid1, qid1_title, ratings):
    """Build the rank-evaluation request body for a single query question.

    The query text is the question's title, matched against the ``title``,
    ``body`` and ``answer`` fields; the question's own document is excluded
    from the results. The metric is normalized DCG at k=10.

    Args:
        qid1: Id of the query question; used as the request id and excluded by ``_id``.
        qid1_title: Title text used as the query for all three fields.
        ratings: Rated documents for this query, passed through unchanged.

    Returns:
        A dict with one entry under ``"requests"`` and the ``"metric"`` definition.
    """
    # One boosted match clause per searchable field, all using the same query text.
    field_clauses = [
        {
            "match": {
                field: {
                    "query": qid1_title,
                    "boost": 3.0,
                    "analyzer": "my_analyzer",
                }
            }
        }
        for field in ("title", "body", "answer")
    ]
    query = {
        "bool": {
            # Keep the query question itself out of its own result list.
            "must_not": {"match": {"_id": qid1}},
            "should": field_clauses,
        }
    }
    request = {
        "id": str(qid1),
        "request": {"query": query},
        "ratings": ratings,
    }
    metric = {"dcg": {"k": 10, "normalize": True}}
    return {"requests": [request], "metric": metric}

def ndcg(index, lang):
    """Return the mean rank-evaluation score over all query questions of one index.

    For every distinct ``qid1`` in the language's cosidf file (except ids recorded
    in ``qids_missing_data``), the question's title is fetched from the index, its
    ratings file is loaded, and a rank-evaluation request built by ``ranking`` is
    run against the index.

    Args:
        index: Index-name suffix; the index queried is ``<lang>_<index>``.
        lang: Language prefix, also used to locate the cosidf and ratings files.

    Returns:
        The mean of the per-query ``metric_score`` values.
    """
    index_name = lang+"_"+index
    cosidf = pd.read_csv(lang+"/"+lang+"_cosidf.txt", sep="\t", header=0)
    es = Elasticsearch()
    scores = []
    for qid1 in tqdm(cosidf.qid1.unique(), leave=False):
        # No document was indexed for ids dropped because of missing columns.
        if qid1 in qids_missing_data[lang]:
            continue
        doc = es.get(index=index_name, doc_type="_doc", id=qid1)
        qid1_title = doc["_source"]["title"]
        with open(index_name+"_ratings/"+str(qid1)+".json", "r") as rf:
            ratings = json.load(rf)
        request_body = ranking(qid1, qid1_title, ratings)
        result = es.rank_eval(index=index_name, body=request_body)
        scores.append(result["metric_score"])
    return np.mean(np.array(scores))

In [14]:
# Generate report
header_lines = [
    "Name: Aidan Fischer\n",
    "Date: 10/18/2023\n",
    "I pledge my honor that I have abided by the Stevens Honor System\n\n",
    "Note: All commands run as part of this homework were executed through the submitted Python notebook. \n\n",
]

# Opening in "w+" already empties the file, so each run rewrites the report from scratch.
with open("report.txt", "w+") as report:
    report.writelines(header_lines)
    for lang in tqdm(langs):
        report.write(f"Lang: {lang}\n")
        # One line per similarity, written as soon as its score is available.
        for index in tqdm(indices.keys(), leave=False):
            score = ndcg(index, lang)
            report.write("\t"+index+": NDCG@10="+str(score)+"\n")

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/6410 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/8448 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]

  0%|          | 0/8069 [00:00<?, ?it/s]