In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 2.8 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 21.0 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.73 pyahocorasick-1.4.4 textsearch-0.0.24


In [4]:
import json
import time
import datetime
import dateutil.parser
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
import re
import contractions

In [27]:
def isEndorsedByStaff(endorsements):
    """Tell whether any endorsement was given by course staff.

    Args:
        endorsements: iterable of endorsement records. A record counts as a
            staff endorsement when it has a 'role' entry containing
            'professor', 'instructor' or 'ta'.

    Returns:
        bool: True if at least one record matches, otherwise False (including
        for an empty iterable).
    """
    staff_roles = ('professor', 'instructor', 'ta')
    # NOTE(review): `in` is a substring test when 'role' is a string, so 'ta'
    # also matches any role that merely contains those letters -- confirm the
    # set of role values that can occur in the data.
    return any(
        'role' in endorsement
        and any(staff_role in endorsement['role'] for staff_role in staff_roles)
        for endorsement in endorsements
    )

def checkValidAnswer(post):
    """Decide whether a reply counts as a valid answer.

    A reply qualifies when its 'type' contains 'i_answer', or when it carries
    a 'tag_endorse' entry for which isEndorsedByStaff() reports a staff
    endorsement.

    Args:
        post: reply record; must have a 'type' entry.

    Returns:
        True / False from the membership checks, or whatever
        isEndorsedByStaff() returns when the decision falls through to it.
    """
    if 'i_answer' in post['type']:
        return True
    if 'tag_endorse' not in post:
        return False
    return isEndorsedByStaff(post['tag_endorse'])

def getAnswerList(post):
    """Collect the text of every valid answer attached to a post.

    Args:
        post: post record; its replies, if any, live under 'children'.

    Returns:
        list: the 'content' of the most recent history entry of each child
        that has a 'type', passes checkValidAnswer() and has a 'history'.
        Empty when the post has no 'children' or none of them qualify.
    """
    if 'children' not in post:
        return []

    collected = []
    for child in post['children']:
        if 'type' not in child or not checkValidAnswer(child):
            continue
        # NOTE(review): this tests whether the history container itself holds
        # the value 'subject', not whether a history entry has a 'subject'
        # key -- confirm that this is the intended filter.
        if 'history' not in child or 'subject' in child['history']:
            continue
        collected.append(getLastModified(child)['content'])

    return collected

def getLastModified(post):
    """Return the most recently created entry of a post's edit history.

    Args:
        post: record with a non-empty 'history' list; every entry needs a
            'created' timestamp that dateutil can parse.

    Returns:
        The history entry with the latest 'created' timestamp. When several
        entries share that timestamp, the one appearing first in the list
        is returned.

    Raises:
        IndexError: if 'history' is empty.
    """
    history = post['history']
    last_modified_answer = history[0]
    last_modified_datetime = dateutil.parser.parse(last_modified_answer['created'])
    # Entry 0 is already the current candidate, so start comparing at entry 1.
    for revision in history[1:]:
        revision_datetime = dateutil.parser.parse(revision['created'])
        if revision_datetime > last_modified_datetime:
            # The original assigned an undefined name here (NameError as soon
            # as a newer revision was found).
            last_modified_datetime = revision_datetime
            last_modified_answer = revision

    return last_modified_answer

def extractDataForEvaluation(filename):
    """Load a JSON export of posts and build the evaluation table.

    Only posts that have a 'history' whose most recent entry carries both a
    'subject' and a 'content' are kept.

    Args:
        filename: path to a JSON file containing a list of post records.

    Returns:
        pandas.DataFrame with columns 'Post' (the post's 'nr'), 'Link' (its
        'question_link'), 'Question' (subject + "." + content of the latest
        revision) and 'AnswerList' (result of getAnswerList for the post).
        Column dtypes are inferred by pandas from the collected values.

    Raises:
        KeyError: if a retained post lacks 'nr' or 'question_link'.
    """
    columns = ['Post', 'Link', 'Question', 'AnswerList']

    # JSON text is UTF-8 by specification; do not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as openfile:
        posts = json.load(openfile)

    # Collect rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the whole frame on every call.
    records = []
    for post in posts:
        if 'history' not in post:
            continue
        last_modified = getLastModified(post)
        if 'subject' in last_modified and 'content' in last_modified:
            records.append({
                'Post': post['nr'],
                'Link': post['question_link'],
                'Question': last_modified['subject'] + "." + last_modified['content'],
                'AnswerList': getAnswerList(post),
            })

    return pd.DataFrame(records, columns=columns)

# Expand contractions in every element of a Series
def perform_contractions(series):
    """Apply contractions.fix to each element of `series`.

    Args:
        series: pandas Series of text values.

    Returns:
        A new Series holding the result of contractions.fix for every element.
    """
    def _expand(text):
        return contractions.fix(text)

    return series.apply(_expand)

def data_cleaning(data):
    """Normalise the 'Question' and 'AnswerList' columns of an evaluation frame.

    Every question and every answer is lower-cased, stripped of HTML markup,
    has URLs replaced by a single space, has whitespace runs collapsed and
    trimmed, and has its contractions expanded. Questions are lower-cased once
    more after the contraction step.

    Args:
        data: DataFrame with a 'Question' column of text and an 'AnswerList'
            column holding a list of answer strings per row.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the cleaned columns.
        The frame passed in, and the answer lists it holds, are not modified.
    """
    url_pattern = r'\s*(https?://|www\.)+\S+(\s+|$)'

    def _strip_markup(text):
        # HTML -> plain text, URLs -> one space, whitespace runs -> one space.
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; name one explicitly if results must be stable
        # across environments.
        text = BeautifulSoup(str(text)).get_text()
        text = re.sub(url_pattern, " ", text, flags=re.UNICODE)
        return re.sub(r"\s+", " ", text, flags=re.UNICODE).strip()

    def _clean_answers(answers):
        # Build a new list so the caller's list objects stay untouched.
        # NOTE(review): unlike questions, answers are not lower-cased again
        # after contractions.fix -- confirm whether that is intentional.
        return [contractions.fix(_strip_markup(answer.lower())) for answer in answers]

    # Re-index into a new frame first so that nothing below writes into the
    # caller's DataFrame.
    data = data.reset_index(drop=True)

    questions = data["Question"].str.lower().apply(_strip_markup)
    questions = perform_contractions(questions)
    # Expanding contractions may introduce upper-case letters again.
    data["Question"] = questions.str.lower()

    data["AnswerList"] = data["AnswerList"].apply(_clean_answers)

    return data

In [28]:
# Build the evaluation set from the four course JSON exports, kept in the
# same order as before. `df` is retained as an alias of the concatenated,
# not-yet-cleaned frame.
df = pd.concat([
    extractDataForEvaluation(path)
    for path in (
        "Data/fall_2022_nlp.json",
        "Data/spring_2022_nlp.json",
        "Data/image_spring_2022_nlp.json",
        "Data/image_fall_2022_nlp.json",
    )
])
evaluation_data = df

# Clean
evaluation_data = data_cleaning(evaluation_data)

In [29]:
# NOTE(review): the wildcard import can rebind names already defined in this
# notebook (for instance data_cleaning), which would make the cells above
# behave differently when re-run after this one -- confirm which names
# dataextracter exports and consider importing them explicitly.
from dataextracter import *

# Build the training corpus from the same four course JSON exports.
train_data = pd.concat([
    extractData(path)
    for path in (
        "Data/fall_2022_nlp.json",
        "Data/spring_2022_nlp.json",
        "Data/image_spring_2022_nlp.json",
        "Data/image_fall_2022_nlp.json",
    )
])

# Clean
train_data = data_cleaning(train_data)



  data["Sentence"] = data["Sentence"].apply(lambda x: BeautifulSoup(str(x)).get_text())


In [30]:
!pip --q install sentence_transformers torch
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd

In [38]:
# Load the sentence-embedding model and embed every training sentence once,
# so each query below only needs a single cosine-similarity lookup.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Pass a plain list rather than the Series: a Series is indexed by label, so
# positional access inside encode() would depend on train_data's index.
corpus_embeddings = embedder.encode(train_data['Sentence'].tolist(), convert_to_tensor=True)

In [42]:
# Score the retriever: for every evaluation question, fetch the top_k most
# similar training sentences and award credit when the first expected answer
# appears among them. The printed figure is the total credit divided by the
# number of evaluation questions (questions without answers count as zero).
top_k = 5
# Credit for finding the expected answer at rank 1, 2 and 3 respectively.
rank_credit = (1, 0.8, 0.6)
sum_score = 0
num_questions = len(evaluation_data['Question'])
for i in range(num_questions):
    query = evaluation_data['Question'][i]
    print("\n\n======================\n\n")
    print("Query:", query)

    # Nothing to compare against when the question has no accepted answer.
    if not evaluation_data['AnswerList'][i]:
        continue

    expected_answer = evaluation_data['AnswerList'][i][0]

    print("Expected Answer:", expected_answer)

    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity against the whole corpus, then the best-scoring rows.
    # k is capped so a corpus smaller than top_k does not make topk fail.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=min(top_k, len(cos_scores)))

    print("\nTop most similar posts:")
    posts = []
    for score, idx in zip(top_results[0], top_results[1]):
        # NOTE(review): int() truncates, so only scores >= 1.0 are skipped;
        # a near-identical sentence scoring 0.9999 is kept -- confirm whether
        # a tolerance-based comparison is wanted here.
        if int(score) != 1:
            posts.append(train_data.iloc[idx.item()]['Sentence'])
    print(posts)

    # Rank-dependent credit; the bounds check covers the case where fewer
    # than three posts survived the filter above.
    for rank, credit in enumerate(rank_credit):
        if rank < len(posts) and expected_answer == posts[rank]:
            sum_score += credit

    # Flat credit for appearing anywhere in the retrieved posts (added on top
    # of any rank credit, as before).
    if expected_answer in posts:
        sum_score += 0.5

print(sum_score / num_questions if num_questions else 0.0)





Query: quiz. 9 - neural language models use word-embedding models in their training..hi , when taught in class it was mentioned that a by product of neural language models was the word embeddings but i do not remember word embeddings to be involved in training. please can someone confirm this. thanks
Expected Answer: yes, word embedding are not used to rain word2vec. we only use one-hot vectors and then learn word embeddings as the weights of the two-layer network.

Top most similar posts:
['no, that question is not about the two-layer network we used to learn word embeddings. that network itself never is used as a neural language modelf. the question is about neural language models that receive word embeddings as their input. in short, you need to pay attention to the network that the question is asking about.', 'so the question in quiz 9 should be false? the answer was true for the question: neural language models use word-embedding models in their training.', 'yes, word embeddin