In [8]:
import json
import time
import datetime
import dateutil.parser
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
import re
import contractions

In [9]:
def isEndorsedByStaff(endorsements):
    """Tell whether any endorsement in ``endorsements`` carries a staff role.

    An endorsement counts when it has a ``'role'`` entry that contains
    ``'professor'``, ``'instructor'`` or ``'ta'``.

    Args:
        endorsements: Iterable of endorsement mappings; entries without a
            ``'role'`` key are ignored.

    Returns:
        bool: ``True`` on the first matching endorsement, ``False`` otherwise
        (including for an empty iterable).
    """
    staff_markers = ('professor', 'instructor', 'ta')
    for endorsement in endorsements:
        if 'role' not in endorsement:
            continue
        role = endorsement['role']
        # NOTE(review): when role is a string, `in` is a substring test, so
        # 'ta' also matches any role text containing "ta" -- confirm intended.
        if any(marker in role for marker in staff_markers):
            return True
    # Explicit False instead of falling off the end and returning None.
    return False
    
def checkValidAnswer(post):
    """Decide whether ``post`` should be treated as a valid answer.

    A post qualifies when ``'i_answer'`` is found in ``post['type']``.
    Otherwise, if the post has a ``'tag_endorse'`` entry, the decision is
    delegated to ``isEndorsedByStaff`` and its result is returned as-is.
    A post with neither yields ``False``.

    Raises:
        KeyError: if ``post`` has no ``'type'`` key.
    """
    if 'i_answer' in post['type']:
        return True
    if 'tag_endorse' not in post:
        return False
    return isEndorsedByStaff(post['tag_endorse'])

def getAnswerList(post):
    """Collect the content of every valid answer attached to ``post``.

    Each entry of ``post['children']`` is kept when it has a ``'type'``,
    has a non-empty ``'history'``, and passes ``checkValidAnswer``. For a
    kept entry, the ``'content'`` of the revision picked by
    ``getLastModified`` is added to the result.

    Args:
        post: Post mapping; the ``'children'`` key is optional.

    Returns:
        list: Answer contents in the order the children appear. Empty when
        there are no children or none of them qualifies.
    """
    answerList = []
    for postAnswer in post.get('children') or []:
        if 'type' not in postAnswer or 'history' not in postAnswer:
            continue
        history = postAnswer['history']
        # An empty history has no revision to read; getLastModified would
        # fail on history[0], so skip the entry instead of crashing.
        if not history:
            continue
        # NOTE(review): this tests membership in `history` itself, not in its
        # revisions; if history is a list of dicts it never matches. Kept
        # as-is to preserve behaviour -- confirm the intended check.
        if 'subject' in history:
            continue
        if not checkValidAnswer(postAnswer):
            continue
        last_modified = getLastModified(postAnswer)
        # Same guard extractData applies to top-level posts: a revision
        # without 'content' contributes nothing.
        if 'content' in last_modified:
            answerList.append(last_modified['content'])
    return answerList

def getLastModified(post):
    """Return the entry of ``post['history']`` with the latest ``'created'`` time.

    Every entry's ``'created'`` value is parsed with
    ``dateutil.parser.parse`` and compared; an entry replaces the current
    pick only when its timestamp is strictly later, so on equal timestamps
    the earliest-positioned entry wins.

    Raises:
        KeyError: if ``post`` has no ``'history'`` key.
        IndexError: if the history is empty.
    """
    revisions = post['history']
    newest = revisions[0]
    newest_stamp = dateutil.parser.parse(revisions[0]['created'])
    for revision in revisions:
        stamp = dateutil.parser.parse(revision['created'])
        if stamp > newest_stamp:
            newest, newest_stamp = revision, stamp
    return newest

def extractData(filename):
    """Load posts from a JSON file and flatten them into (Post, Sentence) rows.

    For every post that has a ``'history'``, the revision returned by
    ``getLastModified`` is inspected. If it has both ``'subject'`` and
    ``'content'`` and the content contains no ``'<img'``, one row is emitted
    with ``subject + "." + content``, followed by one row per answer returned
    by ``getAnswerList``. All rows of a post share that post's ``'nr'`` value.

    Args:
        filename: Path of a JSON file holding a list of post mappings
            (presumably a Piazza export -- verify against the data source).

    Returns:
        pandas.DataFrame: Columns ``'Post'`` and ``'Sentence'``, one row per
        question or answer, in file order. Empty (with both columns present)
        when no post qualifies.

    Raises:
        KeyError: if a qualifying post has no ``'nr'`` key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as openfile:
        posts = json.load(openfile)

    # Rows are accumulated in a plain list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []
    for post in posts:
        if 'history' not in post:
            continue
        last_modified = getLastModified(post)
        if 'subject' not in last_modified or 'content' not in last_modified:
            continue
        content = last_modified['content']
        # Posts whose text embeds an image are left out entirely.
        if '<img' in content:
            continue
        post_ID = post['nr']
        records.append({'Post': post_ID, 'Sentence': last_modified['subject'] + "." + content})
        for answer in getAnswerList(post):
            records.append({'Post': post_ID, 'Sentence': answer})

    return pd.DataFrame(records, columns=['Post', 'Sentence'])

def perform_contractions(series):
    """Run ``contractions.fix`` on every element of ``series``.

    Args:
        series: pandas Series whose elements are passed one by one to
            ``contractions.fix`` (presumably strings -- confirm with caller).

    Returns:
        A new Series holding the per-element results.
    """
    def _fix_text(text):
        return contractions.fix(text)

    return series.apply(_fix_text)

def data_cleaning(data):
    """Normalise the ``'Sentence'`` column and drop duplicate rows.

    Steps, in order: lowercase, strip HTML markup, remove URLs, collapse
    whitespace, run ``perform_contractions``, lowercase again, drop
    duplicate rows, and reset the index.

    Args:
        data: DataFrame with a ``'Sentence'`` column. It is not modified.

    Returns:
        pandas.DataFrame: A cleaned copy with duplicates removed and a fresh
        0..n-1 index.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    cleaned = data.copy()

    # Convert to lowercase
    sentences = cleaned["Sentence"].str.lower()

    # Remove all HTML tags. The parser is named explicitly so the result does
    # not depend on which optional parsers are installed, and BeautifulSoup
    # does not warn about having to guess one.
    sentences = sentences.apply(lambda x: BeautifulSoup(str(x), "html.parser").get_text())

    # Remove all URLs
    sentences = sentences.apply(lambda x: re.sub(r'\s*(https?://|www\.)+\S+(\s+|$)', " ", str(x), flags=re.UNICODE))

    # Remove extra spaces
    sentences = sentences.apply(lambda x: re.sub(r"\s+", " ", str(x), flags=re.UNICODE).strip())

    sentences = perform_contractions(sentences)

    # Lowercase again: the contractions step may introduce uppercase letters
    cleaned["Sentence"] = sentences.str.lower()

    cleaned = cleaned.drop_duplicates()
    cleaned = cleaned.reset_index(drop=True)
    return cleaned




In [10]:
# Extract: one frame per semester export, stacked in the order listed
nlp_json_paths = [
    "/content/drive/My Drive/CSCI 544 - Project/piazzaSmartSearch-main/Data/fall_22_nlp.json",
    "/content/drive/My Drive/CSCI 544 - Project/piazzaSmartSearch-main/Data/spring_22_nlp.json",
]
extracted_frames = [extractData(json_path) for json_path in nlp_json_paths]
data = pd.concat(extracted_frames)

# Clean
data = data_cleaning(data)