In [1]:
import xml.etree.ElementTree as ET
import sys
sys.path.append("/data/szr207/projects/ArqMath/ARQMathCode/")

In [2]:
# A `!` shell escape runs in a throwaway subshell, so `!export ...` never
# reaches the kernel or any later cell. Setting os.environ makes the variable
# visible to this process and to the child processes it starts. The kernel's
# own import path is already extended by the sys.path.append call above.
import os

os.environ["PYTHONPATH"] = "/data/szr207/projects/ArqMath/ARQMathCode/"

In [3]:
from Entity_Parser_Record.comment_parser_record import CommentParserRecord
from Entity_Parser_Record.post_link_parser_record import PostLinkParserRecord
from Entity_Parser_Record.post_parser_record import PostParserRecord
from Entity_Parser_Record.user_parser_record import UserParserRecord
from Entity_Parser_Record.vote_parser_record import VoteParserRecord
from Visualization.generate_html_file import HtmlGenerator
import argparse


class DataReaderRecord:
    """
    Data reader for the MSE ARQMath dataset.

    The constructor reads every data file and links the related records together.
    The query methods below are examples of how to work with the loaded data.
    To generate the HTML file for given threads (questions), call
    ``get_html_pages`` with the list of question ids.

    The main difference from the other DataReader is that each file is read
    record by record here.
    """

    def __init__(self, root_file_path):
        """
        Read all data files of the MSE ARQMath dataset and link the related data.

        Users (with badges), comments, votes and post links are parsed first; their
        maps are then handed to the post parser so posts can be linked to them.

        :param root_file_path: The root directory of MSE ARQMath Dataset.
        """
        # NOTE(review): the posts file name uses underscores while the other
        # files use dots -- confirm it matches the file on disk.
        post_file_path = root_file_path + "/Posts_V1_0.xml"
        badges_file_path = root_file_path + "/Badges.V1.0.xml"
        comments_file_path = root_file_path + "/Comments.V1.0.xml"
        votes_file_path = root_file_path + "/Votes.V1.0.xml"
        users_file_path = root_file_path + "/Users.V1.0.xml"
        post_links_file_path = root_file_path + "/PostLinks.V1.0.xml"

        print("reading users")
        self.user_parser = UserParserRecord(users_file_path, badges_file_path)
        print("reading comments")
        self.comment_parser = CommentParserRecord(comments_file_path)
        print("reading votes")
        self.vote_parser = VoteParserRecord(votes_file_path)
        print("reading post links")
        self.post_link_parser = PostLinkParserRecord(post_links_file_path)
        print("reading posts")
        # Posts are read last because the post parser consumes the maps built above.
        self.post_parser = PostParserRecord(post_file_path, self.comment_parser.map_of_comments_for_post,
                                            self.post_link_parser.map_related_posts,
                                            self.post_link_parser.map_duplicate_posts,
                                            self.vote_parser.map_of_votes, self.user_parser.map_of_user)

    def get_list_of_questions_posted_in_a_year(self, year):
        """
        Return the questions whose creation date falls in the given year.

        Questions without a creation date are skipped.

        :param year: The year as an integer, e.g. 2010.
        :return: List of question objects created in that year (possibly empty).
        """
        lst_of_question = []
        for question in self.post_parser.map_questions.values():
            if question.creation_date is None:
                continue
            # Dates look like '2018-01-01T00:09:16.293'; the year is the text
            # before the first '-' of the date part.
            creation_year = int(question.creation_date.split("T")[0].split("-")[0])
            if creation_year == year:
                lst_of_question.append(question)
        return lst_of_question

    def get_answers_for_question(self, question_id):
        """
        Return the answers attached to a question.

        :param question_id: Id of the question.
        :return: The ``answers`` attribute of the question, or None when the
                 question id is unknown.
        """
        if question_id not in self.post_parser.map_questions:
            return None
        return self.post_parser.map_questions[question_id].answers

    def get_user(self, user_id):
        """
        Look up a user by id.

        :param user_id: Id of the user.
        :return: The user object, or None when the id is unknown.
        """
        if user_id not in self.user_parser.map_of_user:
            return None
        return self.user_parser.map_of_user[user_id]

    def get_answers_posted_by_user(self, user_id):
        """
        Return every answer whose owner is the given user.

        Answers without an owner id never match.

        :param user_id: Id of the user.
        :return: List of answer objects owned by the user (possibly empty).
        """
        lst_of_answers = []
        # map_answers groups the answers per parent question id.
        for lst_answer in self.post_parser.map_answers.values():
            for answer in lst_answer:
                if answer.owner_user_id is not None and answer.owner_user_id == user_id:
                    lst_of_answers.append(answer)
        return lst_of_answers

    def get_question_of_tag(self, tag):
        """
        Return the questions that carry the given tag.

        :param tag: The tag to look for, e.g. "calculus".
        :return: List of question objects whose ``tags`` contain the tag
                 (possibly empty).
        """
        lst_of_questions = []
        for question in self.post_parser.map_questions.values():
            lst_tags = question.tags
            # A question without tags cannot match and must not raise.
            if not lst_tags:
                continue
            if tag in lst_tags:
                # Collect the question itself; appending the tag string would
                # return N copies of the search term instead of the matches.
                lst_of_questions.append(question)
        return lst_of_questions

    def get_html_pages(self, lst_of_questions_id, result_directory):
        """
        Generate the HTML pages for the given questions.

        :param lst_of_questions_id: List of question ids to render.
        :param result_directory: Directory passed to the HTML generator for its output.
        :return: None
        """
        HtmlGenerator.questions_to_html(lst_of_questions_id, self, result_directory)

    def get_duplicate(self, lst_of_questions_id, result_directory):
        """
        Generate the HTML pages for the given questions.

        Currently identical to ``get_html_pages``: no duplicate-specific handling
        is implemented here.

        :param lst_of_questions_id: List of question ids to render.
        :param result_directory: Directory passed to the HTML generator for its output.
        :return: None
        """
        HtmlGenerator.questions_to_html(lst_of_questions_id, self, result_directory)
    
# /data/szr207/dataset/ArqMath/V2/Badges.V1.0.xml
        
# parser = argparse.ArgumentParser(description='By setting the file path for MSE ARQMath Dataset,'
#                                              'One can iterate read the related data and go through questions')
# parser.add_argument('-ds', type=str, help="File path for the MSE ARQMath Dataset.", required=True)
# args = vars(parser.parse_args())

# Load the whole dataset once (this takes minutes, see the progress output),
# then run two sample queries against the reader.
clef_home_directory_file_path = '/data/szr207/dataset/ArqMath/V2/'
dr = DataReaderRecord(root_file_path=clef_home_directory_file_path)

# Sample query: questions tagged "calculus".
lst_questions = dr.get_question_of_tag(tag="calculus")
# Sample query: answers written by user 132.
lst_answers = dr.get_answers_posted_by_user(user_id=132)
# dr.get_html_pages([1, 5], "../html_files")

reading users
reading comments
reading votes
reading post links


2243it [00:00, 22393.29it/s]

reading posts


2477487it [02:21, 17507.10it/s]


In [6]:
# Collect every question whose creation date falls in 2010.
list_Q = dr.get_list_of_questions_posted_in_a_year(2010)

In [12]:
# Collect the answers owned by user 2121 and display them as the cell result.
lst_answers = dr.get_answers_posted_by_user(2121)
lst_answers

[]

## Things to ask 

In [4]:
# attributes = list_Q[0].__dir__()[:24]
import jsonlines
from tqdm.notebook import tqdm 
import os, uuid
import re 
import glob

# Directory of the per-year answer JSONL files read by the answer-indexing
# cells; root_path is reassigned to the questions directory further down.
root_path = "/data/szr207/dataset/ArqMath/jsons/answers/"

In [None]:
# Preview the export record of the first answer of every question posted in
# `year`, without touching the objects held by the data reader.
year = 2010
list_Q = dr.get_list_of_questions_posted_in_a_year(year)
for i in tqdm(list_Q):
    dict_q = i.__dict__
    # get_answers_for_question returns None for unknown ids; `or []` also
    # covers a question whose answers attribute is None or empty.
    for ans in dr.get_answers_for_question(dict_q['post_id']) or []:
        # Shallow copy: ans.__dict__ is the live attribute dict of the answer,
        # so editing it in place would strip user/comments/votes from the
        # loaded dataset and break every later cell that reads them.
        ans_dict = dict(ans.__dict__)
        votes = ans_dict.get("votes")
        # Only the number of votes is kept in the record.
        ans_dict["votes"] = len(votes) if votes else 0
        ans_dict.pop("user", None)
        ans_dict.pop("comments", None)
        print(ans_dict)
        # Only the first answer of each question is shown.
        break


In [14]:
# Export the answers of every question, one JSONL file per creation year of
# the question. Files go to root_path, the directory the answer-indexing cells
# read from (root_path must still point at the answers directory here).
for year in [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]:
    list_Q = dr.get_list_of_questions_posted_in_a_year(year)
    with jsonlines.open(os.path.join(root_path, str(year) + '_ans.jsonl'), mode='w') as writer:
        for i in tqdm(list_Q):
            dict_q = i.__dict__
            # get_answers_for_question returns None for unknown ids; `or []`
            # also covers a question whose answers attribute is None or empty.
            for ans in dr.get_answers_for_question(dict_q['post_id']) or []:
                # Shallow copy: ans.__dict__ is the live attribute dict of the
                # answer, so editing it in place would corrupt the loaded
                # dataset (and make a second run of this cell fail).
                ans_dict = dict(ans.__dict__)
                votes = ans_dict.get("votes")
                # Only the number of votes is exported.
                ans_dict["votes"] = len(votes) if votes else 0
                ans_dict.pop("user", None)
                ans_dict.pop("comments", None)
                try:
                    writer.write(ans_dict)
                except (TypeError, ValueError) as e:
                    # Report a record that cannot be serialised and carry on
                    # with the next answer instead of silently dropping the
                    # rest of the question.
                    print("skipping answer", ans_dict.get("post_id"), "-", e)

HBox(children=(FloatProgress(value=0.0, max=4564.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26454.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=58148.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=118129.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=149771.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=167657.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=165434.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=174269.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=165023.0), HTML(value='')))




In [15]:
# Show the raw attribute dictionary of the first question in list_Q.
# vars() hands back the instance's own __dict__, not a copy.
dict_question = vars(list_Q[0])
# del dict_question["votes"]
# del dict_question["user"]
dict_question

{'post_id': 2587133,
 'post_type': 1,
 'creation_date': '2018-01-01T00:09:16.293',
 'score': 0,
 'view_count': 669,
 'body': '<p>The task is to compute the area of a triangle in a 3 dimensional space who is defined by those 3 points:</p>  <p><span class="math-container" id="23922459">A=(1,1,1),  B=(1,2,0), C=(2,1,2)</span></p>  <p>The problem is that the formula I know would compute me the volume of the triangle.  The formula is the determinant of <span class="math-container" id="23922460">([A][B][C])</span> where <span class="math-container" id="23922461">[V]</span> represents V in the standard base. The area of a triangle is computed as half the area of a parallelogram.</p> ',
 'owner_user_id': 477127,
 'comment_count': 0,
 'last_edit_date': None,
 'last_activity_date': None,
 'last_editor_user_id': None,
 'community_owned_date': None,
 'last_editor_display_name': None,
 'comments': None,
 'votes': None,
 'user': <Entities.User.User at 0x7f40c8d95510>,
 'related_post': [(128991, Fals

In [16]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at csxindex05:9200.
# NOTE(review): the URL is plain http, so verify_certs presumably has no
# effect here -- confirm.
es = Elasticsearch(['http://csxindex05:9200/'], verify_certs=True)

# Stop right away when the node does not answer the ping.
if not es.ping():
    raise ValueError("Connection failed")

In [94]:
# NOTE(review): datetime is not referenced by any visible cell.
from datetime import datetime

from elasticsearch import Elasticsearch
# helpers provides the bulk() call used by the indexing cells.
from elasticsearch import helpers

# Re-create the client for the same node as the earlier connection cell.
es = Elasticsearch(['http://csxindex05:9200/'], verify_certs=True)

In [18]:
# Turn every answer of the 2018 export into a bulk action for the
# "answer_bulk_index" index; each action gets a fresh random uuid as its id.
list_a = []
with jsonlines.open(os.path.join(root_path, '2018_ans.jsonl')) as reader:
    for record in reader:
        body = record['body']
        if body:
            # Remove anything that looks like an HTML tag from the body.
            record['body'] = re.sub('<[^<]+?>', '', body)
        list_a.append({
            "_index": "answer_bulk_index",
            "_id": uuid.uuid4(),
            "_source": record,
        })

In [19]:
# Send every action collected in list_a to Elasticsearch through the bulk
# helper; the recorded run returned (201902, []).
helpers.bulk(es, list_a)

(201902, [])

In [20]:
# Bulk-index the 2011-2017 answer exports, one bulk call per file.
# NOTE(review): 2010_ans.jsonl is not in this list and 2018 is handled by its
# own cell -- confirm the 2010 answers are indexed elsewhere.
for file_name in tqdm(['2011_ans.jsonl', '2012_ans.jsonl', '2013_ans.jsonl', '2014_ans.jsonl',
                       '2015_ans.jsonl', '2016_ans.jsonl', '2017_ans.jsonl']):
    list_a = []
    with jsonlines.open(os.path.join(root_path, file_name)) as reader:
        for record in reader:
            body = record['body']
            if body:
                # Remove anything that looks like an HTML tag from the body.
                record['body'] = re.sub('<[^<]+?>', '', body)
            list_a.append({
                "_index": "answer_bulk_index",
                "_id": uuid.uuid4(),
                "_source": record,
            })
    helpers.bulk(es, list_a)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [98]:
# Export every question of the selected years to <root_path>/<year>_ques.jsonl.
# Each record carries the ids of the question's answers (None when there are
# none) and omits the user, votes, related_post and comments fields.
# for year in tqdm([2010,2011,2012,2013,2014,2015,2016,2017,2018]):
for year in tqdm([2019]):
    list_Q = dr.get_list_of_questions_posted_in_a_year(year)
    with jsonlines.open(os.path.join(root_path, str(year) + '_ques.jsonl'), mode='w') as writer:
        for i in tqdm(list_Q):
            # Shallow copy: i.__dict__ is the live attribute dict of the
            # question. Editing it in place would delete attributes from the
            # loaded dataset and replace its answers with plain ids, so a
            # second run (or any later query) would break.
            dict_q = dict(i.__dict__)
            answers = dr.get_answers_for_question(dict_q['post_id'])
            dict_q['answers'] = [ans.post_id for ans in answers] if answers else None
            for key in ("user", "votes", "related_post", "comments"):
                dict_q.pop(key, None)
            writer.write(dict_q)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))





In [97]:
# Variant of the question export that writes only questions with at least one
# answer to <root_path>/<year>_ques.jsonl. On the first record that cannot be
# written, it prints the record and the error and stops that year.
# for year in tqdm([2010,2011,2012,2013,2014,2015,2016,2017,2018,2019]):
for year in tqdm([2019]):
    list_Q = dr.get_list_of_questions_posted_in_a_year(year)
    with jsonlines.open(os.path.join(root_path, str(year) + '_ques.jsonl'), mode='w') as writer:
        for i in tqdm(list_Q):
            answers = dr.get_answers_for_question(i.post_id)
            if not answers:
                continue
            # Shallow copy: i.__dict__ is the live attribute dict of the
            # question, so overwriting 'answers' in place would replace the
            # loaded answer objects with plain ids.
            dict_q = dict(i.__dict__)
            dict_q['answers'] = [ans.post_id for ans in answers]
            # Same fields the other question-export cell leaves out; 'user'
            # holds a User object that the JSON writer cannot serialise.
            for key in ("user", "votes", "related_post", "comments"):
                dict_q.pop(key, None)
            try:
                writer.write(dict_q)
            except Exception as e:
                print(dict_q)
                print(e)
                break

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))





In [96]:
# From here on root_path points at the question exports.
root_path = '/data/szr207/dataset/ArqMath/jsons/questions'

# Bulk-index the 2010-2018 question exports into "question_bulk_index",
# one bulk call per file, each document under a fresh random uuid.
for file_name in tqdm(['2010_ques.jsonl', '2011_ques.jsonl', '2012_ques.jsonl', '2013_ques.jsonl',
                       '2014_ques.jsonl', '2015_ques.jsonl', '2016_ques.jsonl', '2017_ques.jsonl',
                       '2018_ques.jsonl']):
    list_a = []
    with jsonlines.open(os.path.join(root_path, file_name)) as reader:
        for record in reader:
            body = record['body']
            if body:
                # Remove anything that looks like an HTML tag from the body.
                record['body'] = re.sub('<[^<]+?>', '', body)
            title = record['title']
            if title:
                # Same tag removal for the title.
                record['title'] = re.sub('<[^<]+?>', '', title)
            list_a.append({
                "_index": "question_bulk_index",
                "_id": uuid.uuid4(),
                "_source": record,
            })
    helpers.bulk(es, list_a)

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


