In [130]:
import numpy as np
import pandas as pd
import pickle
from itertools import chain

In [120]:
# Read the pickled posts from the working directory into `posts`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load a file that comes from a source you trust.
with open('./all_posts.pkl', 'rb') as f:
    posts = pickle.load(f)

0. Default Index
1. Post cid: the @cid number
2. Type (Post | Instructor Note): 0 if student-initiated post; 1 if instructor note
3. Subject
4. Content
5. Student Answer
6. Instructor Answer
7. Followup (Piled in a big string)

In [157]:
# %load eecs-instructor-answers/answer_finder.py
import html2text

def parse(x):
    """Run `x` through html2text.html2text and put the result on one line.

    Every newline character in the converted text is replaced by a single
    space before the string is returned.
    """
    converted = html2text.html2text(x)
    return converted.replace('\n', ' ')

def _is_instructor_note(post):
    return 'instructor-note' in post['tags']

def _get_cid(post):
    return post['nr']

def extract(post):
    """
    Flatten one raw post into a dict with a fixed set of keys.

    Keys, in insertion order:
    time      -- value returned by _get_post_time
    cid       -- the @cid number, from _get_cid
    type      -- 1 if the post carries the 'instructor-note' tag, else 0
    subject   -- value returned by _get_post_subject
    content   -- value returned by _get_post_content
    s_answer  -- student answer text, or None when there is none
    i_answer  -- instructor answer text, or None when there is none
    followup  -- all followup text joined into one string
    """
    record = {}
    record['time'] = _get_post_time(post)
    record['cid'] = _get_cid(post)
    # Stored as an int flag rather than a bool.
    record['type'] = int(_is_instructor_note(post))
    record['subject'] = _get_post_subject(post)
    record['content'] = _get_post_content(post)
    record['s_answer'] = _get_student_answer(post)
    record['i_answer'] = _get_instructor_answer(post)
    record['followup'] = _get_followup_joined(post)
    return record


def _get_post_subject(post):
    """Return the parsed 'subject' of the first entry in post['history']."""
    first_entry = post['history'][0]
    return parse(first_entry['subject'])

def _get_post_content(post):
    """Return the parsed 'content' of the first entry in post['history']."""
    first_entry = post['history'][0]
    return parse(first_entry['content'])

def _get_post_time(post):
    return post['history'][0]['created']

def _get_student_answer(post):
    s_answers = [ans for ans in post['children'] if ans['type'] == 's_answer']
    if len(s_answers) == 0:
        return None
    else:
        return parse(s_answers[0]['history'][0]['content'])

def _get_instructor_answer(post):
    i_answers = [ans for ans in post['children'] if ans['type'] == 'i_answer']
    if len(i_answers) == 0:
        return None
    else:
        return parse(i_answers[0]['history'][0]['content'])
    
def _get_followup_joined(post):
    """Return every followup text of the post as one newline-joined string.

    The nested lists produced by _get_followup are flattened in order and
    the resulting strings are joined with '\\n'.
    """
    nested = _get_followup(post)
    flattened = chain.from_iterable(nested)
    return '\n'.join(flattened)
    
def _get_followup(post):
    follow_ups = [f for f in post['children'] if f['type'] == 'followup']
    
    result = []
    for f in follow_ups:        
        f_subject = parse(f['subject'])
        f_children = f['children'][::-1]
        
        if f['children'] != []:
            children_subjects = []
            for c in f_children:
                c_subject = parse(c['subject'])  
                children_subjects.append(c_subject)
            result.append([f_subject] + children_subjects)
        else:
            result.append([f_subject])
    return result

In [159]:
# One row per post: extract() turns each post into a flat dict of columns.
df = pd.DataFrame(list(map(extract, posts)))

In [160]:
# Preview the first rows of the extracted table.
df.head()

Unnamed: 0,cid,content,followup,i_answer,s_answer,subject,time,type
0,5851,"Hi everyone, I hope you are enjoying winter br...",I completely agree with this post. Lately the ...,,,Piazza Etiquette,2017-12-31T08:52:02Z,0
1,5821,**Help the EECS Department & Win Amazon Gift C...,,,,12.22.2017 Undergraduate Announcements,2017-12-22T17:42:33Z,1
2,5790,**The CS side did such a nice job of breaking ...,,,,EE/EECS Course Enrollment Tips,2017-12-18T20:58:02Z,1
3,5750,The CS Advising Office will be closed for advi...,,,,LSCS Winter Schedule,2017-12-11T17:18:29Z,1
4,5737,"For the CS major, students may transfer up to ...",Is this for EECS upper divs? Or does it also l...,,,new policy regarding transferring study abroad...,2017-12-11T16:39:53Z,1


In [161]:
# Replace the 'time' column with datetimes built by pd.DatetimeIndex, then
# preview the result.
# Note: this overwrites df['time'] in place, so any earlier display of `df`
# no longer reflects the column's current contents.
df['time'] = pd.DatetimeIndex(df['time'])
df.head()

Unnamed: 0,cid,content,followup,i_answer,s_answer,subject,time,type
0,5851,"Hi everyone, I hope you are enjoying winter br...",I completely agree with this post. Lately the ...,,,Piazza Etiquette,2017-12-31 08:52:02,0
1,5821,**Help the EECS Department & Win Amazon Gift C...,,,,12.22.2017 Undergraduate Announcements,2017-12-22 17:42:33,1
2,5790,**The CS side did such a nice job of breaking ...,,,,EE/EECS Course Enrollment Tips,2017-12-18 20:58:02,1
3,5750,The CS Advising Office will be closed for advi...,,,,LSCS Winter Schedule,2017-12-11 17:18:29,1
4,5737,"For the CS major, students may transfer up to ...",Is this for EECS upper divs? Or does it also l...,,,new policy regarding transferring study abroad...,2017-12-11 16:39:53,1


In [164]:
# Show the rows ordered by 'cid', highest first. The sorted frame is only
# displayed; it is not assigned, so `df` itself keeps its current row order.
df.sort_values('cid', axis=0, ascending=False)

Unnamed: 0,cid,content,followup,i_answer,s_answer,subject,time,type
39,5874,Will CS 169 be offered in Spring 2018?,,,,CS 169 in Spring 2018,2017-12-31 09:27:09,0
40,5873,How can I view my discussion waitlist position...,,,,Unable to view discussion waitlist position on...,2017-12-31 05:24:28,0
41,5872,I'm a CS major and I intend to take EE126 and ...,Thank you so much professor! I was originally ...,126 needs multiple integrals and coordinate ch...,,Pre-requisites for EE126/EE127?,2017-12-31 03:03:45,0
42,5871,When I tried to change the discussion session ...,,,,Enrollment bug when changing discussion time?,2017-12-31 01:28:23,0
43,5870,"I'm looking into doing majors in LSCS, Stats, ...",,,,How does overlapping courses between majors wo...,2017-12-30 23:02:52,0
44,5868,I'm trying to enroll in a CS class with a time...,,,,CS class conflict,2017-12-30 14:14:19,0
45,5867,Can someone give some more information on CS 1...,,,How interesting the class is somewhat depends ...,CS 174,2017-12-30 10:26:06,0
46,5866,Can I use CS 100 to fulfill one of the upper d...,,,@3989\.,CS 100,2017-12-30 04:19:51,0
47,5865,Hopefully this can offer us some insight into ...,I’m all for Piazza solution but this poll is b...,,,Which resources setup help you the most as the...,2017-12-30 04:30:24,0
48,5864,I just tried to swap my 16b lab to a different...,,,,16b swapping issue,2017-12-29 23:49:55,0


In [165]:
# Persist the DataFrame to 'all_content_dataframe.pkl' in the working directory.
df.to_pickle('all_content_dataframe.pkl')