In [1]:
import pandas as pd
import spacy

In [2]:
# One-time setup: uncomment and run to download the spaCy English model used for NER.
# !python -m spacy download en_core_web_sm

# Load data

In [3]:
# Read the raw posts export, then replace missing titles/bodies with empty strings.
df_posts = pd.read_csv("../data/raw/computerscience_posts.csv")

for text_column in ('title', 'body'):
    df_posts[text_column] = df_posts[text_column].fillna('')

df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,357,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,401,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,s2qf5f,This book demonstrates an infinite loop in a p...,979,0.97,computerscience,https://i.redd.it/jx92aw75udb81.jpg,27,,1642048000.0
3,s36y35,Novel view tennis from single camera input,14,0.95,computerscience,https://v.redd.it/v6xlgqq17ib81,0,,1642101000.0
4,s39uv3,CS teachers/students: Cons and pros of CS onli...,1,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,0,"I mean, aside from everyone wearing their paja...",1642109000.0


# NER

In [4]:
# Load the spaCy pipeline that the helpers below call on each text. The model
# package must already be installed (`python -m spacy download en_core_web_sm`).
ner = spacy.load('en_core_web_sm')

In [5]:
# Load NER results into a dataframe
def load_ner(doc, post_id, df=None, comment=None):
    """Return ``df`` extended with one row per named entity found in ``doc``.

    Parameters
    ----------
    doc
        Object exposing ``.ents``; every entity must provide ``text``,
        ``start``, ``end`` and ``label_`` (this notebook passes the result of
        calling ``ner`` on a string).
    post_id
        Value written to the ``post_id`` column of every new row.
    df : pandas.DataFrame, optional
        Entity table to extend. It is never modified in place. When omitted,
        an empty table with the columns ``post_id, text, start, end, label,
        description`` is used.
    comment : optional
        When truthy, written to a ``comment`` column of every new row.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``doc`` has no entities, otherwise a new frame with
        a fresh 0..n-1 index holding the old rows followed by the new ones.
    """
    if df is None:
        df = pd.DataFrame(columns=['post_id', 'text', 'start', 'end', 'label', 'description'])

    # Collect plain dicts and build the frame once: DataFrame.append was removed
    # in pandas 2.0 and copied the whole frame for every single entity.
    records = []
    for ent in doc.ents:
        record = {'post_id': post_id}
        if comment:
            record['comment'] = comment
        record.update(
            text=ent.text,
            start=ent.start,
            end=ent.end,
            label=ent.label_,
            # spacy.explain returns None for unknown labels; keep it as text.
            description=str(spacy.explain(ent.label_)),
        )
        records.append(record)

    if not records:
        return df

    new_rows = pd.DataFrame(records)
    if len(df) == 0:
        # Concatenating with an empty frame is deprecated in recent pandas, so
        # just lay the new rows out in the existing column order, with any
        # column the table did not have yet (e.g. 'comment') placed last.
        columns = list(df.columns) + [c for c in new_rows.columns if c not in df.columns]
        return new_rows.reindex(columns=columns)
    return pd.concat([df, new_rows], ignore_index=True)


In [6]:
# Helper function for iterating through each text of the dataframe
def generate_ner(df, column, comment=False):
    """Run NER over ``df[column]`` and return one row per detected entity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``post_id`` column and the text column named by
        ``column``; the texts are fed to the module-level ``ner`` pipeline.
    column : str
        Name of the column holding the text to analyse.
    comment : bool, default False
        When true, each entity row also carries the source text in a
        ``comment`` column (placed right after ``post_id``).

    Returns
    -------
    pandas.DataFrame
        Columns ``post_id, [comment,] text, start, end, label, description``
        with a fresh 0..n-1 index; empty when no entity was found.
    """
    columns = ['post_id', 'text', 'start', 'end', 'label', 'description']
    if comment:
        columns.insert(1, 'comment')

    # Pull both columns out once instead of building a row Series twice per
    # iteration with df.iloc[i][...].
    texts = df[column].tolist()
    post_ids = df['post_id'].tolist()

    frames = []
    # ner.pipe processes the texts in batches and yields the docs in input order.
    for doc, text, post_id in zip(ner.pipe(texts), texts, post_ids):
        if not doc.ents:
            continue
        frames.append(load_ner(doc, post_id, comment=text if comment else None))

    if not frames:
        return pd.DataFrame(columns=columns)
    # One concat at the end avoids re-copying the accumulated table per row;
    # reindex pins the documented column order.
    return pd.concat(frames, ignore_index=True).reindex(columns=columns)

In [7]:
# Sample NER: run the pipeline on one sentence and tabulate its entities
sample_doc = ner("Apple is looking at buying U.K. startup for $1 billion")
load_ner(sample_doc, 0)

Unnamed: 0,post_id,text,start,end,label,description
0,0,Apple,0,1,ORG,"Companies, agencies, institutions, etc."
1,0,U.K.,5,6,GPE,"Countries, cities, states"
2,0,$1 billion,8,11,MONEY,"Monetary values, including unit"


### NER on title of posts

In [8]:
# Entities found in post titles, one row per entity.
ner_df = generate_ner(df_posts, column='title')
ner_df.head(5)

Unnamed: 0,post_id,text,start,end,label,description
0,s2ar5c,8Bit,5,6,DATE,Absolute or relative dates or periods
1,s2ccci,Combatting Human-Operated Ransomware,3,8,PERSON,"People, including fictional"
2,s2ccci,1,10,11,CARDINAL,Numerals that do not fall under another type
3,s28p5d,Love,0,1,WORK_OF_ART,"Titles of books, songs, etc."
4,s28p5d,Michał Ślaski,5,7,PERSON,"People, including fictional"


In [9]:
# Left join keeps every post; posts without title entities get NaN in the entity columns.
df_posts_title_ner = pd.merge(df_posts, ner_df, how='left', on='post_id')
df_posts_title_ner

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,text,start,end,label,description
0,n2n0ax,New to programming or computer science? Want a...,357,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1.619890e+09,,,,,
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,401,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1.634619e+09,,,,,
2,s2qf5f,This book demonstrates an infinite loop in a p...,979,0.97,computerscience,https://i.redd.it/jx92aw75udb81.jpg,27,,1.642048e+09,,,,,
3,s36y35,Novel view tennis from single camera input,14,0.95,computerscience,https://v.redd.it/v6xlgqq17ib81,0,,1.642101e+09,,,,,
4,s39uv3,CS teachers/students: Cons and pros of CS onli...,1,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,0,"I mean, aside from everyone wearing their paja...",1.642109e+09,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,rm2a30,How is long distance network communication done?,1,0.60,computerscience,https://www.reddit.com/r/computerscience/comme...,3,I just started learning networking and I was ...,1.640168e+09,,,,,
111,rlgtu5,Concept of Delayed abstraction,2,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,0,"Hi,\n\nI am trying to understand the concept o...",1.640100e+09,,,,,
112,rkr2j4,Andrew Tannenbaum,71,0.92,computerscience,https://www.reddit.com/r/computerscience/comme...,30,"Hello, I have gone through Tanenbaum's books o...",1.640018e+09,Andrew Tannenbaum,0,2,PERSON,"People, including fictional"
113,rkf6jh,I really want to design a single board compute...,26,0.89,computerscience,https://www.reddit.com/r/computerscience/comme...,17,I am aware this is not an easy task but I don’...,1.639976e+09,,,,,


### NER on body of posts

In [10]:
# Entities found in post bodies, one row per entity.
ner_df = generate_ner(df_posts, column='body')
ner_df.head(5)

Unnamed: 0,post_id,text,start,end,label,description
0,n2n0ax,500,8,9,CARDINAL,Numerals that do not fall under another type
1,qb4bof,Mac Book,43,45,PERSON,"People, including fictional"
2,qb4bof,Computer Science,60,62,ORG,"Companies, agencies, institutions, etc."
3,s39uv3,Maths,23,24,PRODUCT,"Objects, vehicles, foods, etc. (not services)"
4,s3241k,ROM,46,47,ORG,"Companies, agencies, institutions, etc."


In [11]:
# Left join keeps every post; posts without body entities get NaN in the entity columns.
df_posts_body_ner = pd.merge(df_posts, ner_df, how='left', on='post_id')
df_posts_body_ner

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,text,start,end,label,description
0,n2n0ax,New to programming or computer science? Want a...,357,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1.619890e+09,500,8,9,CARDINAL,Numerals that do not fall under another type
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,401,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1.634619e+09,Mac Book,43,45,PERSON,"People, including fictional"
2,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,401,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1.634619e+09,Computer Science,60,62,ORG,"Companies, agencies, institutions, etc."
3,s2qf5f,This book demonstrates an infinite loop in a p...,979,0.97,computerscience,https://i.redd.it/jx92aw75udb81.jpg,27,,1.642048e+09,,,,,
4,s36y35,Novel view tennis from single camera input,14,0.95,computerscience,https://v.redd.it/v6xlgqq17ib81,0,,1.642101e+09,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,rkr2j4,Andrew Tannenbaum,71,0.92,computerscience,https://www.reddit.com/r/computerscience/comme...,30,"Hello, I have gone through Tanenbaum's books o...",1.640018e+09,MIT,47,48,ORG,"Companies, agencies, institutions, etc."
430,rkr2j4,Andrew Tannenbaum,71,0.92,computerscience,https://www.reddit.com/r/computerscience/comme...,30,"Hello, I have gone through Tanenbaum's books o...",1.640018e+09,Berkeley,49,50,ORG,"Companies, agencies, institutions, etc."
431,rkr2j4,Andrew Tannenbaum,71,0.92,computerscience,https://www.reddit.com/r/computerscience/comme...,30,"Hello, I have gone through Tanenbaum's books o...",1.640018e+09,Structured Computer Organization,77,80,ORG,"Companies, agencies, institutions, etc."
432,rkf6jh,I really want to design a single board compute...,26,0.89,computerscience,https://www.reddit.com/r/computerscience/comme...,17,I am aware this is not an easy task but I don’...,1.639976e+09,’m,62,63,CARDINAL,Numerals that do not fall under another type


### NER on comments

In [12]:
# Read the raw comments export (one row per comment, keyed by post_id).
comments_path = "../data/raw/computerscience_comments.csv"
df_comments = pd.read_csv(comments_path)
df_comments.head(5)

Unnamed: 0,post_id,comment
0,n2n0ax,How late is too late to start a career in prog...
1,n2n0ax,I am a freshman at a university and haven't be...
2,n2n0ax,I'm still in highschool but really interested ...
3,n2n0ax,"This is probably a common question, but how we..."
4,n2n0ax,I am planning on starting a CS major this fall...


In [13]:
# Outer join on post_id: rows are kept even when only one side has that post_id.
df_merge = pd.merge(df_posts, df_comments, on='post_id', how='outer')

# Missing comment text becomes an empty string.
df_merge = df_merge.assign(comment=df_merge['comment'].fillna(''))

df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,comment
0,n2n0ax,New to programming or computer science? Want a...,357,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,How late is too late to start a career in prog...
1,n2n0ax,New to programming or computer science? Want a...,357,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,I am a freshman at a university and haven't be...
2,n2n0ax,New to programming or computer science? Want a...,357,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,I'm still in highschool but really interested ...
3,n2n0ax,New to programming or computer science? Want a...,357,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,"This is probably a common question, but how we..."
4,n2n0ax,New to programming or computer science? Want a...,357,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1619890000.0,I am planning on starting a CS major this fall...


In [14]:
# Entities found in comments; comment=True keeps the source text on each entity row.
ner_df = generate_ner(df_merge, column='comment', comment=True)
ner_df.head(5)

Unnamed: 0,post_id,comment,text,start,end,label,description
0,n2n0ax,How late is too late to start a career in prog...,40,15,16,DATE,Absolute or relative dates or periods
1,n2n0ax,I am a freshman at a university and haven't be...,gpa,46,47,ORG,"Companies, agencies, institutions, etc."
2,n2n0ax,I am a freshman at a university and haven't be...,gpa,64,65,ORG,"Companies, agencies, institutions, etc."
3,n2n0ax,I'm still in highschool but really interested ...,AKA,99,100,ORG,"Companies, agencies, institutions, etc."
4,n2n0ax,"This is probably a common question, but how we...",nearly a decade,26,29,DATE,Absolute or relative dates or periods


In [15]:
# Join entities back onto their comments. The comment text is part of the join
# key, so a comment repeated verbatim under the same post produces identical
# rows in ner_df, and each repeat would then match every copy, multiplying rows.
# Distinct entities within one comment always differ in start/end, so dropping
# exact duplicates removes only those repeats.
df_posts_comment_ner = df_merge.merge(
    ner_df.drop_duplicates(),
    on=['post_id', 'comment'],
    how='left',
)
df_posts_comment_ner

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,comment,text,start,end,label,description
0,n2n0ax,New to programming or computer science? Want a...,357,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1.619890e+09,How late is too late to start a career in prog...,40,15,16,DATE,Absolute or relative dates or periods
1,n2n0ax,New to programming or computer science? Want a...,357,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1.619890e+09,I am a freshman at a university and haven't be...,gpa,46,47,ORG,"Companies, agencies, institutions, etc."
2,n2n0ax,New to programming or computer science? Want a...,357,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1.619890e+09,I am a freshman at a university and haven't be...,gpa,64,65,ORG,"Companies, agencies, institutions, etc."
3,n2n0ax,New to programming or computer science? Want a...,357,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1.619890e+09,I'm still in highschool but really interested ...,AKA,99,100,ORG,"Companies, agencies, institutions, etc."
4,n2n0ax,New to programming or computer science? Want a...,357,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1019,The previous thread was finally archived with ...,1.619890e+09,"This is probably a common question, but how we...",nearly a decade,26,29,DATE,Absolute or relative dates or periods
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1762,rkr18d,Looking for textbook recommendations for algor...,2,0.75,computerscience,https://www.reddit.com/r/computerscience/comme...,3,Any recommendations? Looking for a book that w...,1.640017e+09,Do you mean correctness proofs? If so I can re...,Software Foundations,11,13,PERSON,"People, including fictional"
1763,rkr18d,Looking for textbook recommendations for algor...,2,0.75,computerscience,https://www.reddit.com/r/computerscience/comme...,3,Any recommendations? Looking for a book that w...,1.640017e+09,Do you mean correctness proofs? If so I can re...,3,14,15,CARDINAL,Numerals that do not fall under another type
1764,rkr18d,Looking for textbook recommendations for algor...,2,0.75,computerscience,https://www.reddit.com/r/computerscience/comme...,3,Any recommendations? Looking for a book that w...,1.640017e+09,Do you mean correctness proofs? If so I can re...,Verified Functional Algorithms,16,19,WORK_OF_ART,"Titles of books, songs, etc."
1765,rkr18d,Looking for textbook recommendations for algor...,2,0.75,computerscience,https://www.reddit.com/r/computerscience/comme...,3,Any recommendations? Looking for a book that w...,1.640017e+09,Do you mean correctness proofs? If so I can re...,first,36,37,ORDINAL,"""first"", ""second"", etc."
