In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Import Corpus

In [2]:
# Explicit dtypes shared by both CSV files. The displayed heads show that
# Questions.csv has no ParentId column and Answers.csv has no Title column,
# so each file only uses the entries that match its own columns.
# OwnerUserId is read as float; the answers preview contains a row with a
# missing owner, which a plain int column could not hold.
col_types = {
    'Id': 'int',
    'OwnerUserId': 'float',
    'CreationDate': 'str',
    'ParentId': 'int',
    'Score': 'int',
    'Title': 'str',
    'Body': 'str',
}

# Both files are read with the same encoding and dtype mapping.
read_options = {'encoding': "ISO-8859-1", 'dtype': col_types}
questions = pd.read_csv('../pythonquestions/Questions.csv', **read_options)
answers = pd.read_csv('../pythonquestions/Answers.csv', **read_options)

In [3]:
questions.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...
5,742,189.0,2008-08-03T15:55:28Z,30,Class views in Django,"<p><a href=""http://www.djangoproject.com/"">Dja..."
6,766,1384652.0,2008-08-03T17:44:07Z,20,Python and MySQL,<p>I can get Python to work with Postgresql bu...
7,773,207.0,2008-08-03T18:27:09Z,256,How do I use Python's itertools.groupby()?,<p>I haven't been able to find an understandab...
8,972,145.0,2008-08-04T02:17:51Z,364,Adding a Method to an Existing Object Instance,<p>I've read that it is possible to add a meth...
9,1476,92.0,2008-08-04T18:20:36Z,251,How do you express binary literals in Python?,<p>How do you express an integer as a binary n...


In [4]:
answers.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...
2,536,161.0,2008-08-02T18:49:07Z,502,9,<p>You can use ImageMagick's convert utility f...
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B..."
5,595,116.0,2008-08-03T01:17:36Z,594,25,<p>The canonical way is to use the built-in cu...
6,660,197.0,2008-08-03T12:09:18Z,535,14,<p>Second the Buildbot - Trac integration. You...
7,701,111.0,2008-08-03T14:30:50Z,683,3,"<p>No, you were not dreaming. Python has a pr..."
8,735,145.0,2008-08-03T15:47:22Z,683,-2,<p>I think:</p>\r\n\r\n<pre><code>#!/bin/pytho...
9,745,154.0,2008-08-03T15:59:19Z,683,8,<p>Are you looking to get a list of objects th...


# Map Questions to Answers and Keep Only Questions with 4+ Answers, at Least One Scoring Above 0

In [5]:
# Group the answers by the question they belong to:
#     q_to_a[question_id] -> [(answer_id, answer_score), ...]
# in the row order of `answers`.
#
# The three columns are walked in lock-step with zip() rather than
# DataFrame.iterrows(): iterrows() builds a throwaway Series for every row,
# which is needlessly slow when the frame is large.
q_to_a = dict()
for qid, aid, a_score in zip(answers['ParentId'], answers['Id'], answers['Score']):
    # setdefault() creates the list the first time a question id is seen.
    q_to_a.setdefault(qid, []).append((aid, a_score))

def focus(answer_list, min_answers=4):
    """Decide whether a question's answers make the question worth keeping.

    Parameters
    ----------
    answer_list : sequence
        One entry per answer; the score is read from index 1 of each entry
        (the mapping built in this notebook stores ``(answer_id, score)``).
    min_answers : int, default 4
        Minimum number of answers the question must have.

    Returns
    -------
    bool
        True when ``answer_list`` holds at least ``min_answers`` entries and
        at least one of them has a score strictly greater than 0;
        False otherwise.
    """
    if len(answer_list) < min_answers:
        return False

    # A single positively scored answer is enough to keep the question.
    return any(pair[1] > 0 for pair in answer_list)

# Drop every question whose answer list does not pass focus().
q_to_a = {qid: pairs for qid, pairs in q_to_a.items() if focus(pairs)}

# Keep only the questions that still have an entry in q_to_a (list(q_to_a)
# is the list of surviving question ids). The .copy() detaches the result
# from the unfiltered frame so that adding columns to `questions` later
# (e.g. BodyText) does not raise SettingWithCopyWarning.
questions = questions[questions['Id'].isin(list(q_to_a))].copy()

In [6]:
def answer_in_use(answer):
    """Tell whether ``answer`` is listed under its question in ``q_to_a``.

    ``answer`` is anything indexable by ``'ParentId'`` and ``'Id'`` (a
    DataFrame row when used with ``DataFrame.apply(..., axis=1)``).

    Returns True when ``q_to_a`` has an entry for the answer's ``ParentId``
    and one of the pairs stored there starts with the answer's ``Id``;
    returns False otherwise.
    """
    # An unknown question id yields an empty tuple, so any() is False.
    kept_pairs = q_to_a.get(answer['ParentId'], ())
    return any(answer['Id'] == pair[0] for pair in kept_pairs)

# Keep only the answers that answer_in_use() accepts. The function is passed
# directly (no lambda wrapper needed), and .copy() detaches the filtered
# frame from the original so that adding columns to `answers` later does not
# raise SettingWithCopyWarning.
answers = answers[answers.apply(answer_in_use, axis=1)].copy()

In [7]:
# Add a BodyText column to both frames: the HTML in Body is parsed with
# BeautifulSoup's 'html.parser' backend and reduced to its text content.
for frame in (questions, answers):
    frame['BodyText'] = frame['Body'].map(
        lambda html: BeautifulSoup(html, 'html.parser').get_text()
    )

In [8]:
questions.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body,BodyText
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,I am using the Photoshop's javascript API to f...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,I'm starting work on a hobby project with a py...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,I don't remember whether I was dreaming or not...
5,742,189.0,2008-08-03T15:55:28Z,30,Class views in Django,"<p><a href=""http://www.djangoproject.com/"">Dja...","Django view points to a function, which can be..."
6,766,1384652.0,2008-08-03T17:44:07Z,20,Python and MySQL,<p>I can get Python to work with Postgresql bu...,I can get Python to work with Postgresql but I...
7,773,207.0,2008-08-03T18:27:09Z,256,How do I use Python's itertools.groupby()?,<p>I haven't been able to find an understandab...,I haven't been able to find an understandable ...
8,972,145.0,2008-08-04T02:17:51Z,364,Adding a Method to an Existing Object Instance,<p>I've read that it is possible to add a meth...,I've read that it is possible to add a method ...
9,1476,92.0,2008-08-04T18:20:36Z,251,How do you express binary literals in Python?,<p>How do you express an integer as a binary n...,How do you express an integer as a binary numb...
10,1734,59.0,2008-08-05T00:12:55Z,20,Any experiences with Protocol Buffers?,<p>I was just looking through some information...,I was just looking through some information ab...
11,1829,30.0,2008-08-05T02:39:23Z,12,How do I make a menu that does not require the...,<p>I've got a menu in Python. That part was ea...,I've got a menu in Python. That part was easy....


In [9]:
answers.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body,BodyText
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...,open up a terminal (Applications->Utilities->T...
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...,I haven't been able to find anything that does...
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...,One possibility is Hudson. It's written in Ja...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B...","We run Buildbot - Trac at work, I haven't used..."
6,660,197.0,2008-08-03T12:09:18Z,535,14,<p>Second the Buildbot - Trac integration. You...,Second the Buildbot - Trac integration. You ca...
7,701,111.0,2008-08-03T14:30:50Z,683,3,"<p>No, you were not dreaming. Python has a pr...","No, you were not dreaming. Python has a prett..."
8,735,145.0,2008-08-03T15:47:22Z,683,-2,<p>I think:</p>\r\n\r\n<pre><code>#!/bin/pytho...,I think:\n#!/bin/pythonbar in dict(Foo)\nIs wh...
9,745,154.0,2008-08-03T15:59:19Z,683,8,<p>Are you looking to get a list of objects th...,Are you looking to get a list of objects that ...
10,750,199.0,2008-08-03T16:13:29Z,683,2,<p>What I was thinking of can be achieved usin...,What I was thinking of can be achieved using l...
11,764,,2008-08-03T17:40:25Z,742,0,<p>Sounds to me like you're trying to combine ...,Sounds to me like you're trying to combine thi...


# Answers: Count English Words, Count Links, and Flag Whether Each Contains Code

In [10]:
def strip_code(html):
    """Return the text of ``html`` with every ``<code>`` element removed.

    The markup is parsed with BeautifulSoup's 'html.parser' backend, each
    ``<code>`` tag is pulled out of the tree, and the text of what remains
    is returned.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # soup('code') is shorthand for soup.find_all('code').
    for code_tag in soup('code'):
        code_tag.extract()
    return soup.get_text()

def count_links(html):
    """Return how many ``<a>`` elements appear in ``html``."""
    anchors = BeautifulSoup(html, 'html.parser').find_all('a')
    return len(anchors)

def has_code(html):
    """Return 1 if ``html`` contains at least one ``<code>`` element, else 0.

    The result is an int (not a bool) so it can be stored directly as a
    0/1 indicator column.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find() returns None when nothing matches; compare by identity rather
    # than with != so the check never goes through the tag's own equality.
    return 1 if soup.find('code') is not None else 0
    
# Derive the per-answer feature columns from the raw HTML in Body.
# BodyEnglishText: body text with <code> elements stripped out.
answers['BodyEnglishText'] = answers['Body'].map(strip_code)
# EnglishCount: number of whitespace-separated tokens in that text.
answers['EnglishCount'] = answers['BodyEnglishText'].map(lambda text: len(text.split()))
# LinkCount / HasCode: number of <a> elements and a 0/1 flag for <code>.
for column, extractor in (('LinkCount', count_links), ('HasCode', has_code)):
    answers[column] = answers['Body'].map(extractor)

In [11]:
answers.head(10)

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body,BodyText,BodyEnglishText,EnglishCount,LinkCount,HasCode
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...,open up a terminal (Applications->Utilities->T...,open up a terminal (Applications->Utilities->T...,29,0,1
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...,I haven't been able to find anything that does...,I haven't been able to find anything that does...,38,0,1
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...,One possibility is Hudson. It's written in Ja...,One possibility is Hudson. It's written in Ja...,36,2,0
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B...","We run Buildbot - Trac at work, I haven't used...","We run Buildbot - Trac at work, I haven't used...",42,1,0
6,660,197.0,2008-08-03T12:09:18Z,535,14,<p>Second the Buildbot - Trac integration. You...,Second the Buildbot - Trac integration. You ca...,Second the Buildbot - Trac integration. You ca...,144,1,0
7,701,111.0,2008-08-03T14:30:50Z,683,3,"<p>No, you were not dreaming. Python has a pr...","No, you were not dreaming. Python has a prett...","No, you were not dreaming. Python has a prett...",157,1,1
8,735,145.0,2008-08-03T15:47:22Z,683,-2,<p>I think:</p>\r\n\r\n<pre><code>#!/bin/pytho...,I think:\n#!/bin/pythonbar in dict(Foo)\nIs wh...,I think:\n\nIs what you are thinking of. When...,235,2,1
9,745,154.0,2008-08-03T15:59:19Z,683,8,<p>Are you looking to get a list of objects th...,Are you looking to get a list of objects that ...,Are you looking to get a list of objects that ...,26,1,1
10,750,199.0,2008-08-03T16:13:29Z,683,2,<p>What I was thinking of can be achieved usin...,What I was thinking of can be achieved using l...,What I was thinking of can be achieved using l...,59,0,1
11,764,,2008-08-03T17:40:25Z,742,0,<p>Sounds to me like you're trying to combine ...,Sounds to me like you're trying to combine thi...,Sounds to me like you're trying to combine thi...,75,0,0
