In [None]:
#import the bq_helper library to help in making API calls to big query and fetch information
import bq_helper
from bq_helper import BigQueryHelper
import os

# Point the Google client libraries at the service-account key file (the key is unique per service account).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json"
bq_assistant = BigQueryHelper("bigquery-public-data", "stackoverflow")

# Join every question whose tag string contains 'python' to its answers, giving one row
# per (question, answer) pair, and load the result into a dataframe.
# The adjacent literals below concatenate to a single-line SQL statement.
QUERY = (
    "SELECT q.id, q.title, q.body, q.tags, a.body as answers, a.score "
    "FROM `bigquery-public-data.stackoverflow.posts_questions` AS q "
    "INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a "
    "ON q.id = a.parent_id "
    "WHERE q.tags LIKE '%python%' "
    "LIMIT 500000"
)
df = bq_assistant.query_to_pandas(QUERY)

In [3]:
#output this dataframe as a csv file
# Create the target folder first; to_csv fails if 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
# The dataframe index is written as the first CSV column (default index=True).
df.to_csv('data/Original_data.csv')

In [6]:
#read the stored csv file which has information as displayed below
import pandas as pd
import numpy as np
import spacy
# Load spaCy's small English pipeline; its tokenizer is used by tokenize_text.
EN = spacy.load('en_core_web_sm')
# Re-read the stored query result and slice off the first CSV column
# (the index that to_csv wrote alongside the data).
df = pd.read_csv('data/Original_data.csv').iloc[:, 1:]
df.head()

Unnamed: 0,id,title,body,tags,answers,score
0,4395499,PyLint Best Practices?,<p>pyLint looks like a good tool for running a...,python|static-analysis|pylint,<p>To persistently disable warnings and conven...,10
1,8036878,Function of Numpy Array with if-statement,"<p>I am using <a href=""http://matplotlib.sourc...",python|numpy|matplotlib,"<p>I know it is too late for this answer, but ...",12
2,12492137,Python sum of ASCII values of all characters i...,<p>I am searching a more efficient way to sum-...,python|string|python-2.7|ascii,<p>You can use an intermediate <code>bytearray...,20
3,2676133,Best way to do enum in Sqlalchemy?,<p>I'm reading about sqlalchemy and I saw foll...,python|sqlalchemy,<p>SQLAlchemy has an Enum type since 0.6: \n<a...,35
4,4857927,Swapping columns in a numpy array?,<pre><code>from numpy import *\ndef swap_colum...,python|numpy,<p>I find the following the fastest:</p>\n\n<p...,24


In [7]:
# Report (rows, columns) of the loaded frame
print(f'Datebase shape:{df.shape}')

Datebase shape:(500000, 6)


In [8]:
# Count missing values per column; every count in the output is 0, so there are no null values
df.isna().sum()

id         0
title      0
body       0
tags       0
answers    0
score      0
dtype: int64

In [10]:
# Show the pandas version this notebook was run with
pd.__version__

'1.0.3'

In order to construct a corpus, we grouped the rows by question (id, title, body and tags) and concatenated all answers that belong to the same question. We also summed the scores of those answers to obtain a single collective score for each question.

In [11]:
def _join_answers(answers):
    """Concatenate the answer bodies of one group, separated by a newline."""
    return "\n".join(answers)

# Per-column reducers: answer bodies are concatenated, answer scores are summed.
aggregations = {'answers': _join_answers, 'score': 'sum'}

# One output row per distinct (id, title, body, tags) combination, i.e. per question.
question_keys = ['id', 'title', 'body', 'tags']
grouped = df.groupby(question_keys, as_index=False).agg(aggregations)
deduped_df = pd.DataFrame(grouped)

In [12]:
# Preview the grouped frame (one row per question, answers concatenated, scores summed)
deduped_df.head()

Unnamed: 0,id,title,body,tags,answers,score
0,535,Continuous Integration System for a Python Cod...,<p>I am starting to work on a hobby project wi...,python|continuous-integration|extreme-programming,<p>One possibility is Hudson. It's written in...,110
1,773,How do I use itertools.groupby()?,<p>I haven't been able to find an understandab...,python|iteration,<p>Can you show us your code?</p>\n\n<p>The ex...,847
2,972,Adding a Method to an Existing Object Instance,<p>I've read that it is possible to add a meth...,python|oop|methods|monkeypatching,<p>You can use lambda to bind a method to an i...,1148
3,1171,What is the most efficient graph data structur...,<p>I need to be able to manipulate a large (10...,python|performance|data-structures|graph-theory,"<p>Even though this question is now quite old,...",81
4,1476,How do you express binary literals in Python?,<p>How do you express an integer as a binary n...,python|syntax|binary|integer|literals,<p>I am pretty sure this is one of the things ...,369


The following code block shows the result of combining answers and their scores

In [13]:
# Compare the largest single-answer score with the largest summed per-question score.
for label, frame in (('Max score before: ', df), ('Max score after: ', deduped_df)):
    print(label)
    print(np.max(frame.score.values))

Max score before: 
5842
Max score after: 
9163


A couple of helper functions for Text Preprocessing. The steps followed to process a piece of raw text are:

1. Convert raw text into tokens
2. Convert tokens to lower case
3. Remove punctuation
4. Remove Stopwords<br>
Note: we skipped the removal of numeric data since we felt it would remove precious contextual information. We also skipped a 'Stemming/Lemmatization' step because we did not want to alter the domain-specific terms used in our corpus and risk losing precious information.

In [15]:
import re
import nltk
import inflect
from nltk.corpus import stopwords

def tokenize_text(text):
    """Tokenize ``text`` with the spaCy tokenizer and return lowercased token strings.

    Tokens that consist only of whitespace are dropped.
    """
    lowered = []
    for tok in EN.tokenizer(text):
        if tok.is_space:
            continue
        lowered.append(tok.text.lower())
    return lowered

def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in ``words``."""
    return [word.lower() for word in words]

def remove_punctuation(words):
    """Strip every character that is neither a word character nor whitespace from each token.

    Tokens that become empty after stripping are left out of the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [token for token in stripped if token != '']

def remove_stopwords(words):
    """Remove stop words from list of tokenized words.

    Args:
        words: iterable of token strings.

    Returns:
        A new list with the tokens of ``words`` that are not in the English
        stop-word list, in their original order.
    """
    # Fetch the stop-word list once per call instead of once per token, and
    # keep it in a set so each membership test is a hash lookup.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def normalize(words):
    """Lowercase the tokens, strip punctuation from them, then remove stop words."""
    return remove_stopwords(remove_punctuation(to_lowercase(words)))

def tokenize_code(text):
    """A very basic procedure for tokenizing code strings.

    Args:
        text: the code string to split.

    Returns:
        A list with every maximal run of word characters (letters, digits,
        underscore) found in ``text``, in order of appearance.
    """
    # RegexpTokenizer is not imported anywhere in the visible notebook, so the
    # previous call raised NameError. re.findall with the same pattern yields
    # the same tokens and only needs the already-imported `re` module.
    return re.findall(r'\w+', text)

def preprocess_text(text):
    """Tokenize ``text``, normalize the tokens and join them back with single spaces."""
    tokens = tokenize_text(text)
    cleaned = normalize(tokens)
    return ' '.join(cleaned)

In [19]:
from bs4 import BeautifulSoup
from textblob import TextBlob

def _first_paragraph_and_pre(html):
    """Return the text of the first <p> followed by the text of the first <pre> in ``html``.

    The first <code> element is removed before the text is extracted. Only the
    first <p> and the first <pre> are used; a missing element contributes nothing.
    """
    soup = BeautifulSoup(html, 'lxml')
    first_code = soup.code
    if first_code is not None:
        first_code.decompose()     # Remove the code section
    text = ''
    first_p = soup.p
    if first_p is not None:
        text += first_p.get_text()
    first_pre = soup.pre
    if first_pre is not None:
        text += first_pre.get_text()
    return text


title_list = []
content_list = []
url_list = []
comment_list = []
sentiment_polarity_list = []
sentiment_subjectivity_list = []
vote_list = []
tag_list = []
corpus_list = []

# itertuples avoids building a Series per row (as iterrows does); fields are read by column name.
for row in deduped_df.itertuples(index=False):
    title_list.append(row.title)    # Get question title
    tag_list.append(row.tags)     # Get question tags

    # Questions: title followed by the text extracted from the question body
    question_text = _first_paragraph_and_pre(row.body)
    content_list.append(str(row.title) + ' ' + str(question_text))

    url_list.append('https://stackoverflow.com/questions/' + str(row.id))

    # Answers: row.answers holds all answers of the question joined by newlines
    comment_list.append(_first_paragraph_and_pre(row.answers))

    vote_list.append(row.score)       # Append votes

    # Combine the updated body and answers to make the corpus
    corpus_list.append(content_list[-1] + ' ' + comment_list[-1])

    # Sentiment is computed on the raw answer HTML, not on the extracted text
    sentiment = TextBlob(row.answers).sentiment
    sentiment_polarity_list.append(sentiment.polarity)
    sentiment_subjectivity_list.append(sentiment.subjectivity)

content_token_df = pd.DataFrame({
    'original_title': title_list,
    'post_corpus': corpus_list,
    'question_content': content_list,
    'question_url': url_list,
    'tags': tag_list,
    'overall_scores': vote_list,
    'answers_content': comment_list,
    'sentiment_polarity': sentiment_polarity_list,
    'sentiment_subjectivity': sentiment_subjectivity_list,
})

In [20]:
# Preview the assembled per-question table
content_token_df.head()

Unnamed: 0,original_title,post_corpus,question_content,question_url,tags,overall_scores,answers_content,sentiment_polarity,sentiment_subjectivity
0,Continuous Integration System for a Python Cod...,Continuous Integration System for a Python Cod...,Continuous Integration System for a Python Cod...,https://stackoverflow.com/questions/535,python|continuous-integration|extreme-programming,110,One possibility is Hudson. It's written in Ja...,0.159901,0.487469
1,How do I use itertools.groupby()?,How do I use itertools.groupby()? I haven't be...,How do I use itertools.groupby()? I haven't be...,https://stackoverflow.com/questions/773,python|iteration,847,Can you show us your code?,-0.137932,0.737756
2,Adding a Method to an Existing Object Instance,Adding a Method to an Existing Object Instance...,Adding a Method to an Existing Object Instance...,https://stackoverflow.com/questions/972,python|oop|methods|monkeypatching,1148,You can use lambda to bind a method to an inst...,0.119248,0.405317
3,What is the most efficient graph data structur...,What is the most efficient graph data structur...,What is the most efficient graph data structur...,https://stackoverflow.com/questions/1171,python|performance|data-structures|graph-theory,81,"Even though this question is now quite old, I ...",0.179258,0.511521
4,How do you express binary literals in Python?,How do you express binary literals in Python? ...,How do you express binary literals in Python? ...,https://stackoverflow.com/questions/1476,python|syntax|binary|integer|literals,369,I am pretty sure this is one of the things due...,-0.064497,0.605969


In [21]:
# Convert raw text data of tags into lists. Values that are already lists are
# left alone so the cell can be re-run without failing on list.split.
content_token_df.tags = content_token_df.tags.apply(
    lambda x: x.split('|') if isinstance(x, str) else x
)

# Make a dictionary to count the frequencies for all tags
tag_freq_dict = {}
for tags in content_token_df.tags:
    for tag in tags:
        # The first sighting counts as 1; starting at 0 would undercount every tag by one.
        tag_freq_dict[tag] = tag_freq_dict.get(tag, 0) + 1

The plan is to keep only the posts that carry at least two of the `most_common_tags` — in practice `python` plus at least one other common tag.

In [22]:
import heapq
NUM_COMMON_TAGS = 20   # how many of the most frequent tags to keep
# Tag names ordered from most to least frequent
most_common_tags = heapq.nlargest(NUM_COMMON_TAGS, tag_freq_dict, key=tag_freq_dict.get)

In [23]:
# Show the selected tags, most frequent first
most_common_tags

['python',
 'python-3.x',
 'pandas',
 'django',
 'python-2.7',
 'numpy',
 'list',
 'matplotlib',
 'dataframe',
 'dictionary',
 'regex',
 'tkinter',
 'flask',
 'string',
 'tensorflow',
 'csv',
 'arrays',
 'json',
 'beautifulsoup',
 'selenium']

In [24]:
# Positions of the posts that carry more than one of the common tags.
# The minimum is 2 because 'python' is a common tag for all.
common_tag_set = set(most_common_tags)   # built once instead of once per row
final_indices = [
    position
    for position, tags in enumerate(content_token_df.tags.values.tolist())
    if len(common_tag_set.intersection(tags)) > 1
]

In [26]:
# Take an explicit copy: columns of final_data are assigned to later, and doing that
# on a bare iloc selection triggers pandas' SettingWithCopyWarning.
final_data = content_token_df.iloc[final_indices].copy()

**Data Normalization**
<br>
1. We created a separate 'processed_title' column so that the original titles are preserved and can be served unchanged in the web interface.
2. We also mean-normalized the numeric scores: each score is centred on the mean and divided by the range (max − min).

In [None]:
import spacy
EN = spacy.load('en_core_web_sm')

# Preprocess text for 'question_content', 'post_corpus' and a new column 'processed_title'.
# assign() returns a new frame, so no column is written into a selection of
# content_token_df (which would trigger SettingWithCopyWarning).
final_data = final_data.assign(
    question_content=final_data.question_content.apply(preprocess_text),
    post_corpus=final_data.post_corpus.apply(preprocess_text),
    processed_title=final_data.original_title.apply(preprocess_text),
)

# Normalize numeric data for the scores: centre on the mean, then divide by the range (max - min)
score_range = final_data.overall_scores.max() - final_data.overall_scores.min()
final_data = final_data.assign(
    overall_scores=(final_data.overall_scores - final_data.overall_scores.mean()) / score_range
)

In [None]:
# Combine the tag lists back into '|'-separated text and remove answers_content,
# which is already included in post_corpus.
final_data = (
    final_data
    # Values that are already text are kept as they are, so the cell can be re-run.
    .assign(tags=final_data.tags.apply(lambda x: x if isinstance(x, str) else '|'.join(x)))
    # drop() returns a new frame; its result must be kept or the column is never removed.
    # errors='ignore' keeps a second run from failing once the column is gone.
    .drop(columns=['answers_content'], errors='ignore')
)
final_data.head()