### Import libraries

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger("lda").setLevel(logging.WARNING)
try:
  nltk.download('stopwords')
except:
  pass
try:
  nltk.download('punkt')
except:
  pass

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utkarsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utkarsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### Import stopwords

In [2]:
stopwords = stopwords.words('english')

## Reading Questions.csv

In [43]:
df_questions = pd.read_csv("Questions.csv",usecols=['Id', 'CreationDate', 'Score', 'Title','Body'], encoding='latin1')
df_questions = df_questions.dropna()
df_questions.head()

Unnamed: 0,Id,CreationDate,Score,Title,Body
0,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2008-08-01T18:42:19Z,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


## Reading Answers.csv

In [5]:
df_answers = pd.read_csv("Answers.csv",usecols = ['ParentId','Score','Body'], encoding='latin1')
df_answers.columns = ['Id','Answer_Score','Answer_Body']
df_answers

Unnamed: 0,Id,Answer_Score,Answer_Body
0,90,13,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,80,12,<p>I wound up using this. It is a kind of a ha...
2,180,1,<p>I've read somewhere the human eye can't dis...
3,260,4,"<p>Yes, I thought about that, but I soon figur..."
4,260,28,"<p><a href=""http://www.codeproject.com/Article..."
...,...,...,...
2014511,40143190,0,"<p>Tanks to <a href=""http://stackoverflow.com/..."
2014512,40137110,1,<h1>tl;dr</h1>\n\n<pre><code>ZonedDateTime.par...
2014513,40141860,0,<p>I came up with a very dirty workaround. Bef...
2014514,40077010,0,<p>I solved my own problem defining the follow...


## Reading Tags.csv

In [50]:
df_tags = pd.read_csv("Tags.csv" ,encoding='latin1')
df_tags

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn
...,...,...
3750989,40143360,javascript
3750990,40143360,vue.js
3750991,40143380,npm
3750992,40143380,mocha


## Preprocessing Datasets

### 1. Define functions

#####  Normalize Text: normalize_text() to remove tags of html and codes etc

In [9]:
def normalize_text(text):
    tm1 = re.sub('<pre>.*?</pre>', '', text, flags=re.DOTALL)
    tm2 = re.sub('<code>.*?</code>', '', tm1, flags=re.DOTALL)
    tm3 = re.sub('<[^>]+>', '', tm2, flags=re.DOTALL)
    x = tm3.replace("\n", "")
    return x

#### Remove Stopwords and Punctuations: remove_stopwords_punctuations() to remove them

In [18]:
def remove_stopwords_punctuations(text):
    #REmove punctuations ~30 seconds
    text = text.replace(r'[^\w\s]+', '')
    # remove stopwords ~4 minutes
    text = ' '.join([word for word in text.split() if word not in (stopwords)])
    return text    

#### Concatenate multiple tags: same question having multiple tags is handled

In [19]:
def concatenate_multiple_tags(df):
    df = df.groupby('Id', as_index=False).agg({'Tag' : ','.join})
    return df

### 2. Application of preprocessing functions on datasets

#### on Questions.csv

In [20]:
df_questions['Body'] = df_questions['Body'].apply(lambda x: normalize_text(x))
df_questions['Body'] = df_questions['Body'].apply(lambda x: remove_stopwords_punctuations(x))
df_questions

Unnamed: 0,Id,CreationDate,Score,Title,Body
0,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,I've written database generation script SQL wa...
1,90,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,Are really good tutorials explaining branching...
2,120,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,Has anyone got experience creating SQL-based A...
3,180,2008-08-01T18:42:19Z,53,Function for creating color wheels,This something I've pseudo-solved many times n...
4,260,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,I little game written C#. It uses database bac...
...,...,...,...,...,...
1264211,40143210,2016-10-19T23:38:01Z,0,URL routing in PHP (MVC),I building custom MVC project I base folder co...
1264212,40143300,2016-10-19T23:48:09Z,0,Bigquery.Jobs.Insert - Resumable Upload?,The API docs show able make insert Jobs resuma...
1264213,40143340,2016-10-19T23:52:50Z,1,Obfuscating code in android studio,Under minifyEnabled I changed false true. Then...
1264214,40143360,2016-10-19T23:55:24Z,0,How to fire function after v-model change?,I input I use filter array objects Vue. I'm us...


#### on Answers.csv

In [22]:
df_answers['Answer_Body'] = df_answers['Answer_Body'].apply(lambda x: normalize_text(x))
df_answers['Answer_Body'] = df_answers['Answer_Body'].apply(lambda x: remove_stopwords_punctuations(x))
df_answers

Unnamed: 0,Id,Answer_Score,Answer_Body
0,90,13,Version Control Subversion A good resource sou...
1,80,12,"I wound using this. It kind hack, actually wor..."
2,180,1,I've read somewhere human eye can't distinguis...
3,260,4,"Yes, I thought that, I soon figured another Do..."
4,260,28,Oleg Shilo's C# Script solution (at The Code P...
...,...,...,...
2014511,40143190,0,Tanks SO answer I found answer myself:
2014512,40137110,1,tl;dr 19/10/16java.timeYou using troublesome o...
2014513,40141860,0,I came dirty workaround. Before bootstraping m...
2014514,40077010,0,I solved problem defining following function. ...


#### on Tags.csv

In [54]:
df_tags['Tag'] = df_tags['Tag'].apply(lambda x: str(x))
df_tags = concatenate_multiple_tags(df_tags)
df_tags

Unnamed: 0,Id,Tag
0,80,"flex,actionscript-3,air"
1,90,"svn,tortoisesvn,branch,branching-and-merging"
2,120,"sql,asp.net,sitemap"
3,180,"algorithm,language-agnostic,colors,color-space"
4,260,"c#,.net,scripting,compiler-construction"
...,...,...
1264211,40143210,"php,.htaccess"
1264212,40143300,google-bigquery
1264213,40143340,"android,android-studio"
1264214,40143360,"javascript,vue.js"


#### Merging dataframes

In [35]:
merged = pd.merge(df_questions, df_answers, how='inner', on="Id")

In [36]:
merged = pd.merge(merged, df_tags, how='inner', on="Id")

In [37]:
merged

Unnamed: 0,Id,CreationDate,Score,Title,Body,Answer_Score,Answer_Body,Tag
0,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,I've written database generation script SQL wa...,12,"I wound using this. It kind hack, actually wor...","flex,actionscript-3,air"
1,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,I've written database generation script SQL wa...,6,The SQLite API function called something like ...,"flex,actionscript-3,air"
2,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,I've written database generation script SQL wa...,1,What making delimiter something little complex...,"flex,actionscript-3,air"
3,90,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,Are really good tutorials explaining branching...,13,Version Control Subversion A good resource sou...,"svn,tortoisesvn,branch,branching-and-merging"
4,90,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,Are really good tutorials explaining branching...,2,You also try Version Control Standalone Progra...,"svn,tortoisesvn,branch,branching-and-merging"
...,...,...,...,...,...,...,...,...
2014511,40142910,2016-10-19T23:05:49Z,0,Validation for must_be_below_user_limit allowi...,So I Am building multi-tenant app Rails 4 Apar...,0,Try add method.,"ruby-on-rails,validation,ruby-on-rails-4"
2014512,40142940,2016-10-19T23:08:42Z,-1,Drawing an iscosceles triangle of asteriks on C++,I learning c++ I'm trying draw iscosceles tria...,0,Here's it:Output:The basic idea switch mapping...,c++
2014513,40142940,2016-10-19T23:08:42Z,-1,Drawing an iscosceles triangle of asteriks on C++,I learning c++ I'm trying draw iscosceles tria...,1,Alternative @space_voyager code support dynami...,c++
2014514,40143190,2016-10-19T23:36:01Z,1,How to execute multiline python code from a ba...,I need extend shell script (bash). As I much f...,5,Use here-doc:,"python,bash,multiline"


In [38]:
merged = merged.astype("string")
merged[['Body', 'Tag']].fillna("", inplace = True)
# merged

Unnamed: 0,Id,CreationDate,Score,Title,Body,Answer_Score,Answer_Body,Tag
0,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,I've written database generation script SQL wa...,12,"I wound using this. It kind hack, actually wor...","flex,actionscript-3,air"
1,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,I've written database generation script SQL wa...,6,The SQLite API function called something like ...,"flex,actionscript-3,air"
2,80,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,I've written database generation script SQL wa...,1,What making delimiter something little complex...,"flex,actionscript-3,air"
3,90,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,Are really good tutorials explaining branching...,13,Version Control Subversion A good resource sou...,"svn,tortoisesvn,branch,branching-and-merging"
4,90,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,Are really good tutorials explaining branching...,2,You also try Version Control Standalone Progra...,"svn,tortoisesvn,branch,branching-and-merging"
...,...,...,...,...,...,...,...,...
2014511,40142910,2016-10-19T23:05:49Z,0,Validation for must_be_below_user_limit allowi...,So I Am building multi-tenant app Rails 4 Apar...,0,Try add method.,"ruby-on-rails,validation,ruby-on-rails-4"
2014512,40142940,2016-10-19T23:08:42Z,-1,Drawing an iscosceles triangle of asteriks on C++,I learning c++ I'm trying draw iscosceles tria...,0,Here's it:Output:The basic idea switch mapping...,c++
2014513,40142940,2016-10-19T23:08:42Z,-1,Drawing an iscosceles triangle of asteriks on C++,I learning c++ I'm trying draw iscosceles tria...,1,Alternative @space_voyager code support dynami...,c++
2014514,40143190,2016-10-19T23:36:01Z,1,How to execute multiline python code from a ba...,I need extend shell script (bash). As I much f...,5,Use here-doc:,"python,bash,multiline"


In [42]:
merged.to_csv("questions_answers_tags.csv", index = False, encoding = 'latin1')