In [1]:
# Mount Google Drive inside the Colab VM, then switch the working directory to
# the project folder so the relative paths used in later cells
# (e.g. 'StackSample/Questions.csv') resolve against it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab\ Notebooks/Question\ Tagging

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Question Tagging


In [2]:
import pandas as pd
import pickle
import bs4 as bs
import re

# Number of most frequent tags to keep; only rows carrying one of these tags
# survive the filtering done further down.
K = 20

In [3]:
# Load the questions table; the file is decoded as latin_1.
questions_path = 'StackSample/Questions.csv'
questions = pd.read_csv(questions_path, encoding='latin_1')
print('Shape -', questions.shape)
questions.iloc[:2]

Shape - (1264216, 7)


Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...


In [4]:
# Load the (Id, Tag) pairs; a question with several tags spans several rows.
tags_path = 'StackSample/Tags.csv'
tags = pd.read_csv(tags_path)
print('Shape -', tags.shape)
tags.iloc[:2]

Shape - (3750994, 2)


Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3


### Merge Datasets on Top K Tags

In [5]:
# Replace hyphens in tag names with spaces ('actionscript-3' -> 'actionscript 3').
# Every value goes through str() first, so a missing tag becomes the text 'nan'.
tags['Tag'] = tags['Tag'].map(lambda raw_tag: str(raw_tag).replace('-', ' '))
tags.head(2)

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript 3


In [6]:
# value_counts() sorts by descending frequency, so the first K index labels
# are the K most common tags.
tag_frequencies = tags['Tag'].value_counts()
top_k_tags = tag_frequencies.head(K).index.tolist()
top_k_tags

['javascript',
 'java',
 'c#',
 'php',
 'android',
 'jquery',
 'python',
 'html',
 'c++',
 'ios',
 'mysql',
 'css',
 'sql',
 'asp.net',
 'objective c',
 'ruby on rails',
 '.net',
 'c',
 'iphone',
 'angularjs']

In [7]:
# Keep only the (Id, Tag) rows whose tag is one of the top-K tags.
is_top_k = tags['Tag'].isin(top_k_tags)
tags = tags.loc[is_top_k]
print(tags.shape)
tags.head(2)

(1119155, 2)


Unnamed: 0,Id,Tag
7,120,sql
8,120,asp.net


In [8]:
# Collect the tags of each question into one array per Id.
# The 'Tag' column is selected *before* apply so the applied function only
# receives the tag values; calling apply on the whole grouped frame also hands
# it the grouping column, which recent pandas versions deprecate and warn about.
tags = (
    tags.groupby('Id')['Tag']
    .apply(lambda tag_column: tag_column.values)
    .reset_index(name='Tags')
)
tags.head(2)

Unnamed: 0,Id,Tags
0,120,"[sql, asp.net]"
1,260,"[c#, .net]"


In [9]:
# Flatten each per-question tag collection into one comma-separated string.
tags['Tags'] = tags['Tags'].map(','.join)
tags.head(2)

Unnamed: 0,Id,Tags
0,120,"sql,asp.net"
1,260,"c#,.net"


In [10]:
# Right join on Id: one output row per Id present in `tags`, with the matching
# question columns attached. The two source frames are then released.
dataset = questions.merge(tags, on='Id', how='right')
del questions, tags
dataset.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,Tags
0,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"sql,asp.net"
1,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"c#,.net"
2,330,63.0,2008-08-02T02:51:36Z,,29,Should I use nested classes in this case?,<p>I am working on a collection of classes use...,c++
3,470,71.0,2008-08-02T15:11:47Z,2016-03-26T05:23:29Z,13,Homegrown consumption of web services,<p>I've been writing a few web services for a ...,.net
4,650,143.0,2008-08-03T11:12:52Z,,79,Automatically update version number,<p>I would like the version property of my app...,c#


In [11]:
# Drop everything except the text columns and the label column.
kept_columns = ['Title', 'Body', 'Tags']
dataset = dataset.loc[:, kept_columns]
dataset.head()

Unnamed: 0,Title,Body,Tags
0,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"sql,asp.net"
1,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"c#,.net"
2,Should I use nested classes in this case?,<p>I am working on a collection of classes use...,c++
3,Homegrown consumption of web services,<p>I've been writing a few web services for a ...,.net
4,Automatically update version number,<p>I would like the version property of my app...,c#


In [12]:
# Show the raw title and HTML body of the first row.
dataset['Title'].iloc[0], dataset['Body'].iloc[0]

('ASP.NET Site Maps',
 "<p>Has anyone got experience creating <strong>SQL-based ASP.NET</strong> site-map providers?</p>\n\n<p>I've got the default XML file <code>web.sitemap</code> working properly with my Menu and <strong>SiteMapPath</strong> controls, but I'll need a way for the users of my site to create and modify pages dynamically.</p>\n\n<p>I need to tie page viewing permissions into the standard <code>ASP.NET</code> membership system as well.</p>\n")

In [13]:
def parse_html(col):
    """Convert an HTML fragment into plain, whitespace-normalised text.

    Parameters
    ----------
    col : str
        Raw HTML markup, e.g. the body of one question.

    Returns
    -------
    str
        The text content of the fragment's ``<body>`` with tags removed,
        HTML entities decoded, and every run of whitespace collapsed to a
        single space. Returns ``''`` when the input is missing (not a string),
        blank, or parses to a document without a body.
    """
    # Missing values reach this function as NaN/None, and blank markup has
    # nothing to extract; answer with an empty string instead of raising or
    # leaking the text 'None' into the corpus.
    if not isinstance(col, str) or not col.strip():
        return ''

    body = bs.BeautifulSoup(col, 'lxml').body
    if body is None:
        return ''

    # get_text() walks the parsed tree, so entities such as &lt; come back as
    # real characters and literal text is never mistaken for a tag. That is
    # not the case when regex-stripping the re-serialised markup.
    text = body.get_text(separator=' ')

    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text).strip()

In [14]:
# Strip the HTML from every question body, one element at a time.
dataset['parsed_body'] = dataset['Body'].map(parse_html)

In [15]:
# Preview the frame, now including the parsed_body column added above.
dataset.head()

Unnamed: 0,Title,Body,Tags,parsed_body
0,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"sql,asp.net",Has anyone got experience creating SQL-based A...
1,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"c#,.net",I have a little game written in C#. It uses a ...
2,Should I use nested classes in this case?,<p>I am working on a collection of classes use...,c++,I am working on a collection of classes used f...
3,Homegrown consumption of web services,<p>I've been writing a few web services for a ...,.net,I've been writing a few web services for a .ne...
4,Automatically update version number,<p>I would like the version property of my app...,c#,I would like the version property of my applic...


In [16]:
# Combine the title and the cleaned body into one text field, space-separated.
dataset['new_body'] = dataset['Title'].str.cat(dataset['parsed_body'], sep=' ')
dataset.head()

Unnamed: 0,Title,Body,Tags,parsed_body,new_body
0,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"sql,asp.net",Has anyone got experience creating SQL-based A...,ASP.NET Site Maps Has anyone got experience cr...
1,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"c#,.net",I have a little game written in C#. It uses a ...,Adding scripting functionality to .NET applica...
2,Should I use nested classes in this case?,<p>I am working on a collection of classes use...,c++,I am working on a collection of classes used f...,Should I use nested classes in this case? I am...
3,Homegrown consumption of web services,<p>I've been writing a few web services for a ...,.net,I've been writing a few web services for a .ne...,Homegrown consumption of web services I've bee...
4,Automatically update version number,<p>I would like the version property of my app...,c#,I would like the version property of my applic...,Automatically update version number I would li...


In [17]:
# Keep only the combined text and the labels; the combined text takes over
# the 'Body' column name.
dataset = (
    dataset.loc[:, ['new_body', 'Tags']]
    .rename(columns={'new_body': 'Body'})
)
dataset.head()

Unnamed: 0,Body,Tags
0,ASP.NET Site Maps Has anyone got experience cr...,"sql,asp.net"
1,Adding scripting functionality to .NET applica...,"c#,.net"
2,Should I use nested classes in this case? I am...,c++
3,Homegrown consumption of web services I've bee...,.net
4,Automatically update version number I would li...,c#


In [18]:
# Lower-case the text column.
lowercased_body = dataset['Body'].str.lower()
dataset = dataset.assign(Body=lowercased_body)
dataset.head()

Unnamed: 0,Body,Tags
0,asp.net site maps has anyone got experience cr...,"sql,asp.net"
1,adding scripting functionality to .net applica...,"c#,.net"
2,should i use nested classes in this case? i am...,c++
3,homegrown consumption of web services i've bee...,.net
4,automatically update version number i would li...,c#


In [19]:
# Persist the processed (Body, Tags) table and the list of top-K tags.
dataset.to_csv('processed_data.csv', index=False)
with open('top_k_tags.pkl', 'wb') as tags_file:
    tags_file.write(pickle.dumps(top_k_tags))