# Automatically tag questions using Stack Overflow data

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell runs so edits to local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [12]:
import pandas as pd
import pickle
from utils import *

In [3]:
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by pandas and
# other libraries -- confirm that silencing all of them is intended.
warnings.filterwarnings("ignore")
# Register `progress_apply` on pandas objects; its progress bars are labelled
# "Preprocess Data".
tqdm.pandas(desc="Preprocess Data")

In [4]:
import json
from raw_data import TokensRetriever, JsonConverter

# Raw inputs: `file_train_raw` feeds the token retrieval and the train
# conversion, `file_eval_raw` feeds the eval conversion.
file_train_raw = "data/pyast/python100k_train.json"
file_eval_raw = "data/pyast/python50k_eval.json"

# Token files: passed as destinations to the token retrieval step;
# `file_terminals` is also passed to the conversion calls.
file_non_terminals = "data/pyast/non_terminals.json"
file_terminals = "data/pyast/terminals.json"

# Destination files of the train / eval conversions.
file_train_converted = "data/pyast/programs_training_seq.json"
file_eval_converted = "data/pyast/programs_eval_seq.json"

# Not referenced by the cells visible in this notebook section
# (presumably consumed elsewhere -- TODO confirm).
file_train = "data/pyast/file_train.json"
file_eval = "data/pyast/file_eval.json"

# Passed as `encoding=` and `lim=` to the retrieval and conversion calls.
ENCODING = 'ISO-8859-1'
LIM = 100000

## Data

In [2]:
# Read the three Stack Overflow CSV files (questions, answers, tags); each one
# is decoded as ISO-8859-1.
questions, answers, tags = (
    pd.read_csv(csv_path, encoding='iso-8859-1')
    for csv_path in ("data/Questions.csv", "data/Answers.csv", "data/Tags.csv")
)

In [9]:
tags.head()

Unnamed: 0,Id,Tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python


In [3]:
answers.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...
2,536,161.0,2008-08-02T18:49:07Z,502,9,<p>You can use ImageMagick's convert utility f...
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B..."


In [4]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


## Preprocessing: clean the HTML

In [6]:
def clean_html(html_content):
    """
    Strip the HTML markup from a piece of text.

    Argument:
        html_content -- content in html form (e.g. the title or body of a
        question or answer); missing values (None or float NaN) are accepted

    Returns:
        clean_text -- python string containing the text extracted with the
        BeautifulSoup "html.parser" parser; a missing value is returned
        unchanged
    """
    # pandas stores missing cells as None/NaN, which are not markup that
    # BeautifulSoup can parse. Hand them back untouched so the value stays
    # "missing" downstream. (`x != x` is true only for NaN.)
    if html_content is None or (
        isinstance(html_content, float) and html_content != html_content
    ):
        return html_content

    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text()

In [7]:
# Replace the HTML in the question titles and bodies with plain text.
# The columns are overwritten in place, so the raw HTML is no longer available
# in `questions` after this cell has run.
questions["Title"] = questions["Title"].progress_apply(clean_html)
questions["Body"] = questions["Body"].progress_apply(clean_html)

HBox(children=(FloatProgress(value=0.0, description='Preprocess Data', max=607282.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Preprocess Data', max=607282.0, style=ProgressStyle(descr…




In [8]:
# Replace the HTML in the answer bodies with plain text (overwrites the column in place).
answers["Body"] = answers["Body"].progress_apply(clean_html)

HBox(children=(FloatProgress(value=0.0, description='Preprocess Data', max=987122.0, style=ProgressStyle(descr…




In [9]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,I am using the Photoshop's javascript API to f...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,I have a cross-platform (Python) application w...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,I'm starting work on a hobby project with a py...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,There are several ways to iterate over a resul...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,I don't remember whether I was dreaming or not...


In [10]:
answers.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,497,50.0,2008-08-02T16:56:53Z,469,4,open up a terminal (Applications->Utilities->T...
1,518,153.0,2008-08-02T17:42:28Z,469,2,I haven't been able to find anything that does...
2,536,161.0,2008-08-02T18:49:07Z,502,9,You can use ImageMagick's convert utility for ...
3,538,156.0,2008-08-02T18:56:56Z,535,23,One possibility is Hudson. It's written in Ja...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"We run Buildbot - Trac at work, I haven't used..."


## Save the data

In [15]:
# Persist the preprocessed frames. Each file is opened in a `with` block so it
# is flushed and closed as soon as the dump finishes, instead of leaving the
# handle open until garbage collection.
with open("data/questions_preprocess.pickle", "wb") as questions_file:
    pickle.dump(questions, questions_file)
with open("data/answers_preprocess.pickle", "wb") as answers_file:
    pickle.dump(answers, answers_file)
with open("data/tags.pickle", "wb") as tags_file:
    pickle.dump(tags, tags_file)

## Load Data

In [5]:
# Reload the frames saved by the "Save the data" step. Each file is opened in
# a `with` block so its handle is closed right after loading.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by this notebook or another trusted source.
with open("data/questions_preprocess.pickle", "rb") as questions_file:
    questions = pickle.load(questions_file)
with open("data/answers_preprocess.pickle", "rb") as answers_file:
    answers = pickle.load(answers_file)
with open("data/tags.pickle", "rb") as tags_file:
    tags = pickle.load(tags_file)

In [6]:
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,I am using the Photoshop's javascript API to f...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,I have a cross-platform (Python) application w...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,I'm starting work on a hobby project with a py...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,There are several ways to iterate over a resul...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,I don't remember whether I was dreaming or not...


In [7]:
answers.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,497,50.0,2008-08-02T16:56:53Z,469,4,open up a terminal (Applications->Utilities->T...
1,518,153.0,2008-08-02T17:42:28Z,469,2,I haven't been able to find anything that does...
2,536,161.0,2008-08-02T18:49:07Z,502,9,You can use ImageMagick's convert utility for ...
3,538,156.0,2008-08-02T18:56:56Z,535,23,One possibility is Hudson. It's written in Ja...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"We run Buildbot - Trac at work, I haven't used..."


In [8]:
tags.head()

Unnamed: 0,Id,Tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python


## Relate questions with tags

In [None]:
def relate_question_tag():
    """
    Placeholder for the step that relates questions with their tags.

    The original definition had no colon and no body, which is a syntax error.
    The intended logic is not written yet, so calling this function fails
    explicitly instead.

    Raises:
        NotImplementedError -- always, until the step is implemented
    """
    # TODO: implement the question/tag relation.
    raise NotImplementedError("relate_question_tag is not implemented yet")

## Use spaCy to preprocess the data for model training

# Data for code completion

## Download python data

In [17]:
# Run the repository's download script in a shell. In the recorded run it
# fetched py150.tar.gz (about 502 MB) and took roughly 20 minutes.
!./download_data.sh

downloading data
--2020-06-27 13:55:36--  http://files.srl.inf.ethz.ch/data/py150.tar.gz
Resolving files.srl.inf.ethz.ch (files.srl.inf.ethz.ch)... 129.132.85.35
Connecting to files.srl.inf.ethz.ch (files.srl.inf.ethz.ch)|129.132.85.35|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.sri.inf.ethz.ch/data/py150.tar.gz [following]
--2020-06-27 13:55:36--  https://files.sri.inf.ethz.ch/data/py150.tar.gz
Resolving files.sri.inf.ethz.ch (files.sri.inf.ethz.ch)... 129.132.85.35
Connecting to files.sri.inf.ethz.ch (files.sri.inf.ethz.ch)|129.132.85.35|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526642289 (502M) [application/x-gzip]
Saving to: ‘py150.tar.gz’


2020-06-27 14:16:31 (410 KB/s) - ‘py150.tar.gz’ saved [526642289/526642289]



## Retrieve tokens

In [24]:
# Extract the tokens from the raw training file and write them to the
# non-terminal and terminal destination files, using at most LIM entries.
TokensRetriever().get_and_write_tokens(
    dataset=file_train_raw,
    non_terminal_dest=file_non_terminals,
    terminal_dest=file_terminals,
    encoding=ENCODING,
    append_eof=True,
    lim=LIM,
)

100%|█████████▉| 99999/100000 [02:13<00:00, 750.41it/s] 


In [44]:
# Convert the raw train and eval files with identical settings; only the
# source file, destination file and progress-bar label differ.
# NOTE(review): the eval run reuses LIM (100000) and its recorded progress bar
# stopped at 50000/100000 -- confirm whether a smaller limit is intended there.
for raw_path, converted_path, label in (
    (file_train_raw, file_train_converted, 'Train:'),
    (file_eval_raw, file_eval_converted, 'Val:'),
):
    JsonConverter.convert_file(
        raw_file=raw_path,
        dest_file=converted_path,
        terminals_file=file_terminals,
        encoding=ENCODING,
        append_eof=True,
        lim=LIM,
        last_is_zero=False,
        name=label,
    )

Train:: 100%|█████████▉| 99999/100000 [03:41<00:00, 451.51it/s]
Val::  50%|█████     | 50000/100000 [01:59<01:59, 417.10it/s]
