In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3

import re

# Import Dataset

In [2]:
# Location of the SQLite database, relative to the notebook.
path = "../data/raw/td_V2.db"

jira_query = """SELECT *
     FROM jira_issues"""

# Using a sqlite3 connection as a context manager only scopes a transaction;
# it does not close the handle. Close it explicitly once the table has been
# read into memory so the database file is not left open by the kernel.
con = sqlite3.connect(path)
try:
    jira = pd.read_sql(jira_query, con)
finally:
    con.close()

# Free-text issue descriptions: the raw input of the preprocessing below.
description = jira['DESCRIPTION']


# Preprocessing

## Unifying style

Convert all text to lowercase

In [3]:
# Normalise case so later matching (stop words, project names) is case-insensitive.
description_1 = description.map(lambda text: text.lower())

Remove symbols:
- Replace line-break characters (\r and \n) with spaces
- Remove text in {} and []
- Replace versions, URLs, paths, file names and dates with the special words VERSION, URL, PATH, FILE and DATE
- Remove punctuation
- Replace multiple spaces with a single space

In [4]:
# Regular expressions used to strip markup and to replace noisy, high-cardinality
# fragments with a single placeholder word.
brackets_re = r'\[+(.*?)\]+|\{+(.*?)\}+'
URL_re = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
PATH_re = r'(\/)*[\w\-\_.\<\>]+(\/[\w\-\_.\<\>\=]+)+'
version_re = r'\d+(\.\d+)+'
file_re = r'[\w\-\_]+(\.[\w\-\_\=]+)+'
# dd<sep>mm<sep>yy(yy) dates, with <sep> one of "/", "-" or "." used consistently.
# Changes with respect to the previous pattern:
#   * the inner "$" / "^" anchors are replaced by word boundaries, so a date is
#     found anywhere in the text instead of only at the very start or end of it;
#   * month names are matched case-insensitively (the input is already lowercased,
#     so the capitalised "Jan", "Feb", ... could never match);
#   * the stray comma in the character class "[1,3-9]" is removed.
date_re = r'(?i)\b(?:(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:jan|mar|may|jul|aug|oct|dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2]|(?:jan|mar|apr|may|jun|jul|aug|sep|oct|nov|dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})|(?:29(\/|-|\.)(?:0?2|(?:feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:jan|feb|mar|apr|may|jun|jul|aug|sep))|(?:1[0-2]|(?:oct|nov|dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2}))\b'
punctuation_re = r'[^\w\s]'
multiplespace_re = r'  +'

# Substitutions, applied in this exact order. Order matters: URLs must be replaced
# before paths, and placeholders must be inserted before punctuation is stripped.
# NOTE(review): dates written with "/" or "." are still claimed by the earlier
# PATH / VERSION / FILE steps, so only "-"-separated dates reach the DATE step.
# Moving DATE earlier would turn three-part versions such as 1.2.10 into DATE,
# so the original order is kept.
_CLEANING_STEPS = [
    (re.compile(brackets_re), ''),
    (re.compile(URL_re), 'URL'),
    (re.compile(PATH_re), 'PATH'),
    (re.compile(version_re), 'VERSION'),
    (re.compile(file_re), 'FILE'),
    (re.compile(date_re), 'DATE'),
    (re.compile(punctuation_re), ' '),
    (re.compile(multiplespace_re), ' '),
]


def _clean_description(txt):
    """Return `txt` with line breaks, bracketed markup and punctuation removed,
    and URLs, paths, versions, file names and dates replaced by placeholders.

    Parameters
    ----------
    txt : str
        A lowercased issue description.

    Returns
    -------
    str
        The cleaned text; runs of spaces are collapsed to a single space.
    """
    txt = txt.replace('\r', ' ').replace('\n', ' ')
    for pattern, replacement in _CLEANING_STEPS:
        txt = pattern.sub(replacement, txt)
    return txt


description_2 = description_1.apply(_clean_description)

## Stopword removal
Three steps:
1. Tokenization
2. Stopword removal
3. Join string

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the tokenizer models and the stop-word lists (no-op when already present).
# 'punkt_tab' is the resource word_tokenize loads on recent NLTK releases, while
# older releases load 'punkt'; requesting both keeps a fresh "Run All" working
# regardless of the installed version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/raulhigueras/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raulhigueras/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# 1. Tokenization: iterate over the values directly instead of indexing by
#    0..n-1, which only works while the Series carries a default RangeIndex.
words = [word_tokenize(txt) for txt in description_2]

# 2. Stopword removal: `sw` keeps the original list; membership is tested against
#    a set so each lookup is O(1) rather than a scan of the whole stop-word list.
sw = stopwords.words('english')
sw_lookup = set(sw)
words_wo_sw = [[word for word in txt if word not in sw_lookup] for txt in words]

# 3. Join string: rebuild one string per issue, aligned with description_2's index.
description_3 = pd.Series([' '.join(txt) for txt in words_wo_sw], index=description_2.index)

## Project-specific terms removal

In [7]:
# Project names as they would appear in the cleaned text: the part after the last
# ":" of the project id, with "-" turned into a space and lowercased (the
# descriptions were lowercased and stripped of punctuation earlier).
# Non-string ids (e.g. missing values) and empty names are skipped.
project_names = {
    name.split(":")[-1].replace("-", " ").strip().lower()
    for name in set(jira['PROJECT_ID'])
    if isinstance(name, str)
}
project_names.discard("")

# Longest names first so that e.g. "commons cli" wins over "commons"; the secondary
# alphabetical key makes the pattern identical from run to run (set order is not).
_ordered_names = sorted(project_names, key=lambda name: (-len(name), name))

if _ordered_names:
    # Names are escaped so regex metacharacters are matched literally. The
    # lookarounds require whole-word matches and, unlike padding the name with
    # spaces, also catch names at the start/end of the text and back-to-back.
    project_names_re = (
        r'(?<!\w)(?:'
        + r'|'.join(re.escape(name) for name in _ordered_names)
        + r')(?!\w)'
    )
else:
    # An empty alternation would match everywhere; use a pattern that never matches.
    project_names_re = r'(?!)'

# Replace each project name by a space, then re-normalise spacing.
description_4 = description_3.apply(
    lambda txt: ' '.join(re.sub(project_names_re, ' ', txt).split())
)

# Some examples
Below, we show two examples of the preprocessing. Each example consists of four strings: the starting (lowercased) description followed by the output of each subsequent processing step. Both examples were chosen to illustrate how the code works.

In [11]:
idx = 10050

# Print the same issue after each preprocessing stage, each followed by a rule.
for stage in (description_1, description_2, description_3, description_4):
    print(stage[idx], end="\n----\n")

there are a bunch of files who have either a plain wrong number or at least something that is not really consistent with https://thrift.apache.org/docs/committers/howtoversion 

additionally, the doap.rdf is slightly outdated.
----
there are a bunch of files who have either a plain wrong number or at least something that is not really consistent with URL additionally the FILE is slightly outdated 
----
bunch files either plain wrong number least something really consistent URL additionally FILE slightly outdated
----
bunch files either plain wrong number least something really consistent URL additionally FILE slightly outdated
----


In [12]:
idx = 9643

# Print the same issue after each preprocessing stage, each followed by a rule.
for stage in (description_1, description_2, description_3, description_4):
    print(stage[idx], end="\n----\n")

when using thttpclient to establish authenticated connections over http, it should be responsible for saving the cookie sent by the server in the set-cookie response header.

for example, this is useful when using thrift to establish a connection with a hive server exposing thrift over http with authentication activated (regardless of the mechanism).
----
when using thttpclient to establish authenticated connections over http it should be responsible for saving the cookie sent by the server in the set cookie response header for example this is useful when using thrift to establish a connection with a hive server exposing thrift over http with authentication activated regardless of the mechanism 
----
using thttpclient establish authenticated connections http responsible saving cookie sent server set cookie response header example useful using thrift establish connection hive server exposing thrift http authentication activated regardless mechanism
----
using thttpclient establish authe

# Next Step: Modeling
Two different sections.
1. **Feature extraction:** word embeddings, tf-idf, etc.
2. **Model design:** clustering, classifiers, etc.