In [55]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from matplotlib import pyplot as plt 
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from string import punctuation
from sklearn.metrics import classification_report, confusion_matrix

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Sequential
import warnings
warnings.filterwarnings('ignore')

import ssl
# Temporarily swap the default HTTPS context factory for the unverified one
# while the NLTK corpora are fetched, then put the original factory back.
# The original code left certificate verification disabled for every HTTPS
# request made later in the kernel session.
# NOTE(review): presumably this workaround exists because nltk.download()
# hit certificate errors on the author's machine — confirm it is still needed.
_original_https_context = getattr(ssl, '_create_default_https_context', None)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no unverified-context factory; leave ssl untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

try:
    nltk.download('stopwords')
    nltk.download('wordnet')
finally:
    # Restore normal certificate verification even if a download fails.
    if _original_https_context is not None:
        ssl._create_default_https_context = _original_https_context

ModuleNotFoundError: No module named 'tensorflow'

### Loading datasets for Eclipse, Mozilla, Thunderbird and combining to form 3 different dataframes

In [56]:
# Load the six semicolon-separated CSV files into six dataframes.
# NOTE(review): judging by the file names, each platform has one file of
# duplicate pairs and one of non-duplicate pairs — confirm against the data source.
# Note that the Eclipse pair is listed non-duplicate first, the other two duplicate first.

df1 = pd.read_csv('EP_nondup.csv',sep=';')
df2 = pd.read_csv('EP_dup.csv',sep=';')
df3 = pd.read_csv('M_Duplicate BRs.csv',sep=';')
df4 = pd.read_csv('M_NonDuplicate BRs.csv',sep=';')
df5 = pd.read_csv('dup_TB.csv',sep=';')
df6 = pd.read_csv('Nondup_TB.csv',sep=';')

frames_ecl = [df1,df2]
frames_moz = [df3,df4]
frames_tb = [df5,df6]

# Stack the two files of each platform into one dataframe per platform.
# ignore_index=True renumbers the rows 0..n-1. Without it both source files
# keep their own 0-based row labels, so the combined index contains duplicate
# labels and label-based alignment with other frames becomes ambiguous.
df_ecl = pd.concat(frames_ecl, ignore_index=True)
df_moz = pd.concat(frames_moz, ignore_index=True)
df_tb = pd.concat(frames_tb, ignore_index=True)

In [58]:
# Display the dataframe that was read from 'EP_dup.csv'.
df2

Unnamed: 0,Issue_id,Duplicated_issue,Title1,Description1,Title2,Description2,Label
0,25,28126,cvs ui need vcm prefs default repo connection gc,it would be helpful if there was a notion of d...,wizards patch standard public cvs repositories,this patch adds a convenient way to check thin...,1
1,40,20,need connect to team stream gcqpkw,i would like to be able to connect to a team s...,workspace files,thought it would be useful if the set of repo ...,1
2,48,22,make sure can future store other project refer...,project references come in three flavours . p...,persist sharing recommendations and project ve...,project descriptions dont store sharing recomm...,1
3,61,60,.vcmmeta showing as change gdqtgw,useruser install drop into declipse user ...,need custom .vcmignore comparemerge gdqt,useruser install drop into declipse user ...,1
4,94,2,repositories view all file types open to the t...,when browsing files in the repositories view i...,opening repository resources doesnt honor type...,opening repository resource open the default ...,1
...,...,...,...,...,...,...,...
12681,423034,287720,.metadata.log error,please see my .log file please fix this error,eclipse crashes while startup,id .. .eclipse crashes on startup . . more...,1
12682,423852,422971,workbench classcastexception handlerprocessing...,backport into .. this bug was initially crea...,workbench classcastexception handlerprocessing...,backport to .. this bug was initially create...,1
12683,423888,413977,keybindings resizing content assist proposal p...,tested on newly installed standard kepler sr o...,keybindings all nonnative key bindings stop wo...,. and latest n. . paste the following snippet...,1
12684,424120,418254,close window from context menu,context menu eclipse.id...m java.version.. ja...,editormgmt keybindings ctrle and delete causes...,after updating my kepler eclipse .. to service...,1


In [44]:
# Report (rows, columns) of each combined per-platform dataframe.
for _caption, _frame in (
    ("Eclipse dataset shape: ", df_ecl),
    ("Mozilla dataset shape: ", df_moz),
    ("Thunderbird dataset shape: ", df_tb),
):
    print(_caption, _frame.shape)

Eclipse dataset shape:  (46908, 7)
Mozilla dataset shape:  (60904, 7)
Thunderbird dataset shape:  (14263, 7)


In [45]:
# Class balance per platform: how many rows carry each value of Label.
for _frame in (df_ecl, df_moz, df_tb):
    print(_frame['Label'].value_counts())

0    34222
1    12686
Name: Label, dtype: int64
0    36833
1    24071
Name: Label, dtype: int64
0    9905
1    4358
Name: Label, dtype: int64


### Combining the Title and Description columns of each dataset so they can be vectorized together

In [46]:
# Merge the title and description of each report into one text column per
# side: BugInfo1 for the first report of a pair, BugInfo2 for the second.
# na_rep="" keeps a row usable when one of the two parts is missing; without
# it str.cat yields NaN (a float) for the whole row instead of text.
for _frame in (df_ecl, df_moz, df_tb):
    for _side in ("1", "2"):
        _frame["BugInfo" + _side] = _frame["Title" + _side].str.cat(
            _frame["Description" + _side], sep=" ", na_rep=""
        )

### Remove punctuation and stop words from the datasets before processing

In [47]:
# English stop-word list from NLTK, stored as a set so the membership test
# performed for every token in words() is a hash lookup.
stop_words = set(stopwords.words('english'))

# Translation table that deletes every character listed in string.punctuation.
# Built once here instead of testing each character of every report.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def words(text, stopword_set=None):
    """Normalise one bug-report text before vectorisation.

    Punctuation characters are deleted (not replaced by spaces, so
    "foo.bar" becomes "foobar"), the text is lower-cased and split on
    whitespace, stop words are dropped, and the remaining tokens are joined
    back together with single spaces.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (the value pandas uses for a missing
        cell) are treated as empty text instead of raising ``TypeError``.
    stopword_set : collection of str, optional
        Tokens to discard. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing remains.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words
    # NOTE(review): punctuation is stripped before the stop-word test, so any
    # stop word that itself contains punctuation (e.g. a contraction, if the
    # list has them) can no longer match — confirm this ordering is intended.
    tokens = text.translate(_PUNCTUATION_TABLE).lower().split()
    return " ".join(token for token in tokens if token not in stopword_set)

In [48]:
def preprocess(bug_reports,bugs_all):
    """Clean every text in ``bugs_all`` and append the results to ``bug_reports``.

    Each item of ``bugs_all`` is passed through ``words``; the cleaned strings
    are added to the end of ``bug_reports`` in the same order. The list is
    modified in place and nothing is returned.
    """
    bug_reports.extend(map(words, bugs_all))

In [49]:
# One output list per platform and report side; preprocess() fills each in place.
bug_pp_ecl1, bug_pp_ecl2 = [], []
bug_pp_moz1, bug_pp_moz2 = [], []
bug_pp_tb1, bug_pp_tb2 = [], []

# Pair every output list with the text column it is filled from.
for _cleaned, _raw_texts in (
    (bug_pp_ecl1, df_ecl.BugInfo1),
    (bug_pp_ecl2, df_ecl.BugInfo2),
    (bug_pp_moz1, df_moz.BugInfo1),
    (bug_pp_moz2, df_moz.BugInfo2),
    (bug_pp_tb1, df_tb.BugInfo1),
    (bug_pp_tb2, df_tb.BugInfo2),
):
    preprocess(_cleaned, _raw_texts)

In [50]:
# Wrap each list of cleaned texts in a single-column dataframe with a fresh
# 0..n-1 index, one frame per platform and report side.
df1_ecl = DataFrame({'BugReport1_ECL': bug_pp_ecl1})
df2_ecl = DataFrame({'BugReport2_ECL': bug_pp_ecl2})

df1_moz = DataFrame({'BugReport1_MOZ': bug_pp_moz1})
df2_moz = DataFrame({'BugReport2_MOZ': bug_pp_moz2})

df1_tb = DataFrame({'BugReport1_TB': bug_pp_tb1})
df2_tb = DataFrame({'BugReport2_TB': bug_pp_tb2})

### Combine bug report 1 and bug report 2 into a single column

In [52]:
# For each platform, join the cleaned text of report 1 and report 2 row by row
# (separated by one space) into a one-column frame named "BugReport12".
data_ecl = pd.DataFrame(
    {"BugReport12": df1_ecl["BugReport1_ECL"] + " " + df2_ecl["BugReport2_ECL"]}
)

data_moz = pd.DataFrame(
    {"BugReport12": df1_moz["BugReport1_MOZ"] + " " + df2_moz["BugReport2_MOZ"]}
)

data_tb = pd.DataFrame(
    {"BugReport12": df1_tb["BugReport1_TB"] + " " + df2_tb["BugReport2_TB"]}
)