## Group 1 - Phase 2
### (Models - Logistic Regression, Random Forest Classifier, Decision Tree Classifier)

In [25]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from matplotlib import pyplot as plt 
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from string import punctuation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

from numpy import array
import warnings
warnings.filterwarnings('ignore')

import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suvijain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/suvijain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Loading datasets for Eclipse, Mozilla, Thunderbird and combining to form 3 different dataframes

In [26]:
# Loading all the given datasets in 6 dataframes

df1 = pd.read_csv('EP_nondup.csv',sep=';')
df2 = pd.read_csv('EP_dup.csv',sep=';')
df3 = pd.read_csv('M_Duplicate BRs.csv',sep=';')
df4 = pd.read_csv('M_NonDuplicate BRs.csv',sep=';')
df5 = pd.read_csv('dup_TB.csv',sep=';')
df6 = pd.read_csv('Nondup_TB.csv',sep=';')

frames_ecl = [df1,df2]
frames_moz = [df3,df4]
frames_tb = [df5,df6]

# Concatenating 2 datasets for each of the platforms to form 3 datasets
df_ecl = pd.concat(frames_ecl)
df_moz = pd.concat(frames_moz)
df_tb = pd.concat(frames_tb)

In [27]:
df2

Unnamed: 0,Issue_id,Duplicated_issue,Title1,Description1,Title2,Description2,Label
0,25,28126,cvs ui need vcm prefs default repo connection gc,it would be helpful if there was a notion of d...,wizards patch standard public cvs repositories,this patch adds a convenient way to check thin...,1
1,40,20,need connect to team stream gcqpkw,i would like to be able to connect to a team s...,workspace files,thought it would be useful if the set of repo ...,1
2,48,22,make sure can future store other project refer...,project references come in three flavours . p...,persist sharing recommendations and project ve...,project descriptions dont store sharing recomm...,1
3,61,60,.vcmmeta showing as change gdqtgw,useruser install drop into declipse user ...,need custom .vcmignore comparemerge gdqt,useruser install drop into declipse user ...,1
4,94,2,repositories view all file types open to the t...,when browsing files in the repositories view i...,opening repository resources doesnt honor type...,opening repository resource open the default ...,1
...,...,...,...,...,...,...,...
12681,423034,287720,.metadata.log error,please see my .log file please fix this error,eclipse crashes while startup,id .. .eclipse crashes on startup . . more...,1
12682,423852,422971,workbench classcastexception handlerprocessing...,backport into .. this bug was initially crea...,workbench classcastexception handlerprocessing...,backport to .. this bug was initially create...,1
12683,423888,413977,keybindings resizing content assist proposal p...,tested on newly installed standard kepler sr o...,keybindings all nonnative key bindings stop wo...,. and latest n. . paste the following snippet...,1
12684,424120,418254,close window from context menu,context menu eclipse.id...m java.version.. ja...,editormgmt keybindings ctrle and delete causes...,after updating my kepler eclipse .. to service...,1


In [28]:
print("Eclipse dataset shape: ",df_ecl.shape)
print("Mozilla dataset shape: ",df_moz.shape)
print("Thunderbird dataset shape: ",df_tb.shape)

Eclipse dataset shape:  (46908, 7)
Mozilla dataset shape:  (60904, 7)
Thunderbird dataset shape:  (14263, 7)


In [29]:
# Counting the unique values in label for the 3 datasets
print(df_ecl['Label'].value_counts())
print(df_moz['Label'].value_counts())
print(df_tb['Label'].value_counts())

0    34222
1    12686
Name: Label, dtype: int64
0    36833
1    24071
Name: Label, dtype: int64
0    9905
1    4358
Name: Label, dtype: int64


### Combining the text in Title and description column for all datasets in order to vectorize them together

In [30]:
df_ecl['Bug1'] = df_ecl['Title1'].str.cat(df_ecl['Description1'],sep=" ")
df_ecl['Bug2'] = df_ecl['Title2'].str.cat(df_ecl['Description2'],sep=" ")

df_moz['Bug1'] = df_moz['Title1'].str.cat(df_moz['Description1'],sep=" ")
df_moz['Bug2'] = df_moz['Title2'].str.cat(df_moz['Description2'],sep=" ")

df_tb['Bug1'] = df_tb['Title1'].str.cat(df_tb['Description1'],sep=" ")
df_tb['Bug2'] = df_tb['Title2'].str.cat(df_tb['Description2'],sep=" ")
df_ecl.Bug1

0        usability issue with external editors geirl se...
1        opening repository resources doesnt honor type...
2        sync does not indicate deletion gien kmpm \tth...
3        need better error message if catching up over ...
4        isharingmanager sharing api inconsistent gaulh...
                               ...                        
12681    .metadata.log error please see my .log file  p...
12682    workbench classcastexception handlerprocessing...
12683    keybindings resizing content assist proposal p...
12684    close window from context menu context menu  e...
12685    viewmgmt findviewreference always returns null...
Name: Bug1, Length: 46908, dtype: object

### Remove puncctuations and stop words from the datasets to process

In [31]:
stop_words = set(stopwords.words('english'))

def words(text):
    text = ''.join([i for i in text if i not in punctuation])
    text = text.lower()
    text = text.split()
    text = [j for j in text if not j in stop_words]
    text = " ".join(text)
    return(text)

In [32]:
def preprocess(bug_reports,bugs_all):
    for n in bugs_all:
        bug_reports.append(words(n))

In [33]:
bug_pp_ecl1 = []
bug_pp_ecl2 = []

bug_pp_moz1 = []
bug_pp_moz2 = []

bug_pp_tb1 = []
bug_pp_tb2 = []

preprocess(bug_pp_ecl1, df_ecl.Bug1)
preprocess(bug_pp_ecl2, df_ecl.Bug2)

preprocess(bug_pp_moz1, df_moz.Bug1)
preprocess(bug_pp_moz2, df_moz.Bug2)

preprocess(bug_pp_tb1, df_tb.Bug1)
preprocess(bug_pp_tb2, df_tb.Bug2)

### Define TFIDF veectorizer

In [34]:
tfidf_ecl = TfidfVectorizer(analyzer = 'word',stop_words = 'english',lowercase = True,max_features = 700,norm = 'l2')
tfidf_moz = TfidfVectorizer(analyzer = 'word',stop_words = 'english',lowercase = True,max_features = 700,norm = 'l2')
tfidf_tb = TfidfVectorizer(analyzer = 'word',stop_words = 'english',lowercase = True,max_features = 700,norm = 'l2')

In [35]:
words_ecl = pd.concat([df_ecl.Bug1,df_ecl.Bug2], axis = 0)
words_moz = pd.concat([df_moz.Bug1,df_moz.Bug2], axis = 0)
words_tb = pd.concat([df_tb.Bug1,df_tb.Bug2], axis = 0)

In [36]:
tfidf_ecl.fit(words_ecl)
tfidf_moz.fit(words_moz)
tfidf_tb.fit(words_tb)

duplicate_ecl_1 = tfidf_ecl.transform(df_ecl.Bug1)
duplicate_ecl_2 = tfidf_ecl.transform(df_ecl.Bug2)

duplicate_moz_1 = tfidf_moz.transform(df_moz.Bug1)
duplicate_moz_2 = tfidf_moz.transform(df_moz.Bug2)

duplicate_tb_1 = tfidf_tb.transform(df_tb.Bug1)
duplicate_tb_2 = tfidf_tb.transform(df_tb.Bug2)

In [37]:
x_ecl = abs(duplicate_ecl_1 - duplicate_ecl_2)
y_ecl = df_ecl['Label']

x_moz = abs(duplicate_moz_1 - duplicate_moz_2)
y_moz = df_moz['Label']

x_tb = abs(duplicate_tb_1 - duplicate_tb_2)
y_tb = df_tb['Label']

In [38]:
x_ecl_train, x_ecl_test, y_ecl_train, y_ecl_test = train_test_split(x_ecl, y_ecl, test_size=0.3)

x_moz_train, x_moz_test, y_moz_train, y_moz_test = train_test_split(x_moz, y_moz, test_size=0.3)

x_tb_train, x_tb_test, y_tb_train, y_tb_test = train_test_split(x_tb, y_tb, test_size=0.3)

## Logistic Regression

In [39]:
lr_ecl = LogisticRegression(max_iter=1000)
lr_ecl.fit(x_ecl_train, y_ecl_train)

lr_moz = LogisticRegression(max_iter=1000)
lr_moz.fit(x_moz_train, y_moz_train)

lr_tb = LogisticRegression(max_iter=1000)
lr_tb.fit(x_tb_train, y_tb_train)

LogisticRegression(max_iter=1000)

In [40]:
y_pred_ecl = lr_ecl.predict(x_ecl_test) 
y_pred_moz = lr_moz.predict(x_moz_test) 
y_pred_tb = lr_tb.predict(x_tb_test)

In [41]:
def print_evalreport(y_test,y_pred):
    confmatrix = confusion_matrix(y_test,y_pred)
    TP = confmatrix[1][1]
    TN = confmatrix[0][0]
    FP = confmatrix[1][0]
    FN = confmatrix[0][1]
    accuracy = (TP + TN)/(TP + TN + FP + FN)
    print("\nTrue Positive  : ", TP)
    print("True Negative  : ", TN)
    print("False Positive : ", FP)
    print("False Negative : ", FN)
    print("Accuracy       : ",accuracy)

print_evalreport(y_ecl_test,y_pred_ecl)
print_evalreport(y_moz_test,y_pred_moz)
print_evalreport(y_tb_test,y_pred_tb)


True Positive  :  2010
True Negative  :  9609
False Positive :  1789
False Negative :  665
Accuracy       :  0.8256235344276274

True Positive  :  5033
True Negative  :  9469
False Positive :  2202
False Negative :  1568
Accuracy       :  0.7936733800350263

True Positive  :  832
True Negative  :  2682
False Positive :  491
False Negative :  274
Accuracy       :  0.8212199111942042


### Random Forest Classifier

In [42]:
rf_ecl = RandomForestClassifier(max_depth=20)
rf_ecl.fit(x_ecl_train, y_ecl_train)

rf_moz = RandomForestClassifier(max_depth=20)
rf_moz.fit(x_moz_train, y_moz_train)

rf_tb = RandomForestClassifier(max_depth=20)
rf_tb.fit(x_tb_train, y_tb_train)

RandomForestClassifier(max_depth=20)

In [43]:
y_pred_ecl = rf_ecl.predict(x_ecl_test) 
y_pred_moz = rf_moz.predict(x_moz_test) 
y_pred_tb = rf_tb.predict(x_tb_test)

In [44]:
print_evalreport(y_ecl_test,y_pred_ecl)
print_evalreport(y_moz_test,y_pred_moz)
print_evalreport(y_tb_test,y_pred_tb)


True Positive  :  335
True Negative  :  10240
False Positive :  3464
False Negative :  34
Accuracy       :  0.751438925602217

True Positive  :  3114
True Negative  :  10318
False Positive :  4121
False Negative :  719
Accuracy       :  0.7351138353765324

True Positive  :  289
True Negative  :  2934
False Positive :  1034
False Negative :  22
Accuracy       :  0.7532133676092545


### Decision Tree Classifier

In [45]:
dt_ecl = tree.DecisionTreeClassifier()
dt_ecl = dt_ecl.fit(x_ecl_train, y_ecl_train)

dt_moz = tree.DecisionTreeClassifier()
dt_moz = dt_moz.fit(x_moz_train, y_moz_train)

dt_tb = tree.DecisionTreeClassifier()
dt_tb = dt_tb.fit(x_tb_train, y_tb_train)

In [46]:
y_pred_ecl = dt_ecl.predict(x_ecl_test) 
y_pred_moz = dt_moz.predict(x_moz_test) 
y_pred_tb = dt_tb.predict(x_tb_test)

In [47]:
print_evalreport(y_ecl_test,y_pred_ecl)
print_evalreport(y_moz_test,y_pred_moz)
print_evalreport(y_tb_test,y_pred_tb)


True Positive  :  1723
True Negative  :  8087
False Positive :  2076
False Negative :  2187
Accuracy       :  0.6970795139629077

True Positive  :  4188
True Negative  :  7803
False Positive :  3047
False Negative :  3234
Accuracy       :  0.65625

True Positive  :  668
True Negative  :  2186
False Positive :  655
False Negative :  770
Accuracy       :  0.6669782659499883
