In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Importing modules
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


from sklearn.model_selection import StratifiedKFold

#Evaluation Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from statistics import mean
import time


In [None]:
tech_debt_df = pd.read_csv('/content/drive/My Drive/tech_debt/technical_debt_dataset.csv',header=0,encoding='latin-1')#'utf-8')

In [None]:
len(tech_debt_df)

62275

In [None]:
tech_debt_df['classification'].value_counts()

WITHOUT_CLASSIFICATION    58204
DESIGN                     2703
IMPLEMENTATION              757
DEFECT                      472
TEST                         85
DOCUMENTATION                54
Name: classification, dtype: int64

In [None]:
tech_debt_df.loc[tech_debt_df.classification == "DEFECT",'tag']=int(1)
tech_debt_df.loc[tech_debt_df.classification == "DESIGN",'tag']=int(1)
tech_debt_df.loc[tech_debt_df.classification == "DOCUMENTATION",'tag']=int(1)
tech_debt_df.loc[tech_debt_df.classification == "IMPLEMENTATION",'tag']=int(1)
tech_debt_df.loc[tech_debt_df.classification == "TEST",'tag']=int(1)
tech_debt_df.loc[tech_debt_df.classification == "WITHOUT_CLASSIFICATION",'tag']=int(0)

In [None]:
tech_debt_df['tag']=LabelEncoder().fit_transform(tech_debt_df.tag)


In [None]:
tech_debt_df['tag'].value_counts()

0    58204
1     4071
Name: tag, dtype: int64

In [None]:
index = tech_debt_df.set_index('projectname')
#print(index)

jedit_df=index.loc[['jEdit-4.2']]
argouml_df=index.loc[['argouml']]
jmeter_df=index.loc[['apache-jmeter-2.10']]
sql_df=index.loc[['sql12']]
columba_df=index.loc[['columba-1.4-src']]
jruby_df=index.loc[['jruby-1.4.0']]
jfreechart_df=index.loc[['jfreechart-1.0.19']]
emf_df=index.loc[['emf-2.4.1']]
ant_df=index.loc[['apache-ant-1.7.0']]
hibernate_df=index.loc[['hibernate-distribution-3.3.2.GA']]
consolidated=index.loc[['jEdit-4.2','argouml','apache-jmeter-2.10','sql12','columba-1.4-src','jruby-1.4.0','jfreechart-1.0.19','emf-2.4.1','apache-ant-1.7.0','hibernate-distribution-3.3.2.GA'], ['classification', 'commenttext','tag']]


In [None]:
import re
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Removes punctuation and special characters
def  clean_text(df, text_field, new_text_field_name):
    print(len(df)) 
    #removing empty rows and rows whose comment text <= 2 chars
    #nan_value = float("NaN")
    #df.replace("", nan_value, inplace=True)    
    #df[text_field] = df[text_field].apply(lambda x: nan_value if len(x) <=2 else x )     
    df.dropna(subset = [text_field], inplace=True)
    print(len(df))
    #to lower case and remove hyperslinks and multiple spaces
    df[new_text_field_name] = df[text_field].str.lower()
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"\d+", "", elem))
    
    
    return df

In [None]:
len(consolidated)

62244

In [None]:
consolidated = clean_text(consolidated, 'commenttext', 'commenttext')
consolidated['commenttext'] = consolidated['commenttext'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

62244
53809


In [None]:
consolidated.dropna(subset = ['commenttext'], inplace=True)

In [None]:
len(consolidated)

53809

In [None]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

consolidated['commenttext'] = consolidated['commenttext'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
consolidated['commenttext'] = consolidated['commenttext'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
consolidated['commenttext'] = consolidated['commenttext'].apply(lambda x: ' '.join([word for word in x.split() if len(word) < 15 ]))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
tst_rslt_dic={}
model_history = []
best_model_ind=0
val_conf_mat_tp=[]
cross_val_prec_list=[]
cross_val_rcl_list=[]
cross_val_f1_list=[]
cross_val_rocauc_list=[]
cross_val_sens_list=[]
cross_val_spec_list=[]
cross_val_gm_list=[]

def model_fitting(train_x, val_x, train_y, val_y):
  logging.info("Fitting a XGB Model...")
  scikit_log_reg = xgb.XGBClassifier()
  model=scikit_log_reg.fit(train_x,train_y)
  event_time=time.time()
  train_pred_y=model.predict(train_x)
  conf_mat = confusion_matrix(train_y, train_pred_y)
  logging.info("Cross-Validating with validation data...")
  event_time=time.time()
  val_pred_y=model.predict(val_x)
  ##Sensitivity -- Recall of +ve class (in binary classification)
  ##Specificity -- Recall of -ve class (in binary classification)
  logging.info("Cross-Validation Prediction time: %s",(time.time()-event_time))
  val_conf_mat=confusion_matrix(val_y, val_pred_y)
  logging.info("%s",val_conf_mat)
  val_conf_mat_tp.append(val_conf_mat[1][1])
  cross_val_prec_list.append(precision_score(val_y, val_pred_y, average='binary'))
  cross_val_rcl_list.append(recall_score(val_y, val_pred_y, average='binary'))
  cross_val_f1_list.append(f1_score(val_y, val_pred_y, average='binary'))
  cross_val_rocauc_list.append(roc_auc_score(val_y, val_pred_y))
  return model



In [None]:
len(Y_train_data)

49795

In [None]:
X=consolidated['commenttext']
Y=consolidated['tag']
X_train_data, X_test_data, Y_train_data, Y_test_data = train_test_split(X, Y,test_size=0.2,random_state=33)

In [None]:
from collections import Counter

tfidf_vectorizer=TfidfVectorizer(use_idf=True, max_df=0.95)
tfidf_vectorizer.fit_transform(X_train_data)
X=tfidf_vectorizer.transform(X_train_data)
X_test_data_vectorized=tfidf_vectorizer.transform(X_test_data)
fnames_bs=tfidf_vectorizer.get_feature_names()

print(Counter(Y_train_data))
print(Counter(Y_test_data))
print(X.shape)  
print(X_test_data_vectorized.shape)

#X_train_data, X_test_data, Y_train_data, Y_test_data = train_test_split(X, Y,stratify=Y, test_size=0.2)

Counter({0: 39753, 1: 3294})
Counter({0: 9986, 1: 776})
(43047, 13567)
(10762, 13567)


In [190]:
X_test_data.head(10)

projectname
jEdit-4.2                                                    create app start script
hibernate-distribution-3.3.2.GA    pojo tuplizer using dynamicmap tuplizer pojo r...
apache-ant-1.7.0                                             anything else classname
apache-jmeter-2.10                                                            nonnls
columba-1.4-src                                   get current message list selection
sql12                              strange release xml file wasnt found problem p...
sql12                                                      execute sql synchronously
columba-1.4-src                                                 nonnls nonnls nonnls
argouml                                                               todo implement
jruby-1.4.0                                                   cant support rest java
Name: commenttext, dtype: object

In [187]:
def test_stats(model,test_inp,ground_truth_inp,senti_label):
    rslt={}
    tst_proj_ky='test'
    tst_df=pd.DataFrame()
    global tst_rslt_dic,mstr_tst_df
    event_time=time.time()
    
    y_class = model.predict(test_inp)
    logging.info("-prediction time- %s seconds ---" % (time.time() - event_time))
    if(len(senti_label) > 0):
      logging.info("processing sentiment results")
      logging.info("len of y_class, type of y_class and senti_label %s,%s,%s",len(y_class),type(y_class),type(senti_label))
      senti_ml_combined_bu_df=pd.DataFrame({"y_class": y_class,"senti_label": senti_label})
      senti_ml_combined_bu_df.to_csv('/content/drive/My Drive/tech_debt/senti_ml_labels_bu.tsv', sep = '\t',encoding='latin-1')      
      for y_class_ind in range(len(y_class)):
        if y_class[y_class_ind] == 0 and senti_label[y_class_ind] == 1:
          y_class[y_class_ind]=1
        senti_ml_combined_df=pd.DataFrame({"y_class": y_class,"senti_label": senti_label})
        senti_ml_combined_df.to_csv('/content/drive/My Drive/tech_debt/senti_ml_labels.tsv', sep = '\t',encoding='latin-1')
    print("type of y_class:",type(y_class),len(y_class))
    y_pred_prob =  model.predict_proba(test_inp)[:,1]
    logging.info("-inference time- %s seconds ---" % (time.time() - event_time))
    logging.info(classification_report(ground_truth_inp, y_class)) 
    logging.info(precision_recall_fscore_support(ground_truth_inp,y_class,average='binary'))
    logging.info(confusion_matrix(ground_truth_inp, y_class))
	
    rslt["prec"]=round(precision_score(ground_truth_inp, y_class, average='binary'),3)
    rslt["rcl"]=round(recall_score(ground_truth_inp, y_class, average='binary'),3)
    rslt["f1"]=round(f1_score(ground_truth_inp, y_class,average='binary'),3)
    rslt["roc_auc"]=round(roc_auc_score(ground_truth_inp,y_class),3)

    tst_rslt_dic[tst_proj_ky]=rslt
    
    logging.info("%s :  Precision: %s ",tst_proj_ky,rslt["prec"])
    logging.info("%s :  Recall: %s ",tst_proj_ky,rslt["rcl"])
    logging.info("%s :  F1: %s ",tst_proj_ky,rslt["f1"])
    logging.info("%s :  ROC-AUC: %s ",tst_proj_ky,rslt["roc_auc"])



In [188]:
senti_flag=0
def xgb_model(senti_flag,senti_rslts):
    
    cv_prec_lst=[]
    cv_rcl_lst=[]
    cv_f1_lst=[]
    cv_roc_auc_lst=[]
    cv_spec_lst=[]
    cv_sens_lst=[]
    cv_gm_lst=[]
    
    global val_conf_mat_tp,cross_val_prec_list,cross_val_rcl_list,cross_val_f1_list,cross_val_rocauc_list,cv_rslt_dic,mstr_tst_df
    global cross_val_sens_list,cross_val_spec_list,cross_val_gm_list,model_history,best_model_ind,tst_rslt_dic,tst_rslt_mstr_dic,scale_positive_weight
    global bal_tech
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y_train_data,test_size=0.1,random_state=33)
    event_time=time.time()
    model_history.append(model_fitting(X_train, X_test, Y_train, Y_test))
    logging.info("-Training time- %s seconds ---" % (time.time() - event_time))
    best_model_ind= 0#np.argmax(val_conf_mat_tp)
    #logging.info("Best Performing Model index: %s",best_model_ind)
    logging.info("CV: Precision: %s",cross_val_prec_list[best_model_ind])
    logging.info("CV: Recall: %s",cross_val_rcl_list[best_model_ind])
    logging.info("CV: F1: %s",cross_val_f1_list[best_model_ind])
    logging.info("CV: ROC-AUC: %s",cross_val_rocauc_list[best_model_ind])
    #logging.info("Geometric Mean: %s",cross_val_gm_list[best_model_ind])
    #logging.info("Sensitivity: %s",cross_val_sens_list[best_model_ind])
    #logging.info("Specificity: %s",cross_val_spec_list[best_model_ind])
    logging.info("*********")
    if senti_flag :
      test_stats(model_history[best_model_ind],X_test_data_vectorized,Y_test_data,senti_rslts)
    else:
      test_stats(model_history[best_model_ind],X_test_data_vectorized,Y_test_data,pd.Series())
    val_conf_mat_tp=[]
    cross_val_prec_list=[]
    cross_val_rcl_list=[]
    cross_val_f1_list=[]
    cross_val_rocauc_list=[]
    cross_val_sens_list=[]
    cross_val_spec_list=[]
    cross_val_gm_list=[]
    model_history = []
    best_model_ind=0    
          
 
xgb_model(senti_flag,pd.Series())
print("******")



2020-10-28 17:31:45,004 : INFO : Fitting a XGB Model...
2020-10-28 17:31:47,961 : INFO : Cross-Validating with validation data...
2020-10-28 17:31:47,979 : INFO : Cross-Validation Prediction time: 0.0169525146484375
2020-10-28 17:31:47,985 : INFO : [[3918   26]
 [  82  279]]
2020-10-28 17:31:47,994 : INFO : -Training time- 2.9899649620056152 seconds ---
2020-10-28 17:31:47,995 : INFO : CV: Precision: 0.9147540983606557
2020-10-28 17:31:47,996 : INFO : CV: Recall: 0.7728531855955678
2020-10-28 17:31:47,997 : INFO : CV: F1: 0.8378378378378377
2020-10-28 17:31:47,998 : INFO : CV: ROC-AUC: 0.8831304467531591
2020-10-28 17:31:48,000 : INFO : *********
2020-10-28 17:31:48,031 : INFO : -prediction time- 0.02950739860534668 seconds ---
2020-10-28 17:31:48,065 : INFO : -inference time- 0.06292152404785156 seconds ---
2020-10-28 17:31:48,081 : INFO :               precision    recall  f1-score   support

           0       0.98      0.99      0.99      9986
           1       0.91      0.76     

type of y_class: <class 'numpy.ndarray'> 10762
******


In [None]:
test_df=pd.DataFrame({"test_data": X_test_data,"label": Y_test_data})
test_df.index =np.arange(1,len(test_df)+1)
test_df['id']=test_df.index + 1

#type(X_test_data)

In [None]:
test_df.shape

(10762, 3)

In [None]:
test_df.head(10)

Unnamed: 0,test_data,label,id
1,create app start script,0,2
2,pojo tuplizer using dynamicmap tuplizer pojo r...,0,3
3,anything else classname,0,4
4,nonnls,0,5
5,get current message list selection,0,6
6,strange release xml file wasnt found problem p...,0,7
7,execute sql synchronously,0,8
8,nonnls nonnls nonnls,0,9
9,todo implement,1,10
10,cant support rest java,0,11


In [None]:
test_df.head(10)

Unnamed: 0,test_data,label,id
1,create app start script,0,2
2,pojo tuplizer using dynamicmap tuplizer pojo r...,0,3
3,anything else classname,0,4
4,nonnls,0,5
5,get current message list selection,0,6
6,strange release xml file wasnt found problem p...,0,7
7,execute sql synchronously,0,8
8,nonnls nonnls nonnls,0,9
9,todo implement,1,10
10,cant support rest java,0,11


In [None]:
test_df.columns

Index(['test_data', 'label', 'id'], dtype='object')

In [None]:
test_df.rename(columns={1:'test_data',
                          2:'label',
                          3:'id'}, 
                 inplace=True)

In [None]:
len(test_df)

10762

In [None]:
 test_df.dropna(subset = ['test_data'], inplace=True)

In [None]:
len(test_df)

10762

In [None]:
test_df.to_csv('/content/drive/My Drive/tech_debt/xgboost_testdata_for_sentistrength.tsv', sep = '\t',encoding='latin-1')

In [None]:
test_senti_strength_rslts_df = pd.read_csv('/content/drive/My Drive/tech_debt/xgboost_testdata_for_sentistrength_classID.csv',sep = '\t')#,names=["ID","pos_score", "neg_score"])#'utf-8')

In [None]:
print(len(test_senti_strength_rslts_df))

10762


In [None]:
test_senti_strength_rslts_df['senti_combined_score'] =  test_senti_strength_rslts_df[['pos_score', 'neg_score']].sum(axis=1)

In [None]:
test_senti_strength_rslts_df.head(10)

Unnamed: 0,ID,pos_score,neg_score,senti_combined_score
0,2,1,-1,0
1,3,2,-1,1
2,4,1,-1,0
3,5,1,-1,0
4,6,1,-1,0
5,7,1,-2,-1
6,8,1,-2,-1
7,9,1,-1,0
8,10,1,-1,0
9,11,1,-2,-1


In [None]:
test_senti_strength_rslts_df.loc[test_senti_strength_rslts_df.senti_combined_score <= -1,'binary_sentiment']=1
test_senti_strength_rslts_df.loc[test_senti_strength_rslts_df.senti_combined_score >= 0, 'binary_sentiment']=0

In [None]:
test_senti_strength_rslts_df.head(13)

Unnamed: 0,ID,pos_score,neg_score,senti_combined_score,binary_sentiment
0,2,1,-1,0,0.0
1,3,2,-1,1,0.0
2,4,1,-1,0,0.0
3,5,1,-1,0,0.0
4,6,1,-1,0,0.0
5,7,1,-2,-1,1.0
6,8,1,-2,-1,1.0
7,9,1,-1,0,0.0
8,10,1,-1,0,0.0
9,11,1,-2,-1,1.0


In [None]:
len(test_df)

10762

In [None]:
len(test_df)

10762

In [None]:
test_df['pos_score']=test_senti_strength_rslts_df['pos_score'].values
test_df['neg_score']=test_senti_strength_rslts_df['neg_score'].values
test_df['binary_sentiment']=test_senti_strength_rslts_df['binary_sentiment'].values
#test_df['overall_score']=test_senti_strength_rslts_df['binary_result'].values

In [177]:
from sklearn.preprocessing import LabelEncoder
test_df['binary_sentiment']=LabelEncoder().fit_transform(test_df.binary_sentiment)

In [189]:
xgb_model(1,test_df['binary_sentiment'].values)

2020-10-28 17:32:05,464 : INFO : Fitting a XGB Model...
2020-10-28 17:32:08,496 : INFO : Cross-Validating with validation data...
2020-10-28 17:32:08,512 : INFO : Cross-Validation Prediction time: 0.015118837356567383
2020-10-28 17:32:08,518 : INFO : [[3918   26]
 [  82  279]]
2020-10-28 17:32:08,527 : INFO : -Training time- 3.063105344772339 seconds ---
2020-10-28 17:32:08,528 : INFO : CV: Precision: 0.9147540983606557
2020-10-28 17:32:08,530 : INFO : CV: Recall: 0.7728531855955678
2020-10-28 17:32:08,532 : INFO : CV: F1: 0.8378378378378377
2020-10-28 17:32:08,533 : INFO : CV: ROC-AUC: 0.8831304467531591
2020-10-28 17:32:08,535 : INFO : *********
2020-10-28 17:32:08,566 : INFO : -prediction time- 0.029683828353881836 seconds ---
2020-10-28 17:32:08,567 : INFO : processing sentiment results
2020-10-28 17:32:08,567 : INFO : len of y_class, type of y_class and senti_label 10762,<class 'numpy.ndarray'>,<class 'numpy.ndarray'>
2020-10-28 17:36:01,602 : INFO : -inference time- 233.065490484

type of y_class: <class 'numpy.ndarray'> 10762
