In [12]:
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler


s = set()
def load_file(file_path, cur_data):

  with open(file_path) as json_file:
    raw_data = json.load(json_file)
    return convert_data(raw_data,cur_data)

def convert_data(raw_data,cur_data):
  data = cur_data

  for elem in raw_data:
    if elem['reviewId'] in s:
      continue
    else:
      new = ""
      if elem['title']:
        new = elem["stopwords_removal_lemmatization"]+ elem['title']
      else: 
        new = elem["stopwords_removal_lemmatization"]
      data.append([new, elem['rating'],elem["sentiScore_pos"], elem["label"]])
      s.add(elem['reviewId'])
      
  return data

def convert_label(df, labels):

  df['label_code'] = df['label']

  df = df.replace({'label_code':labels})
  return df


def get_combined_df(file_paths, labels):
  data = []
  for path in file_paths:
    data = load_file(path, data)
  
  s.clear()

  # Remove spaces and punctuation in lemma 
  df = pd.DataFrame(data, columns = ['lemma', 'rating', 'sent', 'label'])
  
  df['lemma'] = df['lemma'].str.replace("\r", " ")
  df['lemma'] = df['lemma'].str.replace("\n", " ")
  df['lemma'] = df['lemma'].str.replace("  ", " ")
  df['lemma'] = df['lemma'].str.replace('"', '')
  df['lemma'] = df['lemma'].str.lower()

  punctuation_signs = list("?:!.,;")
  for punct_sign in punctuation_signs:
    df['lemma'] = df['lemma'].str.replace(punct_sign, '')

  df =  convert_label(df,labels)
  count_class_0, count_class_1 = df.label_code.value_counts()
  df_class_0 = df[df['label_code'] == 0]
  df_class_1 = df[df['label_code'] == 1]
  df_class_0_under = df_class_0.sample(count_class_1)
  df_test_under = pd.concat([df_class_0_under,df_class_1], axis=0)

  return df_test_under


def apply_tfidf(X_train, x_test):
  # apply tfidf to lemma to create feature vector
  ngram_range = (1,2)
  max_features = 400 

  tfidf = TfidfVectorizer(encoding='utf-8',
                      ngram_range=ngram_range,
                      max_features=max_features,
                      min_df=1,
                      max_df=1.0,
                      norm='l2',
                      sublinear_tf=True)
  f_train = tfidf.fit_transform(X_train['lemma'])
  f_test = tfidf.transform(x_test['lemma'])
  feature_train = pd.DataFrame(f_train.toarray(), columns=tfidf.get_feature_names())
  feature_train['sent'] = X_train['sent']
  feature_train['rating'] = X_train['rating']

  feature_test = pd.DataFrame(f_test.toarray(), columns=tfidf.get_feature_names())
  feature_test['sent'] = X_test['sent'] 
  feature_test['rating'] = X_test['rating'] 

  feature_train.fillna(0.0,inplace=True)
  feature_test.fillna(0.0, inplace=True)

  return feature_train, feature_test 

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def train_nb_model(feature_train, y_train, feature_test, y_test):

  model = MultinomialNB()

  model.fit(feature_train, y_train)


  print("The bug testing metrics is: ")
  print("accuracy " + str(accuracy_score(y_test, model.predict(feature_test))))
  print("f1 " + str(f1_score(y_test, model.predict(feature_test))))

In [14]:
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV

def train_svm_model(feature_train, y_train, feature_test, y_test):

  C = [1,10]
  gamma = [ 1, 10]
  degree = [1, 2, 3]
  kernel = [ 'linear','poly','rbf']
  probability = [True]
  random_grid = {'C': C,
                'kernel': kernel,
                'gamma': gamma,
                'degree': degree,
                'probability': probability
               }
  svc = svm.SVC(random_state=42)
  # Definition of the random search
  random_search = RandomizedSearchCV(estimator=svc,
                                     param_distributions=random_grid,
                                     n_iter=10,
                                     scoring='balanced_accuracy',
                                     cv=3,
                                     verbose=1,
                                     random_state=42)


  random_search.fit(feature_train, y_train)
  model = random_search.best_estimator_
  print(random_search.best_params_)
  model.fit(feature_train,y_train)
  print("accuracy " + str(accuracy_score(y_test, model.predict(feature_test))))
  print("f1 " + str(f1_score(y_test, model.predict(feature_test))))

In [15]:
# bug classifier data
df = get_combined_df(["Bug_tt.json", "Feature_tt.json", "Rating_tt.json", "UserExperience_tt.json"],
                     {
                        'Bug': 1,
                        'Not_Bug': 0,
                        'Not_Feature': 0,
                        'Feature': 0,
                        'Rating': 0,
                        'Not_Rating': 0,
                        'UserExperience': 0,
                        'Not_UserExperience': 0,
                      })

X_train, X_test, y_train, y_test = train_test_split(df[['sent','lemma','rating']],
                                                    df['label_code'],
                                                    test_size=0.15,
                                                    random_state=42)
feature_train, feature_test = apply_tfidf(X_train, X_test)
print("Bug classifier")
train_svm_model(feature_train, y_train, feature_test, y_test)

Bug classifier
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   16.9s finished


{'probability': True, 'kernel': 'linear', 'gamma': 1, 'degree': 3, 'C': 1}
accuracy 0.7247706422018348
f1 0.7


In [16]:
# feature classifier data
df = get_combined_df(["Feature_tt.json", "Rating_tt.json", "Bug_tt.json", "UserExperience_tt.json"],
                     {
                        'Bug': 0,
                        'Not_Bug': 0,
                        'Not_Feature': 0,
                        'Feature': 1,
                        'Rating': 0,
                        'Not_Rating': 0,
                        'UserExperience': 0,
                        'Not_UserExperience': 0,
                      })

X_train, X_test, y_train, y_test = train_test_split(df[['sent','lemma','rating']],
                                                    df['label_code'],
                                                    test_size=0.15,
                                                    random_state=42)
feature_train, feature_test = apply_tfidf(X_train, X_test)
print("Feature classifier")
train_svm_model(feature_train, y_train, feature_test, y_test)


Feature classifier
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   11.3s finished


{'probability': True, 'kernel': 'linear', 'gamma': 1, 'degree': 3, 'C': 1}
accuracy 0.7386363636363636
f1 0.7160493827160495


In [17]:
# rating classifier data
df = get_combined_df(["Rating_tt.json", "Bug_tt.json", "Feature_tt.json","UserExperience_tt.json"],
                     {
                        'Bug': 0,
                        'Not_Bug': 0,
                        'Not_Feature': 0,
                        'Feature': 0,
                        'Rating': 1,
                        'Not_Rating': 0,
                        'UserExperience': 0,
                        'Not_UserExperience': 0,
                      })

X_train, X_test, y_train, y_test = train_test_split(df[['sent','lemma','rating']],
                                                    df['label_code'],
                                                    test_size=0.15,
                                                    random_state=42)
feature_train, feature_test = apply_tfidf(X_train, X_test)
print("Rating classifier")
train_svm_model(feature_train, y_train, feature_test, y_test)


Rating classifier
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  2.3min finished


{'probability': True, 'kernel': 'linear', 'gamma': 1, 'degree': 3, 'C': 1}
accuracy 0.5818181818181818
f1 0.5490196078431373


In [18]:
# user experience classifier data
df = get_combined_df(["UserExperience_tt.json", "Bug_tt.json", "Feature_tt.json","Rating_tt.json"],
                     {
                        'Bug': 0,
                        'Not_Bug': 0,
                        'Not_Feature': 0,
                        'Feature': 0,
                        'Rating': 0,
                        'Not_Rating': 0,
                        'UserExperience': 1,
                        'Not_UserExperience': 0,
                      })


X_train, X_test, y_train, y_test = train_test_split(df[['sent','lemma','rating']],
                                                    df['label_code'],
                                                    test_size=0.15,
                                                    random_state=42)
feature_train, feature_test = apply_tfidf(X_train, X_test)
print("UserExperience classifier")
train_svm_model(feature_train, y_train, feature_test, y_test)


UserExperience classifier
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   17.6s finished


{'probability': True, 'kernel': 'linear', 'gamma': 1, 'degree': 3, 'C': 1}
accuracy 0.7247706422018348
f1 0.736842105263158
