In [113]:
# Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, matthews_corrcoef
import pandas as pd
import numpy as np

##Data processing

In [114]:
def getDataClass(df, dataColumnName, labelColumnName):
  X = df.loc[:, dataColumnName].to_numpy()
  y = df.loc[:, labelColumnName].to_numpy()

  train_classes = ['']*4

  n = np.size(X)

  for i in range(n):
    if (y[i] == 'B'):
      train_classes[0] += str(X[i]) + '. '
    
    if (y[i] == 'D'):
      train_classes[1] += str(X[i]) + '. '
    
    if (y[i] == 'I'):
      train_classes[2] += str(X[i]) + '. '
    
    if (y[i] == 'P'):
      train_classes[3] += str(X[i]) + '. '
    
  return train_classes


In [115]:
def getDataAndLabel(df, dataColumnName, labelColumnName):

  X = df.loc[:, dataColumnName].to_numpy()
  y = df.loc[:, labelColumnName].to_numpy()

  n = np.size(X)

  X_train = ['']*n
  y_train = ['']*n

  for i in range(n):
    X_train[i] += str(X[i])
    y_train[i] += str(y[i])

  return X_train, y_train

##Label functions

In [116]:
def getTfidf(document, stopWords):

  tfidf = TfidfVectorizer(input = document, stop_words = stopWords)

  X_train_transformed = tfidf.fit_transform(document)

  terms = tfidf.get_feature_names_out()

  # Sum tfidf frequency of each term through documents
  sums = X_train_transformed.sum(axis=0)

  # Connecting term to its sums frequency
  data = []
  for col, term in enumerate(terms):
      data.append( (term, sums[0,col], col ))
  M = np.transpose(X_train_transformed.toarray())

  ranking = pd.DataFrame(data, columns=['term','rank', 'ID'])
  ID_sorted = np.transpose(ranking.sort_values('rank', ascending=False).values)[2]


  terms_sorted = []
  for i, ID in enumerate(ID_sorted):
    terms_sorted.append([terms[ID], ID])

  return tfidf, terms_sorted, M

In [117]:
def predict(X, y, terms_sorted, M):

  pred = []
  
  for review, label in zip(X, y):
    weights = [0, 0, 0, 0]
    for term, ID in terms_sorted:
      if term in review:
        weights += M[ID]
    labels = ["B", "D", "I", "P"]
    pred.append( labels[np.argmax(weights)] )

  return pred


##Evaluation on Testset

In [118]:
def evalOnDataset(df_train, df_test):
  train_classes = getDataClass(df_train, 'review', 'kano_labels')
  X_test, y_test = getDataAndLabel(df_test, 'review', 'kano_labels')

  tfidf, terms_sorted, M = getTfidf(train_classes, 'english')

  y_pred = predict(X_test, y_test, terms_sorted, M)

  acc = accuracy_score(y_test, y_pred)
  prec, rec, f1, supp = precision_recall_fscore_support(y_test, y_pred)

  print("accuracy:\t", acc)
  print("precision:\t", prec)
  print("recall:\t", rec)
  print("f1 score:\t", f1)

  df_test['prediction'] = y_pred
  df_test.to_excel("Brunotte_with_predictions_test" + str(i) + "_keyword_driven.xlsx")

  return acc, prec, rec, f1

##10-fold cross-validation

In [119]:
def crossvalidation10fold(train_data):
  # 10 fold cross validation
  n = 10
  kf = KFold(n_splits=n, random_state = 42, shuffle = True)

  accs = []
  precs = []
  recs = []
  f1s = []

  for train_index, val_index in kf.split(train_data):
    train_df = train_data.iloc[train_index]
    val_df = train_data.iloc[val_index]

    train_classes =  getDataClass(train_df, 'review', 'kano_labels')
    tfidf, terms_sorted, M = getTfidf(train_classes, 'english')
    
    # Evaluating on the whole val_df
    X_test, y_test = getDataAndLabel(val_df, 'review', 'kano_labels')
    
    y_pred = predict(X_test, y_test, terms_sorted, M)

    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, sup = precision_recall_fscore_support(y_test, y_pred)

    accs.append(acc)
    precs.append(prec)
    recs.append(rec)
    f1s.append(f1)

  prec = np.mean(precs, axis = 0)
  rec = np.mean(recs, axis = 0)
  f1 = np.mean(f1s, axis = 0)
  acc = np.mean(accs)

  return acc, prec, rec, f1

In [120]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [121]:
for i in range(1,6):
  # Import of the datasets
  training_dataset = pd.read_excel('/content/drive/MyDrive/KANO Modell Studie/Colab Notebooks/downsamples_tests/test' + str(i) + '/DATASET_downsampled_test' + str(i) + '.xlsx')
  test_dataset = pd.read_excel('/content/drive/MyDrive/KANO Modell Studie/Datasets/LabeledDatasets/Trainingskorpus_Final.xlsx')
  combined_dataset = pd.read_excel('/content/drive/MyDrive/KANO Modell Studie/Colab Notebooks/downsamples_tests/test' + str(i) + '/DATASET_Trainingskorpus_combined_test' + str(i) + '.xlsx')

  training_dataset_prec_scores = []
  training_dataset_rec_scores = []
  training_dataset_f1_scores = []
  training_dataset_acc_scores = []

  test_dataset_prec_scores = []
  test_dataset_rec_scores = []
  test_dataset_f1_scores = []
  test_dataset_acc_scores = []

  combined_dataset_prec_scores = []
  combined_dataset_rec_scores = []
  combined_dataset_f1_scores = []
  combined_dataset_acc_scores = []

  print("======================================\
  ITERATION " + str(i) + "\
  ======================================\n")

  #Run training on training set, evaluation on test set
  print("--------------------------\
  TRAINING ON TRAINING SET, EVALUATION ON TEST SET\
  -------------------------")
  acc, prec, rec, f1 = evalOnDataset(training_dataset, test_dataset)
  print(acc, prec, rec, f1)
  training_dataset_prec_scores.append(prec)
  training_dataset_rec_scores.append(rec)
  training_dataset_f1_scores.append(f1)
  training_dataset_acc_scores.append(acc)
  print("\n\n")

  #Perform a 10-fold cross-validation on the Murat Dataset
  print("--------------------------\
  CROSS-VALIDATION ON TRAINING SET\
  -------------------------")
  acc, prec, rec, f1 = crossvalidation10fold(training_dataset)
  print(acc, prec, rec, f1)
  test_dataset_prec_scores.append(prec)
  test_dataset_rec_scores.append(rec)
  test_dataset_f1_scores.append(f1)
  test_dataset_acc_scores.append(acc)
  print("\n\n")

  #Perform a 10-fold cross-validation on the combined Dataset
  print("--------------------------\
  CROSS-VALIDATION ON COMBINED SET\
  -------------------------")
  acc, prec, rec, f1 = crossvalidation10fold(combined_dataset)
  print(acc, prec, rec, f1)
  combined_dataset_prec_scores.append(prec)
  combined_dataset_rec_scores.append(rec)
  combined_dataset_f1_scores.append(f1)
  combined_dataset_acc_scores.append(acc)
  print("\n\n\n\n")
  print("\n\n\n\n")


training_dataset_prec = np.mean(training_dataset_prec_scores, axis = 0)
training_dataset_rec = np.mean(training_dataset_rec_scores, axis = 0)
training_dataset_f1 = np.mean(training_dataset_f1_scores, axis = 0)
training_dataset_acc = np.mean(training_dataset_acc_scores)

test_dataset_prec = np.mean(test_dataset_prec_scores, axis = 0)
test_dataset_rec = np.mean(test_dataset_rec_scores, axis = 0)
test_dataset_f1 = np.mean(test_dataset_f1_scores, axis = 0)
test_dataset_acc = np.mean(test_dataset_acc_scores)

combined_dataset_prec = np.mean(combined_dataset_prec_scores, axis = 0)
combined_dataset_rec = np.mean(combined_dataset_rec_scores, axis = 0)
combined_dataset_f1 = np.mean(combined_dataset_f1_scores, axis = 0)
combined_dataset_acc = np.mean(combined_dataset_acc_scores)

print("Training Precision, Recall, F1, Accuracy: ", training_dataset_prec, training_dataset_rec, training_dataset_f1, training_dataset_acc)
print("Test Precision, Recall, F1, Accuracy: ", test_dataset_prec, test_dataset_rec, test_dataset_f1, test_dataset_acc)
print("Cmbined Precision, Recall, F1, Accuracy: ", combined_dataset_prec, combined_dataset_rec, combined_dataset_f1, combined_dataset_acc)


--------------------------  TRAINING ON TRAINING SET, EVALUATION ON TEST SET  -------------------------
accuracy:	 0.5363748458692972
precision:	 [0.82713085 0.12366738 0.0625     0.42013889]
recall:	 [0.62522686 0.61052632 0.06666667 0.30632911]
f1 score:	 [0.7121447  0.20567376 0.06451613 0.35431918]
0.5363748458692972 [0.82713085 0.12366738 0.0625     0.42013889] [0.62522686 0.61052632 0.06666667 0.30632911] [0.7121447  0.20567376 0.06451613 0.35431918]



--------------------------  CROSS-VALIDATION ON TRAINING SET  -------------------------
0.5366394416394417 [0.61958058 0.44172063 0.68812349 0.46898983] [0.76937836 0.47907285 0.36076961 0.53783798] [0.68473533 0.45723294 0.47180999 0.4991242 ]



--------------------------  CROSS-VALIDATION ON COMBINED SET  -------------------------
0.4926072808623143 [0.48707215 0.45113169 0.81084876 0.45764562] [0.84926253 0.43203131 0.21720771 0.47503148] [0.61750878 0.43706718 0.33892793 0.46337745]











--------------------------  TRA