In [9]:
# Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

## Data processing

In [10]:
def getDataAndLabel(df, dataColumnName, labelColumnName):
  """Return the text column and the label column of `df` as two lists of strings.

  Args:
    df: DataFrame holding the samples.
    dataColumnName: Name of the column with the input texts.
    labelColumnName: Name of the column with the class labels.

  Returns:
    Tuple `(X_train, y_train)` of equally long lists; every entry is the
    `str()` of the corresponding cell.
  """
  texts = df.loc[:, dataColumnName].to_numpy()
  labels = df.loc[:, labelColumnName].to_numpy()

  # The sample count is taken from the text column.
  count = np.size(texts)

  X_train = [str(texts[k]) for k in range(count)]
  y_train = [str(labels[k]) for k in range(count)]

  return X_train, y_train

In [11]:
def getDataClass(df, dataColumnName, labelColumnName):
  """Concatenate all texts of each class into one string per class.

  Args:
    df: DataFrame holding the samples.
    dataColumnName: Name of the column with the input texts.
    labelColumnName: Name of the column with the class labels.

  Returns:
    List of four strings in the fixed order 'B', 'D', 'I', 'P'. Each string
    holds the texts of that class in row order, every text followed by '. '.
    Rows carrying any other label are ignored.
  """
  texts = df.loc[:, dataColumnName].to_numpy()
  labels = df.loc[:, labelColumnName].to_numpy()

  class_order = ('B', 'D', 'I', 'P')
  pieces = {name: [] for name in class_order}

  for k in range(np.size(texts)):
    for name in class_order:
      if labels[k] == name:
        pieces[name].append(str(texts[k]) + '. ')

  return [''.join(pieces[name]) for name in class_order]

## Label functions

In [12]:
def predict(X_train, y_train, X_test, y_test, tfidf):
  """Fit TF-IDF features plus logistic regression on the training data and label the test data.

  Args:
    X_train: Training texts.
    y_train: Training labels.
    X_test: Texts to classify.
    y_test: Not used inside this function; part of the established signature.
    tfidf: Vectorizer object; it is fitted here on `X_train` only and then
      applied to `X_test`.

  Returns:
    The labels predicted for `X_test`.
  """
  train_features = tfidf.fit_transform(X_train)
  test_features = tfidf.transform(X_test)

  classifier = LogisticRegression(max_iter = 1000)
  classifier.fit(train_features, y_train)

  return classifier.predict(test_features)

## Evaluation on test set

In [13]:
def evalOnDataset(df_train, df_test, run_id = None):
  """Train on `df_train`, evaluate on `df_test`, print the scores and export the predictions.

  Args:
    df_train: DataFrame with 'review' and 'kano_labels' columns, used for fitting.
    df_test: DataFrame with the same columns, used for evaluation. A
      'prediction' column is added to it in place before it is written out.
    run_id: Optional value inserted into the export file name. When omitted,
      the notebook-level loop counter `i` is used if it exists (the previous
      behaviour); if it does not exist, nothing is inserted.

  Returns:
    Tuple `(acc, prec, rec, f1)`: the overall accuracy and the per-class
    precision, recall and F1 arrays from `precision_recall_fscore_support`.

  Side effects:
    Prints the four scores and writes
    "Brunotte_with_predictions_test<run_id>_logistic_regression.xlsx"
    to the current working directory.
  """
  X_train, y_train = getDataAndLabel(df_train, 'review', 'kano_labels')
  X_test, y_test = getDataAndLabel(df_test, 'review', 'kano_labels')

  # `input` only accepts 'filename', 'file' or 'content'. The list of class
  # documents that used to be passed here was never a valid value and is
  # rejected by scikit-learn releases that validate parameters, so the
  # default ('content') is used; the vectorizer is fitted inside predict().
  tfidf = TfidfVectorizer(stop_words = "english")

  y_pred = predict(X_train, y_train, X_test, y_test, tfidf)

  acc = accuracy_score(y_test, y_pred)
  prec, rec, f1, supp = precision_recall_fscore_support(y_test, y_pred)

  print("accuracy:\t", acc)
  print("precision:\t", prec)
  print("recall:\t", rec)
  print("f1 score:\t", f1)

  if run_id is None:
    # Backward compatibility: the file name used to read the global loop
    # variable `i` directly, which raised NameError outside of that loop.
    run_id = globals().get('i', '')

  df_test['prediction'] = y_pred
  df_test.to_excel("Brunotte_with_predictions_test" + str(run_id) + "_logistic_regression.xlsx")

  return acc, prec, rec, f1

## 10-fold cross-validation

In [14]:
def crossvalidation10fold(train_data):
  """Run a shuffled 10-fold cross-validation of the TF-IDF + logistic-regression model.

  Args:
    train_data: DataFrame with 'review' and 'kano_labels' columns.

  Returns:
    Tuple `(acc, prec, rec, f1)`: the accuracy averaged over the folds and the
    per-class precision, recall and F1 arrays averaged over the folds. The
    per-class arrays follow the sorted order of the labels found in
    `train_data`.
  """
  # 10 fold cross validation
  n = 10
  kf = KFold(n_splits=n, random_state = 42, shuffle = True)

  # `input` only accepts 'filename', 'file' or 'content'. The list of class
  # documents that used to be passed here was never a valid value and is
  # rejected by scikit-learn releases that validate parameters, so the
  # default ('content') is used; predict() refits the vectorizer per fold.
  tfidf = TfidfVectorizer(stop_words = "english")

  # Fix the class order up front. Without it a fold that lacks one class
  # yields shorter metric arrays and the axis-0 means below break.
  _, all_labels = getDataAndLabel(train_data, 'review', 'kano_labels')
  class_labels = sorted(set(all_labels))

  accs = []
  precs = []
  recs = []
  f1s = []

  for train_index, val_index in kf.split(train_data):
    train_df = train_data.iloc[train_index]
    val_df = train_data.iloc[val_index]

    X_train, y_train = getDataAndLabel(train_df, 'review', 'kano_labels')

    # Evaluating on the whole val_df
    X_test, y_test = getDataAndLabel(val_df, 'review', 'kano_labels')

    y_pred = predict(X_train, y_train, X_test, y_test, tfidf)

    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, sup = precision_recall_fscore_support(y_test, y_pred, labels = class_labels)

    accs.append(acc)
    precs.append(prec)
    recs.append(rec)
    f1s.append(f1)

  prec = np.mean(precs, axis = 0)
  rec = np.mean(recs, axis = 0)
  f1 = np.mean(f1s, axis = 0)
  acc = np.mean(accs)

  return acc, prec, rec, f1

# Import Data

In [15]:
# Colab only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# Score collectors are created once, before the loop, so that they accumulate
# the results of all runs. (Creating them inside the loop reset them on every
# iteration, so the "means" below only ever covered the last run.)

# Train on the downsampled training set, evaluate on the test set.
training_dataset_prec_scores = []
training_dataset_rec_scores = []
training_dataset_f1_scores = []
training_dataset_acc_scores = []

# 10-fold cross-validation on the downsampled training set
# (despite the "test_dataset" prefix of these names).
test_dataset_prec_scores = []
test_dataset_rec_scores = []
test_dataset_f1_scores = []
test_dataset_acc_scores = []

# 10-fold cross-validation on the combined set.
combined_dataset_prec_scores = []
combined_dataset_rec_scores = []
combined_dataset_f1_scores = []
combined_dataset_acc_scores = []

# The evaluation corpus is the same file for every run, so it is read only once.
test_dataset_source = pd.read_excel('/content/drive/MyDrive/KANO Modell Studie/Datasets/LabeledDatasets/Trainingskorpus_Final.xlsx')

for i in range(1,6):
  # Import of the datasets
  training_dataset = pd.read_excel('/content/drive/MyDrive/KANO Modell Studie/Colab Notebooks/downsamples_tests/test' + str(i) + '/DATASET_downsampled_test' + str(i) + '.xlsx')
  # Fresh copy per run: evalOnDataset adds a 'prediction' column in place.
  test_dataset = test_dataset_source.copy()
  combined_dataset = pd.read_excel('/content/drive/MyDrive/KANO Modell Studie/Colab Notebooks/downsamples_tests/test' + str(i) + '/DATASET_Trainingskorpus_combined_test' + str(i) + '.xlsx')

  print("======================================\
  ITERATION " + str(i) + "\
  ======================================\n")

  #Run training on training set, evaluation on test set
  print("--------------------------\
  TRAINING ON TRAINING SET, EVALUATION ON TEST SET\
  -------------------------")
  acc, prec, rec, f1 = evalOnDataset(training_dataset, test_dataset)
  training_dataset_prec_scores.append(prec)
  training_dataset_rec_scores.append(rec)
  training_dataset_f1_scores.append(f1)
  training_dataset_acc_scores.append(acc)
  print("\n\n")

  #Perform a 10-fold cross-validation on the downsampled training set
  print("--------------------------\
  CROSS-VALIDATION ON TRAINING SET\
  -------------------------")
  acc, prec, rec, f1 = crossvalidation10fold(training_dataset)
  test_dataset_prec_scores.append(prec)
  test_dataset_rec_scores.append(rec)
  test_dataset_f1_scores.append(f1)
  test_dataset_acc_scores.append(acc)
  print("\n\n")

  #Perform a 10-fold cross-validation on the combined Dataset
  print("--------------------------\
  CROSS-VALIDATION ON COMBINED SET\
  -------------------------")
  acc, prec, rec, f1 = crossvalidation10fold(combined_dataset)
  combined_dataset_prec_scores.append(prec)
  combined_dataset_rec_scores.append(rec)
  combined_dataset_f1_scores.append(f1)
  combined_dataset_acc_scores.append(acc)
  print("\n\n\n\n")
  print("\n\n\n\n")


# Average every metric over the runs; per-class arrays are averaged class-wise (axis 0).
training_dataset_prec = np.mean(training_dataset_prec_scores, axis = 0)
training_dataset_rec = np.mean(training_dataset_rec_scores, axis = 0)
training_dataset_f1 = np.mean(training_dataset_f1_scores, axis = 0)
training_dataset_acc = np.mean(training_dataset_acc_scores)

test_dataset_prec = np.mean(test_dataset_prec_scores, axis = 0)
test_dataset_rec = np.mean(test_dataset_rec_scores, axis = 0)
test_dataset_f1 = np.mean(test_dataset_f1_scores, axis = 0)
test_dataset_acc = np.mean(test_dataset_acc_scores)

combined_dataset_prec = np.mean(combined_dataset_prec_scores, axis = 0)
combined_dataset_rec = np.mean(combined_dataset_rec_scores, axis = 0)
combined_dataset_f1 = np.mean(combined_dataset_f1_scores, axis = 0)
combined_dataset_acc = np.mean(combined_dataset_acc_scores)

# Row meaning: "Training" = trained on the training set and evaluated on the
# test set; "Test" = cross-validation on the training set; the last row =
# cross-validation on the combined set.
print("Training Precision, Recall, F1, Accuracy: ", training_dataset_prec, training_dataset_rec, training_dataset_f1, training_dataset_acc)
print("Test Precision, Recall, F1, Accuracy: ", test_dataset_prec, test_dataset_rec, test_dataset_f1, test_dataset_acc)
print("Cmbined Precision, Recall, F1, Accuracy: ", combined_dataset_prec, combined_dataset_rec, combined_dataset_f1, combined_dataset_acc)


--------------------------  TRAINING ON TRAINING SET, EVALUATION ON TEST SET  -------------------------
accuracy:	 0.5869297163995068
precision:	 [0.8699284  0.17759563 0.08333333 0.40575916]
recall:	 [0.6615245  0.68421053 0.1        0.39240506]
f1 score:	 [0.75154639 0.28199566 0.09090909 0.3989704 ]



--------------------------  CROSS-VALIDATION ON TRAINING SET  -------------------------



--------------------------  CROSS-VALIDATION ON COMBINED SET  -------------------------











--------------------------  TRAINING ON TRAINING SET, EVALUATION ON TEST SET  -------------------------
accuracy:	 0.62577065351418
precision:	 [0.87485908 0.19533528 0.04651163 0.48710602]
recall:	 [0.70417423 0.70526316 0.06666667 0.43037975]
f1 score:	 [0.7802916  0.30593607 0.05479452 0.45698925]



--------------------------  CROSS-VALIDATION ON TRAINING SET  -------------------------



--------------------------  CROSS-VALIDATION ON COMBINED SET  -------------------------











-------