In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# read in data
# train model on data
# test model
# run feature importance to understand categorisation?
# run on different dataset (?) - with categories etc.
# use LDA to see if we can identify what types of features users are after?
# use sentiment analysis to understand if characteristics are as expected of the different categories
# reduce dimensions and visualise the centroids

In [3]:
# Load the full (not downsampled) Stanik review dataset for a first look.
# No index_col is passed here, so the sheet's first column is kept as an
# ordinary column (it appears as 'Unnamed: 0' in the outputs below).
df = pd.read_excel("../papers/Kano-Model-Classification/datasets/Stanik_dataset/DATASET_not_downsampled.xlsx")

In [4]:
# (number of rows, number of columns) of the loaded frame
df.shape

(6070, 4)

In [5]:
# column labels of the loaded frame
df.columns

Index(['Unnamed: 0', 'review', 'kano_labels', 'labels'], dtype='object')

In [6]:
# preview the first rows of the frame (head() shows 5 by default)
df.head()

Unnamed: 0.1,Unnamed: 0,review,kano_labels,labels
0,0,No good. It will let me make them but I can't ...,B,0
1,1,Can not edit PDF :-( I just wanna note somethi...,B,0
2,2,Not good Why i cannot clear chat history. Come...,P,3
3,3,Wish someone warned me before . . . . . . no s...,B,0
4,4,Just ok For God's sake we need a new icon and ...,B,0


In [7]:
# Sanity check: count the distinct kano_labels letters seen for each numeric
# `labels` code. A count of 1 in every group means each numeric code maps to
# a single letter.
df.groupby("labels")["kano_labels"].nunique()

labels
0    1
1    1
2    1
3    1
Name: kano_labels, dtype: int64

In [8]:
df.groupby("kano_labels").size()

# The data doesn't seem imbalanced enough to need undersampling.
# B = Basic, D = Delighter, I = Irrelevant, P = Performance
# A naive classifier labelling everything as Irrelevant (the largest class) would achieve ~40% accuracy.

kano_labels
B    1440
D     648
I    2452
P    1530
dtype: int64

In [9]:
# feature creation
# options are: one-hot encoding (binary), bag of words (counts), TF-IDF (word frequency compared to other docs)

In [42]:
# trying the embedding method: https://medium.com/@juanc.olamendy/unlocking-the-power-of-text-classification-with-embeddings-7bcbb5912790

from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

# The tokenizer and the encoder must come from the same pretrained checkpoint,
# so the name is spelled once and shared.
CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModel.from_pretrained(CHECKPOINT)


def get_embedding(review):
    """Embed a review as one fixed-size vector using the module-level BERT model.

    The text is tokenized with the module-level ``tokenizer`` and passed through
    the module-level ``model``; the last hidden layer is then averaged over the
    token axis to give a single vector per input.

    Args:
        review: The review text to embed.

    Returns:
        numpy.ndarray: The mean of ``last_hidden_state`` over ``dim=1``. The
        leading batch axis is kept, so a single string yields an array whose
        first axis has length 1.
    """
    # return_tensors="pt" asks for PyTorch tensors; truncation=True cuts overly
    # long inputs. padding=True only matters when several texts are passed at
    # once.
    # NOTE(review): the mean below is not attention-masked, so if a batch of
    # texts were ever passed, padded positions would be averaged in too.
    inputs = tokenizer(review, return_tensors="pt", padding=True, truncation=True)

    # Inference only: without no_grad() an autograd graph is built for every
    # review and immediately thrown away, costing memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    # TODO: Why mean of last layer hidden states as sentence embedding
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

def get_centroid(embeddings):
    """Return the element-wise mean of a collection of embeddings.

    Args:
        embeddings: Sequence of equally shaped arrays (or nested lists).

    Returns:
        numpy.ndarray: The average taken across the first (collection) axis, so
        the result has the shape of a single embedding.
    """
    stacked = np.asarray(embeddings)
    return stacked.mean(axis=0)

def get_nearest_label(embedding, centroid_dict):
    """Find the label whose centroid is closest to ``embedding``.

    Distance is ``np.linalg.norm`` of the difference between the embedding and
    each centroid.

    Args:
        embedding: Vector to classify.
        centroid_dict: Mapping of label -> centroid vector.

    Returns:
        pandas.Series: Two entries, the winning label and its distance. When
        the mapping is empty (or no distance compares below infinity) the
        result is ``""`` and ``inf``.
    """
    closest_label, closest_distance = "", float("inf")
    for candidate, center in centroid_dict.items():
        distance = np.linalg.norm(embedding - center)
        # Strict "less than": on a tie the earlier entry keeps the win.
        if not distance < closest_distance:
            continue
        closest_label, closest_distance = candidate, distance

    return pd.Series([closest_label, closest_distance])

def calculate_centroids(df):
    """Compute one centroid embedding per Kano label.

    Args:
        df: DataFrame with a ``review`` column (text to embed) and a
            ``kano_labels`` column (the label to group by). It is not modified.

    Returns:
        pandas.DataFrame: One row per distinct ``kano_labels`` value, with
        columns ``kano_labels``, ``embeddings`` (list of that label's review
        embeddings) and ``centroid`` (their element-wise mean, as returned by
        ``get_centroid``).
    """
    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain an "embedding" column (which would otherwise leak into anything the
    # caller later does with it, e.g. writing it to CSV).
    embedded = df.assign(embedding=df["review"].apply(get_embedding))

    # Collect the embeddings of all reviews that share a label.
    label_embeddings = (
        embedded.groupby("kano_labels")["embedding"]
        .apply(list)
        .reset_index()
        .rename(columns={"embedding": "embeddings"})
    )

    # Average each label's embeddings into a single centroid. The centroid keeps
    # whatever shape a single embedding has.
    label_embeddings["centroid"] = label_embeddings["embeddings"].apply(get_centroid)

    return label_embeddings


def classify_reviews(df, label_embeddings):
    """Label every review with the Kano label of its nearest centroid.

    Args:
        df: DataFrame with a ``review`` column. It is modified in place: the
            columns ``embedding``, ``nearest_label`` and
            ``nearest_label_distance`` are added.
        label_embeddings: DataFrame with ``kano_labels`` and ``centroid``
            columns. Each centroid is indexed with ``[0]``, so it is expected
            to carry a leading axis of length 1.

    Returns:
        pandas.DataFrame: The same ``df`` object, with the added columns.
    """
    # Build label -> centroid, dropping the centroid's leading axis.
    centroid_dict = {}
    for record in label_embeddings.to_dict(orient="records"):
        centroid_dict[record["kano_labels"]] = record["centroid"][0]

    # Embed each review.
    df["embedding"] = df["review"].apply(get_embedding)

    def _nearest(vector):
        return get_nearest_label(vector, centroid_dict)

    # get_nearest_label returns a two-item Series per row, which fills the two
    # new columns positionally.
    df[["nearest_label", "nearest_label_distance"]] = df["embedding"].apply(_nearest)

    return df


In [37]:
# Reload the dataset, this time using the sheet's first column as the index.
data = pd.read_excel("../papers/Kano-Model-Classification/datasets/Stanik_dataset/DATASET_not_downsampled.xlsx", index_col=0)

# Separate the target column from everything else.
y = data["kano_labels"]
X = data.drop(columns="kano_labels")

# 70/30 split, stratified on the Kano label, with a fixed seed for repeatability.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Re-attach the labels so each split is a single self-contained frame.
train_data = train_X.assign(kano_labels=train_y.values)
test_data = test_X.assign(kano_labels=test_y.values)

# Fit: one centroid per label from the training split, saved for later reuse.
centroid_df = calculate_centroids(train_data)
centroid_df.loc[:, ["kano_labels", "centroid"]].to_json("../data/centroids_full.json")

# Predict: nearest-centroid label for every test review, saved alongside the true label.
test_output = classify_reviews(test_data, centroid_df)
test_output.loc[
    :, ["review", "kano_labels", "nearest_label", "nearest_label_distance"]
].to_csv("../data/test_labelled.csv")


TypeError: classify_reviews() missing 2 required positional arguments: 'label_embeddings' and 'df'

In [40]:
# Persist both splits so the exact train/test partition can be inspected later.
train_data.to_csv("../data/train.csv")
test_data.to_csv("../data/test.csv")

# Classify the test split, passing only the label and centroid columns.
test_output = classify_reviews(
    test_data,
    centroid_df.loc[:, ["kano_labels", "centroid"]],
)
test_output.loc[
    :, ["review", "kano_labels", "nearest_label", "nearest_label_distance"]
].to_csv("../data/test_labelled.csv")


In [43]:
# Recall on the test split, averaged over labels weighted by each label's support.

recall_score = metrics.recall_score(
    y_true=test_data["kano_labels"],
    y_pred=test_output["nearest_label"],
    average="weighted",
)
print(recall_score)


0.6814936847885777


In [45]:
# Recall on the test split for each label separately, reported in the order B, D, I, P.

recall_score = metrics.recall_score(
    y_true=test_data["kano_labels"],
    y_pred=test_output["nearest_label"],
    average=None,
    labels=["B", "D", "I", "P"],
)
print(recall_score)

[0.78472222 0.69587629 0.71195652 0.52941176]


In [46]:
# Precision on the test split for each label separately, reported in the order B, D, I, P.

precision_score = metrics.precision_score(
    y_true=test_data["kano_labels"],
    y_pred=test_output["nearest_label"],
    average=None,
    labels=["B", "D", "I", "P"],
)
print(precision_score)

[0.76523702 0.35064935 0.87043189 0.62148338]


In [48]:
# Precision on the test split, averaged over labels weighted by each label's support.

precision_score = metrics.precision_score(
    y_true=test_data["kano_labels"],
    y_pred=test_output["nearest_label"],
    average="weighted",
)
print(precision_score)

0.7273515157176651


In [None]:
# TODO:
# Precision / recall by label
# What does each category get misclassified as - can we investigate and understand why?
# TFIDF + log regression approach + feature importance for explainability
# Look at how far apart the centroids are + cluster density / descriptive stats
# Sentiment analysis to see if there are significant differences in this across different categories?
