# LIBRARIES


In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
import pickle
import numpy as np
from sklearn.feature_selection import chi2, SelectKBest
import compress_fasttext

# DATASET INITIALIZATION


In [None]:
# Load the labelled sentiment dataset from the outputs directory.
SENTIMENT_DATASET_FILE_NAME = "sentiment.csv"
SENTIMENT_DATASET_FILE_PATH = f"outputs/{SENTIMENT_DATASET_FILE_NAME}"
DATA_FRAME_SENTIMENT_DATASET = pd.read_csv(
    filepath_or_buffer=SENTIMENT_DATASET_FILE_PATH,
)

# Row count of the loaded frame, reported as the number of labelled tweets.
DATA_FRAME_SENTIMENT_DATASET_LENGTH = DATA_FRAME_SENTIMENT_DATASET.shape[0]
print(f"labelled dataset: {DATA_FRAME_SENTIMENT_DATASET_LENGTH} tweets")

# Show the last row as a quick sanity check of the loaded columns.
display(DATA_FRAME_SENTIMENT_DATASET.tail(1))

# fastText


In [None]:
# Location of the compressed fastText model that supplies the word vectors.
_FT_MODEL_PATH = "models/fastText/ft_small.model"
ft_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    _FT_MODEL_PATH
)

def document_to_vector(doc, model):
    """Return the mean word vector of *doc* under *model*.

    Args:
        doc: Whitespace-separated text. Any non-string value (for example the
            NaN that pandas yields for an empty CSV cell) is treated as an
            empty document instead of raising on ``.split()``.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``, and exposing its dimensionality either as a
            ``vector_size`` attribute (gensim-style keyed vectors) or through
            a ``get_dimension()`` method (native fastText-style).

    Returns:
        The element-wise mean of the vectors of every word found in *model*,
        or a zero vector of the model's dimensionality when no word is found.
    """
    words = doc.split() if isinstance(doc, str) else []
    word_vectors = [model[word] for word in words if word in model]

    if word_vectors:
        return np.mean(word_vectors, axis=0)

    # Gensim-style keyed vectors have no get_dimension(); prefer vector_size
    # and keep get_dimension() only as a fallback for native fastText models.
    dimension = getattr(model, "vector_size", None)
    if dimension is None:
        dimension = model.get_dimension()
    return np.zeros((dimension,))

# Embed every preprocessed tweet as one averaged fastText vector, then stack
# the per-document vectors into a single array.
_document_vectors = [
    document_to_vector(text, ft_model)
    for text in DATA_FRAME_SENTIMENT_DATASET["preprocessed_text"]
]
X_fasttext = np.array(_document_vectors)

print(f"FastText embedding shape: {X_fasttext.shape}")
print(f"Number of features in X_fasttext: {X_fasttext.shape[1]}")

# Chi Square


In [None]:
# Alias for the labelled tweets; its "sentiment_label" column is the target.
sentiment = DATA_FRAME_SENTIMENT_DATASET

# Chi-Square feature selection.
# Keep at most 1000 features; when the embedding has fewer columns than that,
# k equals the column count and every feature is retained.
k_best_features = min(1000, X_fasttext.shape[1])
selector = SelectKBest(score_func=chi2, k=k_best_features)

# Absolute values are taken before scoring, presumably because chi2 expects
# non-negative input.
# NOTE(review): this drops the sign of every embedding component, and X_chi2
# below carries the absolute values forward - confirm this is intended.
X_fasttext_normalized = np.absolute(X_fasttext)

# NOTE(review): the selector is fitted on all rows, before the train/test
# split later in the file - confirm the resulting leakage is acceptable.
selector.fit(X_fasttext_normalized, sentiment["sentiment_label"])
X_chi2 = selector.transform(X_fasttext_normalized)

print(f"Shape after Chi-Square: {X_chi2.shape}")

# DATA SPLIT (90% TRAIN, 10% TEST)


In [42]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split
# the same from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_chi2,
    sentiment["sentiment_label"],
    test_size=0.1,
    random_state=21,
)

# KNN InSet


In [None]:
def train_evaluate_knn(k, X_train, Y_train, X_test, Y_test):
    """Fit a k-nearest-neighbours classifier and report its test-set quality.

    Prints accuracy plus weighted precision, recall and F1 for the test data,
    then draws the confusion matrix.

    Args:
        k: Number of neighbours passed to ``KNeighborsClassifier``.
        X_train: Training features.
        Y_train: Training labels.
        X_test: Test features.
        Y_test: Test labels.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
    predicted = classifier.predict(X_test)

    def _weighted(metric):
        # Precision, recall and F1 all use the same weighted averaging.
        return metric(Y_test, predicted, average='weighted')

    accuracy = accuracy_score(Y_test, predicted)
    precision, recall, f1 = (
        _weighted(metric) for metric in (precision_score, recall_score, f1_score)
    )

    report_lines = (
        f"| k = {k} |",
        f"| accuracy: {accuracy} |",
        f"| precision: {precision} |",
        f"| recall: {recall} |",
        f"| f1: {f1} |",
    )
    for line in report_lines:
        print(line)

    # Rows and columns follow the class order learned by the classifier.
    class_labels = classifier.classes_
    matrix = confusion_matrix(Y_test, predicted, labels=class_labels)
    ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=class_labels,
    ).plot()
    plt.title(f"confusion matrix for k={k}")
    plt.show()

    return classifier

knn1 = train_evaluate_knn(1, X_train, Y_train, X_test, Y_test)
knn3 = train_evaluate_knn(3, X_train, Y_train, X_test, Y_test)
knn5 = train_evaluate_knn(5, X_train, Y_train, X_test, Y_test)
knn7 = train_evaluate_knn(7, X_train, Y_train, X_test, Y_test)

# DUMP KNN MODEL AND CHI2 SELECTOR


In [44]:
def save_model(model, filename):
    """Pickle *model* to the file at *filename*, overwriting any existing file.

    Args:
        model: Any picklable object (for example a fitted estimator).
        filename: Destination path; its parent directory must already exist.
    """
    with open(filename, 'wb') as sink:
        # Serialise with pickle's default protocol and write the bytes out.
        sink.write(pickle.dumps(model))

# save_model(knn1, 'models/9010/knn1_9010_model.pkl')
# save_model(knn3, 'models/9010/knn3_9010_model.pkl')
# save_model(knn5, 'models/9010/knn5_9010_model.pkl')
# save_model(knn7, 'models/9010/knn7_9010_model.pkl')

# save_model(selector, 'models/chi2/selector_model.pkl')