In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in later cells — confirm that is intended.
warnings.filterwarnings("ignore")

# TEST

In [3]:
# Read the ';'-separated test split (the file has no header row) and label its
# two columns: the text ('Deskripsi') and the emotion ('Emosi').
test = pd.read_csv('test.txt', delimiter=";", header=None).set_axis(
    ['Deskripsi', 'Emosi'], axis=1
)
test.head()

Unnamed: 0,Deskripsi,Emosi
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


# TRAIN

In [4]:
# Read the ';'-separated training split (the file has no header row) and label
# its two columns: the text ('Deskripsi') and the emotion ('Emosi').
train = pd.read_csv('train.txt', delimiter=";", header=None).set_axis(
    ['Deskripsi', 'Emosi'], axis=1
)
train.head()

Unnamed: 0,Deskripsi,Emosi
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


# VAL

In [5]:
# Read the ';'-separated validation split (the file has no header row) and label
# its two columns: the text ('Deskripsi') and the emotion ('Emosi').
val = pd.read_csv('val.txt', delimiter=";", header=None).set_axis(
    ['Deskripsi', 'Emosi'], axis=1
)
val.head()

Unnamed: 0,Deskripsi,Emosi
0,im feeling quite sad and sorry for myself but ...,sadness
1,i feel like i am still looking at a blank canv...,sadness
2,i feel like a faithful servant,love
3,i am just feeling cranky and blue,anger
4,i can have for a treat or if i am feeling festive,joy


# Data Cleaning


In [6]:
# Work on a copy so the cleaning steps below do not modify the loaded `train` frame.
df_train_clean = train.copy()

## Clear duplicates

In [7]:
# Keep only the first row for each distinct text, then summarise what is left.
df_train_clean = df_train_clean.drop_duplicates(subset='Deskripsi', keep='first')
df_train_clean.describe()

Unnamed: 0,Deskripsi,Emosi
count,15969,15969
unique,15969,6
top,i didnt feel humiliated,joy
freq,1,5350


Jumlah dari "count" dan "unique" sudah seimbang, yang artinya tidak ada duplikat antara deskripsi satu dengan deskripsi yang lainnya.

## Stopwords

In [8]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def define_stopwords(negations=True):
    """Build the stop-word list used to clean the text column.

    The list is NLTK's English stop words followed by a few corpus-specific
    extras (forms of "feel" and apostrophe-less contractions). The extra list
    is kept short because TF-IDF is applied afterwards.

    Args:
        negations: When True, common negation words are taken out of the
            returned list so that they survive stop-word removal. When False,
            the full list is returned.

    Returns:
        list: Stop words in their original order; repeated entries are kept.
    """
    extra_words = [
        'feel', 'feeling', 'feelings', 'feels', 'felt',
        'im', 'id', 'ive',
        'dont', 'couldnt', 'cant', 'neednt', 'havent', 'isnt', 'didnt', 'wont',
        'hasnt', 'wasnt', 'wont', 'shant', 'hadnt', 'arent', 'werent', 'wouldnt', 'would',
    ]
    all_words = stopwords.words('english') + extra_words
    if not negations:
        return all_words

    # Words listed here are NOT treated as stop words, so they stay in the text.
    keep_in_text = {
        'no', 'not', 'dont', 'don', 'isnt', 'isn', 'arent', 'aren', 'didnt', 'didn',
        'cant', 'wouldnt', 'wouldn', 'weren', 'werent', 'wasn', 'wasnt', 'wont',
    }
    return [candidate for candidate in all_words if candidate not in keep_in_text]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ryzen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Build the stop-word set once. Calling define_stopwords() inside the
# comprehension rebuilt the whole list for every single word of every row.
stopwords_keep_negations = set(define_stopwords(negations=True))

# Variant of the training data in which negation words are kept in the text.
df_train_clean_negations = df_train_clean.copy()
df_train_clean_negations['Deskripsi'] = df_train_clean['Deskripsi'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stopwords_keep_negations)
)
df_train_clean_negations['Deskripsi'].head()

0                                     didnt humiliated
1    go hopeless damned hopeful around someone care...
2                    grabbing minute post greedy wrong
3         ever nostalgic fireplace know still property
4                                              grouchy
Name: Deskripsi, dtype: object

In [10]:
# Build the stop-word set once. Calling define_stopwords() inside the
# comprehension rebuilt the whole list for every single word of every row.
stopwords_all = set(define_stopwords(negations=False))

# Variant of the training data with every stop word (negations included) removed.
df_train_clean['Deskripsi'] = df_train_clean['Deskripsi'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stopwords_all)
)
df_train_clean['Deskripsi'].head()

0                                           humiliated
1    go hopeless damned hopeful around someone care...
2                    grabbing minute post greedy wrong
3         ever nostalgic fireplace know still property
4                                              grouchy
Name: Deskripsi, dtype: object

Kata pada deskripsi sudah bersih dari kata negasi seperti "don't", "not", dll.

## Stemming

In [11]:
from nltk.stem import PorterStemmer

# Module-level Porter stemmer instance used by words_stemmer.
stemmer = PorterStemmer()
def words_stemmer(to_be_stemmed):
    """Stem every word and return the stems joined by single spaces.

    Args:
        to_be_stemmed: Iterable of words.

    Returns:
        str: The stemmed words in their original order, space separated
        (an empty string when no words are given).
    """
    return ' '.join(stemmer.stem(token) for token in to_be_stemmed)

In [12]:
# Replace each negation-preserving description with its stemmed form.
df_train_clean_negations['Deskripsi'] = df_train_clean_negations['Deskripsi'].map(
    lambda text: words_stemmer(text.split())
)
df_train_clean_negations['Deskripsi'].head()

0                                     didnt humili
1    go hopeless damn hope around someon care awak
2                     grab minut post greedi wrong
3        ever nostalg fireplac know still properti
4                                          grouchi
Name: Deskripsi, dtype: object

In [13]:
# Replace each fully cleaned description with its stemmed form.
df_train_clean['Deskripsi'] = df_train_clean['Deskripsi'].map(
    lambda text: words_stemmer(text.split())
)
df_train_clean['Deskripsi'].head()

0                                           humili
1    go hopeless damn hope around someon care awak
2                     grab minut post greedi wrong
3        ever nostalg fireplac know still properti
4                                          grouchi
Name: Deskripsi, dtype: object

Kata yang terdapat imbuhan sudah menjadi kata dasar.

## Visualization after cleaning

menggunakan dua jenis data yang sudah dibersihkan, yakni:

1. Data bersih 100%

2. Data bersih tetapi masih ada kata negasi

pada tiap-tiap emosi sudah lebih mudah untuk diidentifikasi mana kata penting yang mendominasi

Kata "like" bisa digunakan sebagai kata pengisi atau kata hubung dalam kalimat yang tidak memiliki makna emosional khusus. Dalam hal ini, kemunculannya yang tinggi tidak memiliki arti khusus terkait dengan emosi yang diungkapkan. Contoh: "She looks like she's in a hurry"
Kata "like" juga bisa digunakan untuk menunjukkan perasaan positif atau bahkan sangat positif terhadap sesuatu. Dalam hal ini, kemunculannya yang tinggi bisa menunjukkan bahwa para penulis deskripsi dalam dataset tersebut sering merasa senang, terkesan, atau bahagia terhadap suatu hal yang mereka deskripsikan. contoh: "I really like spending time with my family"


Kata "like" juga bisa digunakan dalam konteks yang lebih negatif, seperti saat seseorang merasa tidak suka atau tidak senang terhadap sesuatu. Namun, hal ini mungkin tidak menjadi interpretasi utama jika kata "like" dominan di semua emosi yang diungkapkan dalam dataset tersebut. Contoh: "I don't like the way he talks to me sometimes"

# Feature Extraction
Feature extraction dalam NLP (Natural Language Processing) adalah proses merubah data text menjadi *feature* numerik yang nanti bisa diinputkan ke dalam *machine learning algorithm*

## TF-IDF
Dalam *information retrieval* (pengambilan informasi yang penting), tf–idf, kependekan dari term frequency–inverse document frequency, digunakan untuk menghitung seberapa penting sebuah kata dalam sebuah dokumen di kumpulan dokumen atau yang disebut dengan corpus. tf-idf juga digunakan sebagai *weighting factor* (koefisien yang diikatkan terhadap sebuah *feature* atau karakteristik atau atribut atau variabel input untuk mengkategorikan level kepentingan atau pengaruh) untuk *information retrieval, text mining, user modeling*, dll. Nilai tf-idf meningkat secara proporsional dengan jumlah kemunculan kata dalam dokumen, lalu dikoreksi dengan jumlah dokumen yang memuat kata tersebut; koreksi ini membantu menyesuaikan bobot kata yang memang sering muncul di hampir setiap dokumen, seperti kata untuk keperluan grammar.

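Rumus TF-IDF adalah perkalian dari TF dan IDF: $\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$.
Untuk TF (Term Frequency), rumusnya adalah $\text{tf}(t, d) = \dfrac{\text{jumlah kemunculan term } t \text{ dalam dokumen } d}{\text{jumlah seluruh kata dalam dokumen } d}$,
dan rumus IDF adalah $\text{idf}(t) = \log\left(\dfrac{\text{jumlah dokumen}}{\text{jumlah dokumen yang memuat term } t}\right)$.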

In [14]:
# Training text: one variant keeps negation words, the other has them removed.
x_train_negations = df_train_clean_negations['Deskripsi'] 
x_train_clean = df_train_clean['Deskripsi']

# NOTE(review): validation/test text is taken straight from the loaded frames —
# no stop-word removal or stemming has been applied to it at this point.
x_val = val['Deskripsi']
x_test = test['Deskripsi']

# Training labels, taken from the de-duplicated training frame.
y_train_clean = df_train_clean['Emosi'] 

y_val = val['Emosi']
y_test = test['Emosi']

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer 

def _clean_like_train(texts, negations):
    """Apply the training-text cleaning (stop-word removal, then stemming) to `texts`.

    Args:
        texts: pandas Series of raw sentences.
        negations: Passed to define_stopwords(); True keeps negation words in the text.

    Returns:
        pandas Series of cleaned, stemmed sentences.
    """
    stop_words = set(define_stopwords(negations=negations))
    return texts.apply(
        lambda text: words_stemmer([word for word in text.split() if word not in stop_words])
    )


# TF-IDF objects: one vectorizer per training-text variant.
vec_clean = TfidfVectorizer()
vec_negations = TfidfVectorizer()

# Train: the vocabulary and IDF weights are learned from the training text only.
x_train_vec_clean = vec_clean.fit_transform(x_train_clean)
x_train_vec_negations = vec_negations.fit_transform(x_train_negations)

# Val: the vectorizers were fitted on cleaned, stemmed text, so the validation
# text must go through the same cleaning; otherwise inflected words never match
# the stemmed vocabulary and are silently dropped.
x_val_vec_clean = vec_clean.transform(_clean_like_train(x_val, negations=False))
x_val_vec_negations = vec_negations.transform(_clean_like_train(x_val, negations=True))

# Test: same cleaning as validation.
x_test_vec_clean = vec_clean.transform(_clean_like_train(x_test, negations=False))
x_test_vec_negations = vec_negations.transform(_clean_like_train(x_test, negations=True))

# Sampling

## Oversampling(ADASYN)

In [16]:
%pip install imblearn==0.0

Note: you may need to restart the kernel to use updated packages.


In [17]:
from collections import Counter
from imblearn.over_sampling import ADASYN

# x_train_vec_negations holds the features and y_train_clean the labels.
adasyn_negations = ADASYN(random_state=42)
resampled_adasyn_negations = adasyn_negations.fit_resample(x_train_vec_negations, y_train_clean)
X_train_adasyn_negations, y_train_adasyn_negations = resampled_adasyn_negations

# Count the samples per class after resampling.
print('Resampled dataset shape %s' % (Counter(y_train_adasyn_negations),))

Resampled dataset shape Counter({'anger': 5577, 'fear': 5568, 'love': 5399, 'surprise': 5387, 'joy': 5350, 'sadness': 4786})


In [18]:
def scatter_plot(data, xlabel, ylabel,title):
    """Draw a bar chart of how often each distinct value occurs in `data`.

    Despite its name, this function produces a seaborn bar plot.

    Args:
        data: Iterable of hashable values to count.
        xlabel: Label of the x axis; also used as the category column name.
        ylabel: Label of the y axis; also used as the count column name.
        title: Title of the figure.
    """
    categories, frequencies = zip(*Counter(data).items())
    counts_frame = pd.DataFrame({xlabel: categories, ylabel: frequencies})

    # barplot draws on the current axes and returns them, so labelling the
    # returned axes is the same as labelling through pyplot.
    axes = sns.barplot(x=xlabel, y=ylabel, data=counts_frame)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    plt.show()

In [19]:
# x_train_vec_clean holds the features and y_train_clean the labels.
adasyn_clean = ADASYN(random_state=42)
resampled_adasyn_clean = adasyn_clean.fit_resample(x_train_vec_clean, y_train_clean)
X_train_adasyn_clean, y_train_adasyn_clean = resampled_adasyn_clean

# Count the samples per class after resampling.
print('Resampled dataset shape %s' % (Counter(y_train_adasyn_clean),))

Resampled dataset shape Counter({'fear': 5486, 'anger': 5471, 'joy': 5350, 'surprise': 5349, 'love': 5290, 'sadness': 4789})


## NCR

In [20]:
from imblearn.under_sampling import NeighbourhoodCleaningRule
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers before resampling.
le_y = LabelEncoder()
le_y.fit(y_train_clean)
y_train_clean_encoded = le_y.transform(y_train_clean)

# Resample the negation-preserving features with the Neighbourhood Cleaning Rule.
ncr_negations = NeighbourhoodCleaningRule(n_jobs=-1)
resampled_ncr_negations = ncr_negations.fit_resample(x_train_vec_negations, y_train_clean_encoded)
X_train_ncr_negations, y_train_ncr_negations = resampled_ncr_negations

print('Resampled dataset shape %s' % (Counter(y_train_ncr_negations),))

Resampled dataset shape Counter({2: 4172, 4: 3644, 0: 1644, 1: 1378, 3: 595, 5: 568})


In [21]:
# Map the integer class ids back to the original emotion labels.
# NOTE(review): the variable is overwritten with its own decoded form, so this
# cell is presumably not safe to run twice in a row — confirm.
y_train_ncr_negations = le_y.inverse_transform(y_train_ncr_negations)
# scatter_plot(y_train_ncr_negations,"Emosi","Jumlah","NCR dataset with negations")

In [22]:
# Encode the string labels as integers before resampling.
le_y = LabelEncoder()
le_y.fit(y_train_clean)
y_train_clean_encoded = le_y.transform(y_train_clean)

# Resample the fully cleaned features with the Neighbourhood Cleaning Rule.
ncr_clean = NeighbourhoodCleaningRule(n_jobs=-1)
resampled_ncr_clean = ncr_clean.fit_resample(x_train_vec_clean, y_train_clean_encoded)
X_train_ncr_clean, y_train_ncr_clean = resampled_ncr_clean

print('Resampled dataset shape %s' % (Counter(y_train_ncr_clean),))

Resampled dataset shape Counter({2: 4218, 4: 3660, 0: 1654, 1: 1390, 3: 623, 5: 568})


In [23]:
# Map the integer class ids back to the original emotion labels.
# NOTE(review): the variable is overwritten with its own decoded form, so this
# cell is presumably not safe to run twice in a row — confirm.
y_train_ncr_clean = le_y.inverse_transform(y_train_ncr_clean)
# scatter_plot(y_train_ncr_clean,"Emosi","Jumlah","NCR dataset without negations")

## NearMiss Undersampling

In [24]:
from imblearn.under_sampling import NearMiss
# Undersample the negation-preserving features with NearMiss.
nm = NearMiss(n_jobs=-1)
resampled_nm_negations = nm.fit_resample(x_train_vec_negations, y_train_clean)
X_train_nm_negations, y_train_nm_negations = resampled_nm_negations

# Count the samples per class after resampling.
print('Resampled dataset shape %s' % (Counter(y_train_nm_negations),))

Resampled dataset shape Counter({'anger': 568, 'fear': 568, 'joy': 568, 'love': 568, 'sadness': 568, 'surprise': 568})


In [25]:
from imblearn.under_sampling import NearMiss
# Undersample the fully cleaned features with NearMiss.
nm = NearMiss(n_jobs=-1)
resampled_nm_clean = nm.fit_resample(x_train_vec_clean, y_train_clean)
X_train_nm_clean, y_train_nm_clean = resampled_nm_clean

# Count the samples per class after resampling.
print('Resampled dataset shape %s' % (Counter(y_train_nm_clean),))

Resampled dataset shape Counter({'anger': 568, 'fear': 568, 'joy': 568, 'love': 568, 'sadness': 568, 'surprise': 568})


In [26]:
# scatter_plot(y_train_nm_clean, "Emosi", "Jumlah", "NearMiss undersampled dataset without negations")

In [27]:
# pca_2d_viz(X_train_nm_clean, y_train_nm_clean)

In [28]:
# pca_3d_viz(X_train_nm_clean, y_train_nm_clean)

# Machine Learning Algorithm

## SVM
In machine learning, support vector machines (SVMs, also support vector networks) are supervised learning models with associated learning algorithms that analyze data for classification and regression analysis. Given a set of training examples, each marked as belonging to one of two categories, an SVM training algorithm builds a model that assigns new examples to one category or the other, making it a non-probabilistic binary linear classifier (although methods such as Platt scaling exist to use SVM in a probabilistic classification setting). SVM maps training examples to points in space so as to maximise the width of the gap between the two categories. New examples are then mapped into that same space and predicted to belong to a category based on which side of the gap they fall.

In [29]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

class SVMModel:
    """Thin wrapper around scikit-learn's SVC with a fit/predict/evaluate API.

    Every constructor argument is stored as an attribute of the same name and
    forwarded to SVC; the estimator itself is available as `model`.
    """

    def __init__(self, kernel='linear', C=1.0, gamma='auto', degree=3, class_weight=None, random_state=None):
        self.kernel = kernel
        self.C = C
        # gamma used to be forwarded to SVC without being stored, unlike the
        # other hyperparameters; keep it so the configuration can be inspected.
        self.gamma = gamma
        self.degree = degree
        self.class_weight = class_weight
        self.random_state = random_state
        self.model = SVC(kernel=self.kernel, C=self.C, gamma=self.gamma, degree=self.degree, class_weight=self.class_weight, random_state=self.random_state)

    def fit(self, X_train, y_train):
        """Fit the underlying SVC on the given features and labels."""
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the labels predicted by the underlying SVC for `X_test`."""
        return self.model.predict(X_test)

    def evaluate(self, X_test, y_test):
        """Predict on `X_test` and return the classification report as a dict."""
        y_pred = self.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        return report

## Naive Bayes
In statistics, naive Bayes classifiers are a family of simple "probabilistic classifiers" based on applying Bayes' theorem with strong (naive) independence assumptions between the features (see Bayes classifier). They are among the simplest Bayesian network models, but coupled with kernel density estimation, they can achieve high accuracy levels.

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

class NaiveBayes:
    """Thin wrapper around scikit-learn's MultinomialNB with a fit/predict/evaluate API."""

    def __init__(self, alpha=1.0):
        # The smoothing parameter is stored and forwarded to the estimator.
        self.alpha = alpha
        self.model = MultinomialNB(alpha=alpha)

    def fit(self, X_train, y_train):
        """Fit the underlying estimator on the given features and labels."""
        estimator = self.model
        estimator.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the labels predicted by the underlying estimator for `X_test`."""
        predictions = self.model.predict(X_test)
        return predictions

    def evaluate(self, X_test, y_test):
        """Predict on `X_test` and return the classification report as a dict."""
        return classification_report(y_test, self.predict(X_test), output_dict=True)

## Random Forest
Random forests or random decision forests is an ensemble learning method for classification, regression and other tasks that operates by constructing a multitude of decision trees at training time. For classification tasks, the output of the random forest is the class selected by most trees. For regression tasks, the mean or average prediction of the individual trees is returned. Random decision forests correct for decision trees' habit of overfitting to their training set. Random forests generally outperform decision trees, but their accuracy is lower than gradient boosted trees. However, data characteristics can affect their performance.

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

class RandomForest:
    """Thin wrapper around scikit-learn's RandomForestClassifier.

    The fitted estimator is available as `clf`.
    """

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', random_state=None):
        # 'auto' was deprecated for RandomForestClassifier and is rejected by
        # newer scikit-learn releases. For classifiers it meant 'sqrt', so it is
        # translated here to keep the default working on old and new versions.
        if max_features == 'auto':
            max_features = 'sqrt'
        self.clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=random_state)

    def fit(self, X_train, y_train):
        """Fit the underlying forest on the given features and labels."""
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the labels predicted for `X_test`."""
        return self.clf.predict(X_test)

    def predict_proba(self, X_test):
        """Return the class probabilities predicted for `X_test`."""
        return self.clf.predict_proba(X_test)

    def get_feature_importances(self, feature_names):
        """Return (feature name, importance) pairs sorted by decreasing importance.

        Args:
            feature_names: Sequence indexed like the columns the forest was fitted on.
        """
        importances = self.clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        feature_importances = [(feature_names[i], importances[i]) for i in indices]
        return feature_importances

    def evaluate(self, X_test, y_test):
        """Predict on `X_test` and return the classification report as a dict."""
        y_pred = self.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        return report

## K-NN
In statistics, the k-nearest neighbors algorithm (k-NN) is a non-parametric supervised learning method first developed by Evelyn Fix and Joseph Hodges in 1951, and later expanded by Thomas Cover. It is used for classification and regression. In both cases, the input consists of the k closest training examples in a data set. The output depends on whether k-NN is used for classification or regression: in classification, the output is the class most common among the k nearest neighbors; in regression, it is the average of their values.

In [32]:
from sklearn.neighbors import KNeighborsClassifier
class KNN:
    """Thin wrapper around scikit-learn's KNeighborsClassifier with a fit/predict/evaluate API."""

    def __init__(self, k=5, metric='minkowski', p=2, weights='uniform'):
        # `k` is passed to the estimator as `n_neighbors`.
        settings = {'n_neighbors': k, 'metric': metric, 'p': p, 'weights': weights}
        self.model = KNeighborsClassifier(**settings)

    def fit(self, X_train, y_train):
        """Fit the underlying estimator on the given features and labels."""
        estimator = self.model
        estimator.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the labels predicted by the underlying estimator for `X_test`."""
        predictions = self.model.predict(X_test)
        return predictions

    def evaluate(self, X_test, y_test):
        """Predict on `X_test` and return the classification report as a dict."""
        return classification_report(y_test, self.predict(X_test), output_dict=True)

# Hyperparameter Tuning
Hyperparameter tuning adalah proses memilih kombinasi hyperparameter terbaik untuk sebuah model machine learning agar mendapatkan performa terbaik pada suatu dataset.
Hyperparameter adalah parameter dari model yang tidak dipelajari selama training, tapi ditentukan sebelum proses training.

## SVM Hyperparameter Tuning

In [33]:
%pip install optuna==3.1.0

Collecting optuna==3.1.0
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
     ---------------------------------------- 0.0/365.3 kB ? eta -:--:--
     --- ------------------------------------ 30.7/365.3 kB ? eta -:--:--
     ----------- -------------------------- 112.6/365.3 kB 1.1 MB/s eta 0:00:01
     ------------------ ------------------- 174.1/365.3 kB 1.3 MB/s eta 0:00:01
     --------------------- ---------------- 204.8/365.3 kB 1.0 MB/s eta 0:00:01
     ----------------------------- -------- 286.7/365.3 kB 1.2 MB/s eta 0:00:01
     -------------------------------------- 365.3/365.3 kB 1.3 MB/s eta 0:00:00
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
     ---------------------------------------- 0.0/212.2 kB ? eta -:--:--
     ------------- ------------------------- 71.7/212.2 kB 2.0 MB/s eta 0:00:01
     ---------------------------------- --- 194.6/212.2 kB 2.4 MB/s eta 0:00:01
     -------------------------------------- 212.2/212.2 kB 2.

In [36]:
import optuna
from sklearn.metrics import precision_score

def objective(trial, x_train, y_train, x_val, y_val):
    """Optuna objective: macro-averaged precision of an SVM on the validation set.

    C (log scale) and the kernel are sampled for every trial; gamma is sampled
    only for the non-linear kernels and is fixed to "scale" for the linear one.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x_train, y_train: Training features and labels.
        x_val, y_val: Validation features and labels.

    Returns:
        The macro-averaged precision on the validation set.
    """
    c_value = trial.suggest_float("C", 1e-4, 10, log=True)
    kernel_name = trial.suggest_categorical("kernel", ["linear", "rbf", "poly", "sigmoid"])
    if kernel_name in ["rbf", "poly", "sigmoid"]:
        gamma_mode = trial.suggest_categorical("gamma", ["scale", "auto"])
    else:
        gamma_mode = "scale"

    classifier = SVMModel(C=c_value, kernel=kernel_name, gamma=gamma_mode)
    classifier.fit(x_train, y_train)
    return precision_score(y_val, classifier.predict(x_val), average='macro')

# Each entry: (train features, train labels, validation features, validation labels, name).
datasets = [
    (X_train_adasyn_negations, y_train_adasyn_negations, x_val_vec_negations, y_val, "ADASYN Negations"),
    (X_train_adasyn_clean, y_train_adasyn_clean, x_val_vec_clean, y_val, "ADASYN Clean"),
    (x_train_vec_negations, y_train_clean, x_val_vec_negations, y_val, "Original Negations"),
    (x_train_vec_clean, y_train_clean, x_val_vec_clean, y_val, "Original Clean"),
    (X_train_ncr_negations, y_train_ncr_negations, x_val_vec_negations, y_val, "NCR Negations"),
    (X_train_ncr_clean, y_train_ncr_clean, x_val_vec_clean, y_val, "NCR Clean"),
    (X_train_nm_negations, y_train_nm_negations, x_val_vec_negations, y_val, "NearMiss Negations"),
    (X_train_nm_clean, y_train_nm_clean, x_val_vec_clean, y_val, "NearMiss Clean")
]

best_hyperparams = {}
# The loop variables carry a `ds_` prefix so that they do not overwrite the
# notebook-level x_val / y_val (the raw validation text and labels).
for ds_x_train, ds_y_train, ds_x_val, ds_y_val, dataset_name in datasets:
    # A seeded sampler makes the search reproducible across runs.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(
        lambda trial: objective(trial, ds_x_train, ds_y_train, ds_x_val, ds_y_val),
        n_trials=100,
    )
    best_hyperparams[dataset_name] = study.best_params
    print(f"Best hyperparameters for {dataset_name}: {best_hyperparams[dataset_name]}")


[32m[I 2023-03-14 20:51:29,005][0m A new study created in memory with name: no-name-63f6daae-af42-4357-8dbb-c0bce41c300c[0m


In [None]:
import pickle

# Train the models with the best hyperparameters and save them
for dataset_name, hyperparams in best_hyperparams.items():
    ds_x_train, ds_y_train, _, _, _ = next(filter(lambda d: d[4] == dataset_name, datasets))
    # "gamma" is only sampled for non-linear kernels, so it is missing from the
    # best parameters when the linear kernel wins; fall back to the value the
    # objective uses in that case.
    svm_model = SVMModel(
        C=hyperparams['C'],
        kernel=hyperparams['kernel'],
        gamma=hyperparams.get('gamma', 'scale'),
    )
    svm_model.fit(ds_x_train, ds_y_train)
    # Evaluate on the TF-IDF test matrix of the matching text variant; the raw
    # text in x_test cannot be passed to the model.
    if dataset_name.endswith("Negations"):
        x_test_vec = x_test_vec_negations
    else:
        x_test_vec = x_test_vec_clean
    y_pred = svm_model.predict(x_test_vec)
    precision = precision_score(y_test, y_pred, average='macro')
    print(f"Precision for {dataset_name} with best hyperparameters: {precision}")
    # Save the trained model
    filename = f"{dataset_name}_svm_model.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(svm_model, f)
    print(f"Saved model for {dataset_name} in {filename}")

In [None]:
import optuna
import sqlite3
from sklearn.metrics import precision_score

def objective(trial, x_train, y_train, x_val, y_val):
    """Optuna objective: weighted-average precision of an SVM on the validation set.

    C (log scale) and the kernel are sampled for every trial; gamma is sampled
    only for the non-linear kernels and is fixed to "scale" for the linear one.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x_train, y_train: Training features and labels.
        x_val, y_val: Validation features and labels.

    Returns:
        The weighted-average precision on the validation set.
    """
    c_value = trial.suggest_float("C", 1e-4, 10, log=True)
    kernel_name = trial.suggest_categorical("kernel", ["linear", "rbf", "poly", "sigmoid"])
    if kernel_name in ["rbf", "poly", "sigmoid"]:
        gamma_mode = trial.suggest_categorical("gamma", ["scale", "auto"])
    else:
        gamma_mode = "scale"

    classifier = SVMModel(C=c_value, kernel=kernel_name, gamma=gamma_mode)
    classifier.fit(x_train, y_train)
    return precision_score(y_val, classifier.predict(x_val), average='weighted')

# NOTE(review): this cell used to reset `datasets` to an empty list, which turned
# both loops below into no-ops. The dataset list built by the earlier tuning
# cell is reused instead — confirm that this is the intended input.
best_hyperparams = {}

# Optuna opens its own connection from the storage URL, so no separate sqlite3
# connection is created here.
STORAGE_URL = 'sqlite:///example.db'

# The loop variables carry a `ds_` prefix so that they do not overwrite the
# notebook-level x_val / y_val.
for ds_x_train, ds_y_train, ds_x_val, ds_y_val, dataset_name in datasets:
    # Create Optuna study with SQLite storage
    study = optuna.create_study(direction="maximize", storage=optuna.storages.RDBStorage(url=STORAGE_URL))
    study.optimize(
        lambda trial: objective(trial, ds_x_train, ds_y_train, ds_x_val, ds_y_val),
        n_trials=100,
    )
    best_hyperparams[dataset_name] = study.best_params
    print(f"Best hyperparameters for {dataset_name}: {best_hyperparams[dataset_name]}")

for dataset_name, hyperparams in best_hyperparams.items():
    ds_x_train, ds_y_train, _, _, _ = next(filter(lambda d: d[4] == dataset_name, datasets))
    # "gamma" is only sampled for non-linear kernels, so it is missing from the
    # best parameters when the linear kernel wins; fall back to the value the
    # objective uses in that case.
    svm_model = SVMModel(
        C=hyperparams['C'],
        kernel=hyperparams['kernel'],
        gamma=hyperparams.get('gamma', 'scale'),
    )
    svm_model.fit(ds_x_train, ds_y_train)
    # Evaluate on the TF-IDF test matrix of the matching text variant; the raw
    # text in x_test cannot be passed to the model.
    if dataset_name.endswith("Negations"):
        x_test_vec = x_test_vec_negations
    else:
        x_test_vec = x_test_vec_clean
    y_pred = svm_model.predict(x_test_vec)
    precision = precision_score(y_test, y_pred, average='macro')
    print(f"Precision for {dataset_name} with best hyperparameters: {precision}")
    # Save the trained model
    filename = f"{dataset_name}_svm_model.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(svm_model, f)
    print(f"Saved model for {dataset_name} in {filename}")

In [None]:
# best_params_og_clean = {'C': 0.11919664233529602, 'kernel': 'rbf', 'gamma': 'scale'}
# svm_og_clean_tuned = SVMModel(C = 0.11919664233529602, kernel='rbf', gamma='scale')
# svm_og_clean_tuned.fit(x_train_vec_clean, y_train_clean)

In [None]:
# svm_og_clean_tuned_report = svm_og_clean_tuned.evaluate(x_val_vec_clean,y_val)
# svm_og_clean_tuned_report

In [None]:
# best_params_og_negations

In [None]:
# best_params = study.best_params
# best_params

## K-NN

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=af4a7c96-d103-410f-b422-a3ffefd75e74' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>