In [1]:
import pandas as pd
import pickle
import os
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [2]:
# Location of the pickled SVM model. The path is relative, so it resolves
# against the notebook's working directory. Later cells load the model from
# this file when it exists and write the model back to it.
model_path = '../pickle_Crossvalidation/SupportVectorMachine/mistral_model.pkl'

In [3]:
# Load the preprocessed CSV into a DataFrame (relative path, resolved against
# the working directory). The "text" and "label" columns are used below.
data = pd.read_csv('../../../../preprocessing/StorePreprocessed/Mistralcsv.csv')

In [4]:
# Pull the raw text (features) and label (target) columns out of the frame.
X_train, y_train = data["text"], data["label"]

In [5]:
# Character-level TF-IDF restricted to 4-character n-grams (ngram_range is
# fixed at (4, 4)), with the vocabulary capped at 11000 features.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(4, 4), max_features=11000)

In [6]:
# Drop rows where either the text or the label is missing, keeping the two
# columns aligned row-for-row. The cleaned rows are taken from `data` rather
# than by re-binding X_train in place, so this cell is idempotent: re-running
# it never calls .dropna() on the sparse matrix produced by an earlier run.
train_rows = data[["text", "label"]].dropna()
y_train = train_rows["label"]

# Fit the TF-IDF vocabulary on the cleaned text and transform it into the
# feature matrix used for training.
X_train = vectorizer.fit_transform(train_rows["text"])

In [7]:
# Candidate values of the SVC regularisation parameter C to be searched.
param_grid_svm = dict(C=[6.0, 2.0, 1.0, 0.95, 0.9, 0.8])

In [8]:
# Base estimator: a support vector classifier with a linear kernel. C is not
# set here; it is chosen by the grid search over param_grid_svm below.
svm = SVC(kernel="linear")

In [9]:
# Obtain the tuned model: train it when no cached pickle is present,
# otherwise reuse the cached one.
if not os.path.exists(model_path):
    # No cache yet: 5-fold cross-validated grid search over C, scored by
    # accuracy; keep the estimator refit with the best parameters.
    grid_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='accuracy')
    grid_svm.fit(X_train, y_train)
    svm_best = grid_svm.best_estimator_
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files that this project produced itself.
    with open(model_path, 'rb') as model_file:
        svm_best = pickle.load(model_file)


In [10]:
# Persist the selected model so later runs can load it instead of retraining.
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when a parent directory is missing, which would otherwise only surface
# after the grid search has already run.
model_dir = os.path.dirname(model_path)
if model_dir:  # os.makedirs('') raises, so skip when the path has no directory part
    os.makedirs(model_dir, exist_ok=True)

with open(model_path, 'wb') as file:
    pickle.dump(svm_best, file)