In [1]:
import pandas as pd
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
# Where the selected logistic-regression model is pickled. The same path is
# checked for a cached model before training and written to afterwards.
# Relative paths resolve against the notebook's working directory.
model_path = (
    '../pickle_Crossvalidation/LogisticRegression/llama3_model.pkl'
)

In [3]:
# Read the dataset CSV into a DataFrame. The path is relative, so it resolves
# against the notebook's working directory.
csv_path = '../../../../preprocessing/StorePreprocessed/Llama3csv.csv'
data = pd.read_csv(csv_path)

In [4]:
# Train on every row of `data` (no subsampling is applied here):
# the "text" column is the model input, the "label" column is the target.
X_train, y_train = data["text"], data["label"]

In [5]:
# Character-level TF-IDF restricted to 4-grams, with the vocabulary capped
# at 11000 features.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(4, 4),
    max_features=11000,
)

In [6]:
# Remove rows with a missing value in EITHER the text or the label column.
# Previously only missing text was dropped, so a missing label would have
# survived and been passed to the classifier's fit.
# A single boolean mask is applied to both Series so they stay aligned row
# for row.
rows_ok = X_train.notna() & y_train.notna()
y_train = y_train.loc[rows_ok]

# Fit the TF-IDF vocabulary on the remaining texts and transform them.
# NOTE: this rebinds X_train from a pandas Series to the TF-IDF feature
# matrix, so the cell that selects data["text"] must be re-run before this
# cell can be executed again.
X_train = vectorizer.fit_transform(X_train.loc[rows_ok])

# Sanity check: one feature row per label.
assert X_train.shape[0] == len(y_train)

In [7]:
# Candidate values for the estimator's `C` hyperparameter; the grid search
# tries each of them in turn.
c_candidates = [6.0, 2.0, 1.0, 0.95, 0.9, 0.8]
param_grid_lr = {'C': c_candidates}

In [8]:
# Base estimator for the grid search: logistic regression with an
# elastic-net penalty (equal L1/L2 mix via l1_ratio=0.5). 'saga' is the
# scikit-learn solver that supports the elastic-net penalty.
# `C` is not set here because the grid search supplies it from param_grid_lr.
#
# random_state is fixed because 'saga' shuffles the data while optimising;
# without a seed, repeated runs can yield different fitted models.
#
# NOTE(review): max_iter is left at the library default. If scikit-learn
# emits a ConvergenceWarning during the search, consider raising it (TODO confirm).
lr = LogisticRegression(
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.5,
    random_state=42,
)

In [9]:
# Obtain `lr_best`: train via grid search when no pickled model exists at
# `model_path`, otherwise reuse the pickled one.
if not os.path.exists(model_path):
    # No cached model: 5-fold cross-validated search over param_grid_lr,
    # scored by accuracy, keeping the best estimator found.
    grid_lr = GridSearchCV(
        estimator=lr,
        param_grid=param_grid_lr,
        cv=5,
        scoring='accuracy',
    )
    grid_lr.fit(X_train, y_train)
    lr_best = grid_lr.best_estimator_
else:
    # Cached model found: load it instead of retraining.
    # NOTE(review): unpickling can execute arbitrary code, so only load
    # files from a trusted location.
    with open(model_path, 'rb') as pickled_model:
        lr_best = pickle.load(pickled_model)

In [10]:
# Persist the selected model to `model_path`.
# The target directory is created first: without it, open(..., 'wb') raises
# FileNotFoundError when the folder is missing, and a freshly trained model
# would not be saved.
# Note that this cell also rewrites the file when the model was just loaded
# from that same path.
model_dir = os.path.dirname(model_path)
if model_dir:  # os.makedirs('') raises, so skip it for a bare file name
    os.makedirs(model_dir, exist_ok=True)

with open(model_path, 'wb') as file:
    pickle.dump(lr_best, file)