In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Location of the pickled, tuned random-forest model
# (a relative path, so it is resolved against the current working directory).
model_path = "../pickle_Crossvalidation/RandomForest/gptNeo_model.pkl"

In [3]:
# Read the preprocessed GPT-Neo CSV into a DataFrame.
csv_path = '../../../../preprocessing/StorePreprocessed/GPTNeocsv.csv'
data = pd.read_csv(csv_path)

In [4]:
# Features are the raw "text" column; targets are the "label" column.
X_train, y_train = data["text"], data["label"]

In [5]:
# TF-IDF over character 4-grams, with the vocabulary capped at 11000 features.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(4, 4),
    max_features=11000,
)

In [6]:
# Drop rows whose text is missing and keep the labels aligned with the
# surviving rows, then fit the TF-IDF vectorizer and transform the text.
#
# Both outputs are derived from `data` instead of from the previous values of
# X_train / y_train. X_train is overwritten with the vectorized matrix below,
# so deriving from it would make this cell fail when it is run a second time.
rows_with_text = data.dropna(subset=["text"])
y_train = rows_with_text["label"]
X_train = vectorizer.fit_transform(rows_with_text["text"])

In [7]:
# Hyper-parameter grid explored for the random forest.
# max_features candidates: the square root of the number of TF-IDF features,
# followed by 2 %, 4 % and 6 % of that number (all truncated to integers).
n_features = X_train.shape[1]
max_feature_options = [int(np.sqrt(n_features))]
max_feature_options += [int(fraction * n_features) for fraction in (0.02, 0.04, 0.06)]

param_grid_rf = {
    'min_samples_split': [8, 32, 128],
    'max_features': max_feature_options,
}

In [8]:
# Base estimator handed to the grid search (Gini impurity as split criterion).
# oob_score=True additionally computes an out-of-bag score on each fitted forest.
# random_state is fixed so that repeated runs build the same forests and the
# grid search selects the same hyper-parameters.
rf = RandomForestClassifier(criterion='gini', oob_score=True, random_state=42)

In [9]:
# Grid search for Random Forest.
#
# RETRAIN_MODEL replaces the former always-false `1==2` guard: while it is True
# the pickled model is ignored and the grid search always runs (the previous
# behaviour). Set it to False to reuse an existing model file instead.
RETRAIN_MODEL = True

if os.path.exists(model_path) and not RETRAIN_MODEL:
    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # were produced by this notebook or another trusted source.
    with open(model_path, 'rb') as file:
        rf_best = pickle.load(file)
else:
    # 5-fold cross-validated search over param_grid_rf, scored by accuracy.
    grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy')
    grid_rf.fit(X_train, y_train)
    # Keep the estimator refit with the best-scoring parameter combination.
    rf_best = grid_rf.best_estimator_

In [10]:
# Persist the selected model to model_path.
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when the parent directory does not exist yet.
model_dir = os.path.dirname(model_path)
if model_dir:  # an empty dirname means the current directory; nothing to create
    os.makedirs(model_dir, exist_ok=True)

with open(model_path, 'wb') as file:
    pickle.dump(rf_best, file)