In [1]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# Load the pre-trained TF-IDF vectorizers for train and test data
vectorizer_train = joblib.load("tfidf_vectorizer_train.pkl")
vectorizer_test = joblib.load("tfidf_vectorizer_test.pkl")

# Load the preprocessed training and test datasets
df_train = pd.read_csv("preprocessed_train.csv")
df_test = pd.read_csv("preprocessed_test.csv")

# Ensure the 'text' and 'Class' columns exist
if "text" not in df_train.columns or "Class" not in df_train.columns:
    raise ValueError("Columns 'text' and 'Class' not found in the train CSV file.")

if "text" not in df_test.columns or "Class" not in df_test.columns:
    raise ValueError("Columns 'text' and 'Class' not found in the test CSV file.")

# Drop rows with NaN values in both 'text' and 'Class' columns
df_train = df_train.dropna(subset=["text", "Class"])
df_test = df_test.dropna(subset=["text", "Class"])

# Extract features and labels for training and test data
X_train = df_train["text"]  # Train data text
y_train = df_train["Class"]  # Train data labels

X_test = df_test["text"]  # Test data text
y_test = df_test["Class"]  # Test data labels

# Transform the training and test data using the corresponding vectorizers
X_train_tfidf = vectorizer_train.transform(X_train)
X_test_tfidf = vectorizer_test.transform(X_test)

# Initialize the model (e.g., Naive Bayes)
model = MultinomialNB()

# Train the model using the training data (X_train_tfidf and y_train)
model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Model Evaluation Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Model Evaluation Results:
Accuracy: 0.09478143397988625
Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.09      0.09      5995
           1       0.09      0.17      0.12      5989
           2       0.11      0.10      0.10      6000
           3       0.10      0.15      0.12      5993
           4       0.08      0.04      0.05      5999
           5       0.10      0.11      0.10      5999
           6       0.09      0.07      0.08      5989
           7       0.09      0.12      0.10      5996
           8       0.08      0.02      0.03      5999
           9       0.10      0.07      0.08      6000

    accuracy                           0.09     59959
   macro avg       0.09      0.09      0.09     59959
weighted avg       0.09      0.09      0.09     59959



In [2]:
from sklearn.linear_model import LogisticRegression

# Using Logistic Regression for text classification
model = LogisticRegression(max_iter=10000)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Model Evaluation Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Evaluation Results:
Accuracy: 0.09855067629546857
Classification Report:
               precision    recall  f1-score   support

           0       0.11      0.06      0.08      5995
           1       0.10      0.25      0.14      5989
           2       0.10      0.07      0.08      6000
           3       0.11      0.23      0.14      5993
           4       0.07      0.02      0.03      5999
           5       0.11      0.08      0.09      5999
           6       0.11      0.17      0.13      5989
           7       0.08      0.08      0.08      5996
           8       0.08      0.01      0.01      5999
           9       0.10      0.03      0.04      6000

    accuracy                           0.10     59959
   macro avg       0.09      0.10      0.08     59959
weighted avg       0.09      0.10      0.08     59959

