In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Load preprocessed training and test datasets
df_train = pd.read_csv("preprocessed_train.csv")
df_test = pd.read_csv("preprocessed_test.csv")

# Ensure required columns exist
for df, name in [(df_train, "train"), (df_test, "test")]:
    if "text" not in df.columns or "Class" not in df.columns:
        raise ValueError(f"Columns 'text' and 'Class' not found in {name} CSV file.")

# Drop rows with NaN values in 'text' and 'Class'
df_train.dropna(subset=["text", "Class"], inplace=True)
df_test.dropna(subset=["text", "Class"], inplace=True)

# Combine text data from both datasets
all_text = pd.concat([df_train["text"], df_test["text"]], axis=0)

# Initialize and fit TF-IDF vectorizer on both train & test text data
vectorizer = TfidfVectorizer(max_features=100000)
vectorizer.fit(all_text)

# Transform train and test separately
X_train_tfidf = vectorizer.transform(df_train["text"])
X_test_tfidf = vectorizer.transform(df_test["text"])

# Save the trained vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [2]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# Load the correct TF-IDF vectorizer
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Load the preprocessed training and test datasets
df_train = pd.read_csv("preprocessed_train.csv")
df_test = pd.read_csv("preprocessed_test.csv")

# Ensure the 'text' and 'Class' columns exist
if "text" not in df_train.columns or "Class" not in df_train.columns:
    raise ValueError("Columns 'text' and 'Class' not found in the train CSV file.")

if "text" not in df_test.columns or "Class" not in df_test.columns:
    raise ValueError("Columns 'text' and 'Class' not found in the test CSV file.")

# Drop rows with NaN values in both 'text' and 'Class' columns
df_train = df_train.dropna(subset=["text", "Class"])
df_test = df_test.dropna(subset=["text", "Class"])

# Extract labels for training and test data
y_train = df_train["Class"]
y_test = df_test["Class"]

# Transform text using the same TF-IDF vectorizer
X_train_tfidf = vectorizer.transform(df_train["text"])
X_test_tfidf = vectorizer.transform(df_test["text"])

# Initialize the model (e.g., Naive Bayes)
model = MultinomialNB()

# Train the model
model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Model Evaluation Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Evaluation Results:
Accuracy: 0.7023738080949523
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.54      0.57      6000
           1       0.73      0.74      0.73      5994
           2       0.73      0.80      0.76      6000
           3       0.60      0.45      0.52      5998
           4       0.82      0.87      0.84      6000
           5       0.88      0.84      0.86      6000
           6       0.58      0.52      0.55      5998
           7       0.68      0.70      0.69      5999
           8       0.62      0.81      0.70      5999
           9       0.75      0.76      0.75      6000

    accuracy                           0.70     59988
   macro avg       0.70      0.70      0.70     59988
weighted avg       0.70      0.70      0.70     59988



In [3]:
from sklearn.linear_model import LogisticRegression

# Using Logistic Regression for text classification
model = LogisticRegression(max_iter=10000)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Model Evaluation Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Evaluation Results:
Accuracy: 0.7184770287390811
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.57      0.59      6000
           1       0.71      0.77      0.74      5994
           2       0.76      0.80      0.78      6000
           3       0.57      0.52      0.54      5998
           4       0.84      0.87      0.86      6000
           5       0.88      0.89      0.88      6000
           6       0.60      0.51      0.55      5998
           7       0.69      0.73      0.71      5999
           8       0.71      0.76      0.74      5999
           9       0.77      0.76      0.77      6000

    accuracy                           0.72     59988
   macro avg       0.71      0.72      0.72     59988
weighted avg       0.71      0.72      0.72     59988

