# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Columns used below: the text feature plus the two label parts
TEXT_AND_LABEL_COLUMNS = ['cleaned_text', 'category', 'sub_category']

# Read the training set from disk
data = pd.read_csv('train.csv')

# Report how many entries are missing in each of the columns used below
print("Checking for NaN values in data:")
print(data[TEXT_AND_LABEL_COLUMNS].isnull().sum())

# Replace missing entries in those columns with empty strings
for _column in TEXT_AND_LABEL_COLUMNS:
    data[_column] = data[_column].fillna('')

# Build one target label per row in the form "<category> | <sub_category>"
data['combined_label'] = data['category'] + ' | ' + data['sub_category']

# First carve off 40% of the rows, then halve that remainder, which gives
# 60% training, 20% validation and 20% test data
train_data, temp_data = train_test_split(
    data,
    test_size=0.4,
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

# Feature (text) and target (combined label) for each of the three splits
X_train, y_train = train_data['cleaned_text'], train_data['combined_label']
X_val, y_val = val_data['cleaned_text'], val_data['combined_label']
X_test, y_test = test_data['cleaned_text'], test_data['combined_label']

# Define a function to train and evaluate a model
def train_and_evaluate_model(model_name, model):
    """Fit a TF-IDF + ``model`` pipeline and print its evaluation metrics.

    The pipeline is fitted on the module-level ``X_train`` / ``y_train``
    and scored on the module-level validation and test splits. Results are
    only printed; nothing is returned.

    Args:
        model_name: Label used as a prefix in the printed output.
        model: Estimator instance placed after the ``TfidfVectorizer`` step.
    """
    print(f"\nTraining and evaluating model: {model_name}")

    # Vectorizer and estimator are chained so that raw text can be passed
    # directly to fit() and predict()
    classifier = make_pipeline(TfidfVectorizer(), model)
    classifier.fit(X_train, y_train)

    # Accuracy on the validation split
    val_accuracy = accuracy_score(y_val, classifier.predict(X_val))
    print(f'{model_name} Validation Accuracy: {val_accuracy:.4f}')

    # Test predictions are stored because both the accuracy and the
    # classification report are computed from them
    test_predictions = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, test_predictions)
    print(f'{model_name} Test Accuracy: {accuracy:.4f}')

    # Per-class breakdown on the test split
    print("Classification Report for Test Data:")
    print(classification_report(y_test, test_predictions))

# Train and evaluate a multinomial Naive Bayes classifier
train_and_evaluate_model(model_name="Naive Bayes", model=MultinomialNB())
