In [None]:
import pandas as pd

# Load the labelled training set and the unlabelled test set from CSV files
# located in the notebook's working directory.
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [None]:
# Import necessary libraries for text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download the NLTK resources used by the cleaning step.
# 'punkt_tab' is included because recent NLTK releases load the
# word_tokenize() models from it instead of the older 'punkt' package;
# requesting both keeps the notebook working on old and new versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Initialize stopwords and lemmatizer.
# A set gives constant-time membership checks during token filtering.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text (remove stop words and lemmatize)
def clean_text(text):
    """Tokenize, lowercase, filter and lemmatize a piece of text.

    Stop words (per the module-level ``stop_words`` set) and tokens that
    consist solely of punctuation characters are removed; every remaining
    token is lowercased and passed through the module-level ``lemmatizer``.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        str: The surviving tokens joined by single spaces, or an empty
        string when there is nothing left.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    punctuation_chars = set(string.punctuation)
    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        # Check character by character: a substring test against
        # string.punctuation would let multi-character punctuation tokens
        # such as "..." or "--" slip through.
        if all(char in punctuation_chars for char in token):
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(lowered))

    # Join tokens back to a single string
    return ' '.join(cleaned_tokens)

# Derive a 'cleaned_title' column from 'Title' in both the train and test frames
for frame in (train_data, test_data):
    frame['cleaned_title'] = frame['Title'].apply(clean_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, make_scorer
from sklearn.preprocessing import LabelBinarizer
import numpy as np

# Step 1: Hold out 20% of the labelled rows as a validation set
X_full = train_data['cleaned_title']
y_full = train_data['Label'].sub(1)  # shift labels down by one so they start from 0

# A fixed random_state keeps the split reproducible between runs.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 2: Turn the cleaned titles into TF-IDF features (unigrams and bigrams,
# capped at 5000 features)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# The vocabulary is learned from the training split only; the validation and
# test titles are projected onto that same vocabulary.
X_train = vectorizer.fit_transform(X_train_raw)
X_val = vectorizer.transform(X_val_raw)
X_test = vectorizer.transform(test_data['cleaned_title'])

In [None]:
# Step 3: Perform Grid Search to find the best alpha using AUC as the scoring metric
from sklearn.metrics import get_scorer

param_grid = {'alpha': [0.1, 0.5, 1.0]}

# Use scikit-learn's built-in one-vs-rest ROC AUC scorer. It is equivalent to
# make_scorer(roc_auc_score, multi_class='ovr', needs_proba=True) but does not
# rely on the `needs_proba` argument, which newer scikit-learn releases
# deprecate and then remove.
auc_scorer = get_scorer('roc_auc_ovr')
nb_model = MultinomialNB()

# 3-fold cross-validation over the alpha grid; the best configuration is
# refit on all of X_train by default and exposed as best_estimator_.
grid_search = GridSearchCV(estimator=nb_model, param_grid=param_grid, scoring=auc_scorer, cv=3, verbose=1)
grid_search.fit(X_train, y_train)

# Use the best model from GridSearchCV
best_nb_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best AUC Score from Grid Search:", grid_search.best_score_)

# Step 4: Evaluate on the validation set using the best model
y_val_pred = best_nb_model.predict(X_val)
y_val_proba = best_nb_model.predict_proba(X_val)  # Get probabilities for AUC scoring

# Calculate accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy with Naive Bayes: {val_accuracy}")

# Some classes are never predicted on this small validation set, which makes
# their precision undefined. zero_division=0 reports those as 0.0 (the same
# value the default produces) without emitting a warning for each one.
print("Classification Report:\n", classification_report(y_val, y_val_pred, zero_division=0))

# Step 4a: Per-class (one-vs-rest) AUC on the validation set.
# The columns of y_val_proba follow best_nb_model.classes_, i.e. the labels
# that were actually present in y_train. Column position and label value only
# coincide when the training labels are exactly 0..n-1, so every column is
# paired with its label explicitly instead of assuming that.
model_classes = best_nb_model.classes_
num_classes = len(model_classes)  # same value as len(np.unique(y_train))

# One indicator column per model class, in the same order as y_val_proba.
# Validation labels the model never saw produce an all-zero row.
y_val_binarized = (np.asarray(y_val)[:, None] == model_classes[None, :]).astype(int)

# Calculate AUC score for each class and skip single-class cases
auc_scores = []
for col, cls in enumerate(model_classes):
    if len(np.unique(y_val_binarized[:, col])) > 1:  # Needs both positive and negative samples
        auc = roc_auc_score(y_val_binarized[:, col], y_val_proba[:, col])
        auc_scores.append(auc)
        # Labels were shifted down by one before training; report the original value.
        print(f"AUC for class {cls + 1}: {auc}")
    else:
        print(f"Skipping AUC for class {cls + 1} due to single-class issue.")

# Calculate the average AUC score if there are valid scores
if auc_scores:
    average_auc = np.mean(auc_scores)
    print(f"Average AUC Score across all classes: {average_auc}")
else:
    print("AUC calculation skipped due to single-class issue in all classes.")


Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best Parameters: {'alpha': 0.1}
Best AUC Score from Grid Search: 0.8076588784948955
Validation Accuracy with Naive Bayes: 0.55
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.62      0.68      0.65        19
           2       0.57      0.94      0.71        18
           3       0.50      0.43      0.46        14
           5       0.80      0.67      0.73         6
           6       0.00      0.00      0.00         1
           7       1.00      0.25      0.40         4
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         3
          10       0.40      0.40      0.40         5
          11       0.46      0.52      0.49        21
          12       0.00      0.00      0.00         3

    accuracy                           0.55       100
   macro avg       0.45      0.3

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Step 5: Predict on the test set
y_test_proba = best_nb_model.predict_proba(X_test)

# np.argmax returns a column position, not a label. The columns are ordered
# like best_nb_model.classes_, so map the position through classes_ to get the
# actual label; the two differ whenever a label is missing from the training
# split.
y_test_pred = best_nb_model.classes_[np.argmax(y_test_proba, axis=1)]

# Adjust back to original label range and save predictions
test_data['Predicted_Label'] = y_test_pred + 1  # Convert back to 1-based labels
test_data.to_csv('nb_predictions.csv', index=False)
print(test_data[['cleaned_title', 'Predicted_Label']].head())

                                       cleaned_title  Predicted_Label
0               engineer fined huge fire napier port                2
1  ictsi reach 30 year lease extension baltic con...               12
2  dhl ocean freight rate moving towards manageab...                2
3  indonesian stuck vessel kaohsiung set return h...                4
4      body found container sent chattogram malaysia                4


In [None]:
# Load the saved predictions and the labelled copy of the test data
nb_predictions_df = pd.read_csv('nb_predictions.csv')
test_data_df = pd.read_excel('Test Data (with labels).xlsx')

# Rename columns in test data for clarity.
# NOTE(review): this assumes the workbook has exactly these six columns in
# this order; confirm against the file.
test_data_df.columns = ['Date', 'URL', 'Title', 'Source', 'Country', 'Actual_Label']

# Align predictions with the true labels by joining on 'URL' (inner join, so
# only URLs present in both frames are kept).
merged_df = pd.merge(nb_predictions_df, test_data_df, on='URL', suffixes=('_pred', '_actual'))

# An inner join silently drops unmatched URLs and multiplies duplicated ones,
# either of which would make the accuracy cover a different set of rows than
# was predicted. Surface that instead of hiding it.
if len(merged_df) != len(nb_predictions_df):
    print(f"Warning: {len(nb_predictions_df)} predictions but {len(merged_df)} rows after merging on URL; "
          "check for missing or duplicated URLs.")

# Calculate the accuracy score
accuracy = (merged_df['Predicted_Label'] == merged_df['Actual_Label']).mean()

# Print the accuracy
print(f"Accuracy Score: {accuracy * 100:.2f}%")

Accuracy Score: 46.95%
