Clean Dataset - Remove stop word + Apply Lemmatization

In [1]:
import pandas as pd

# Load train and test data
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [2]:
train_data.head()


Unnamed: 0,Date,URL,Title,Source,Country,Label
0,20240815T010000Z,https://borneobulletin.com.bn/explosions-repor...,Explosions reported near two ships off Yemen :...,borneobulletin.com.bn,Brunei,2
1,20240716T194500Z,https://www.hindustantimes.com/india-news/crew...,"Crew , including 13 Indians , still missing af...",hindustantimes.com,India,2
2,20240809T100000Z,https://www.yahoo.com/news/multiple-attacks-ta...,Multiple attacks target merchant ship off Yeme...,yahoo.com,United States,3
3,20240717T041500Z,https://timesofoman.com/article/147862-oil-tan...,Oil tanker with 13 Indians on board sinks off ...,timesofoman.com,Oman,2
4,20240812T201500Z,https://menafn.com/1108546043/Multiple-Attacks...,Multiple Attacks Target Merchant Ship Off Yemen,menafn.com,Qatar,3


In [3]:
test_data.head()

Unnamed: 0,Date,URL,Title,Source,Country
0,20221207T020000Z,https://www.rnz.co.nz/news/national/480280/eng...,Engineer fined over huge fire at Napier Port,rnz.co.nz,
1,20221221T150000Z,https://www.ship-technology.com/news/ictsi-lea...,ICTSI reaches 30 - year lease extension for Ba...,ship-technology.com,United States
2,20221018T084500Z,https://www.malaymail.com/news/money/mediaoutr...,DHL : Ocean freight rate moving towards manage...,malaymail.com,United States
3,20221028T151500Z,https://focustaiwan.tw/society/202210280021,Indonesians stuck on vessel in Kaohsiung set t...,focustaiwan.tw,Taiwan
4,20221018T104500Z,https://bdnews24.com/bangladesh/0ggpvbnije,Body found in container sent from Chattogram t...,bdnews24.com,Bangladesh


In [4]:
# Import necessary libraries for text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download necessary resources for NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text (remove stop words and lemmatize)
def clean_text(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove punctuation and stop words, then apply lemmatization
    cleaned_tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens 
                      if word.lower() not in stop_words and word not in string.punctuation]
    
    # Join tokens back to a single string
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text

# Apply the cleaning function to 'title' columns in train and test data
train_data['cleaned_title'] = train_data['Title'].apply(clean_text)
test_data['cleaned_title'] = test_data['Title'].apply(clean_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aaron\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
train_data.head()

Unnamed: 0,Date,URL,Title,Source,Country,Label,cleaned_title
0,20240815T010000Z,https://borneobulletin.com.bn/explosions-repor...,Explosions reported near two ships off Yemen :...,borneobulletin.com.bn,Brunei,2,explosion reported near two ship yemen securit...
1,20240716T194500Z,https://www.hindustantimes.com/india-news/crew...,"Crew , including 13 Indians , still missing af...",hindustantimes.com,India,2,crew including 13 indian still missing oil tan...
2,20240809T100000Z,https://www.yahoo.com/news/multiple-attacks-ta...,Multiple attacks target merchant ship off Yeme...,yahoo.com,United States,3,multiple attack target merchant ship yemen uni...
3,20240717T041500Z,https://timesofoman.com/article/147862-oil-tan...,Oil tanker with 13 Indians on board sinks off ...,timesofoman.com,Oman,2,oil tanker 13 indian board sink oman coast
4,20240812T201500Z,https://menafn.com/1108546043/Multiple-Attacks...,Multiple Attacks Target Merchant Ship Off Yemen,menafn.com,Qatar,3,multiple attack target merchant ship yemen


In [6]:
test_data.head()

Unnamed: 0,Date,URL,Title,Source,Country,cleaned_title
0,20221207T020000Z,https://www.rnz.co.nz/news/national/480280/eng...,Engineer fined over huge fire at Napier Port,rnz.co.nz,,engineer fined huge fire napier port
1,20221221T150000Z,https://www.ship-technology.com/news/ictsi-lea...,ICTSI reaches 30 - year lease extension for Ba...,ship-technology.com,United States,ictsi reach 30 year lease extension baltic con...
2,20221018T084500Z,https://www.malaymail.com/news/money/mediaoutr...,DHL : Ocean freight rate moving towards manage...,malaymail.com,United States,dhl ocean freight rate moving towards manageab...
3,20221028T151500Z,https://focustaiwan.tw/society/202210280021,Indonesians stuck on vessel in Kaohsiung set t...,focustaiwan.tw,Taiwan,indonesian stuck vessel kaohsiung set return h...
4,20221018T104500Z,https://bdnews24.com/bangladesh/0ggpvbnije,Body found in container sent from Chattogram t...,bdnews24.com,Bangladesh,body found container sent chattogram malaysia


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import numpy as np

# Step 1: Split train_data into train and validation sets
X_full = train_data['cleaned_title']
y_full = train_data['Label'] - 1  # Adjust labels to start from 0

# Create training and validation sets
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X_full, y_full, test_size=0.2, random_state=42)



In [8]:
# Step 2: Vectorize the data
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_train_raw)
X_val = vectorizer.transform(X_val_raw)
X_test = vectorizer.transform(test_data['cleaned_title'])

In [9]:
# Step 3: Define parameter grid and perform grid search with XGBoost
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier()

# Set up GridSearchCV with AUC scoring for multiclass (One-vs-Rest)
grid_search = GridSearchCV(
    estimator=xgb_model, 
    param_grid=param_grid, 
    scoring='roc_auc_ovr', 
    cv=3, 
    n_jobs=-1, 
    verbose=1
)

# Fit grid search on training data
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best AUC Score:", grid_search.best_score_)

# Use the best model from grid search
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best AUC Score: 0.7251508208531678


In [10]:
# Step 4: Predict probabilities on the validation set
y_proba_val = best_model.predict_proba(X_val)

# Step 5: Find optimal thresholds for each class using the validation set
class_thresholds = {}
num_classes = y_proba_val.shape[1]

print("Finding best thresholds for each class...")
for class_index in range(num_classes):
    best_threshold = 0.5  # Default threshold
    best_auc = 0
    thresholds = [i * 0.05 for i in range(1, 20)]
    
    # Convert y_val to binary for the current class
    y_val_binary = (y_val == class_index).astype(int)
    
    # Check if y_val_binary has only one unique value (all 0s or all 1s)
    if len(np.unique(y_val_binary)) == 1:
        print(f"Skipping class {class_index + 1} due to lack of positive/negative examples.")
        class_thresholds[class_index] = best_threshold
        continue
    
    # Loop through possible thresholds for this class
    for threshold in thresholds:
        y_pred_threshold = (y_proba_val[:, class_index] >= threshold).astype(int)
        auc = roc_auc_score(y_val_binary, y_pred_threshold)
        
        if auc > best_auc:
            best_auc = auc
            best_threshold = threshold
    
    # Store the best threshold for the current class
    class_thresholds[class_index] = best_threshold
    print(f"Class {class_index + 1}: Best Threshold = {best_threshold}, Best AUC = {best_auc}")


Finding best thresholds for each class...
Class 1: Best Threshold = 0.15000000000000002, Best AUC = 0.625
Class 2: Best Threshold = 0.2, Best AUC = 0.7959714100064977
Class 3: Best Threshold = 0.15000000000000002, Best AUC = 0.9146341463414633
Class 4: Best Threshold = 0.15000000000000002, Best AUC = 0.8463455149501661
Skipping class 5 due to lack of positive/negative examples.
Class 6: Best Threshold = 0.05, Best AUC = 0.8953900709219859
Class 7: Best Threshold = 0.1, Best AUC = 0.5
Class 8: Best Threshold = 0.25, Best AUC = 0.5
Class 9: Best Threshold = 0.30000000000000004, Best AUC = 0.5
Class 10: Best Threshold = 0.1, Best AUC = 0.6666666666666666
Class 11: Best Threshold = 0.2, Best AUC = 0.7
Class 12: Best Threshold = 0.1, Best AUC = 0.7673297166968053
Class 13: Best Threshold = 0.05, Best AUC = 0.7920962199312714


In [11]:
# Step 6: Predict labels on the test set using class-specific thresholds
y_proba_test = best_model.predict_proba(X_test)
y_pred_test = []

for i in range(y_proba_test.shape[0]):
    class_predictions = []
    
    for class_index in range(num_classes):
        # Check if probability exceeds the threshold for this class
        if y_proba_test[i, class_index] >= class_thresholds[class_index]:
            class_predictions.append((class_index, y_proba_test[i, class_index]))
    
    # If multiple classes exceed thresholds, pick the one with the highest probability
    if class_predictions:
        best_class = max(class_predictions, key=lambda x: x[1])[0]
    else:
        # If no class exceeds thresholds, pick the class with the highest probability
        best_class = np.argmax(y_proba_test[i, :])
    
    y_pred_test.append(best_class + 1)  # Convert back to original label range (1-13)


In [2]:
# Add predictions to the test data
test_data['cleaned_title'] = test_data['cleaned_title']  # Assuming 'cleaned_title' is already in test_data
test_data['Predicted_Label'] = y_pred_test  # Add the predicted labels

# Save the entire test_data with all original columns and the new columns to predictions.csv
test_data.to_csv('xgb_predictions.csv', index=False)

# Display the first few rows to verify
print(test_data.head())


NameError: name 'test_data' is not defined