In [None]:
from google.colab import drive
import pandas as pd

# Expose the Drive contents under /content/drive (uses google.colab)
drive.mount('/content/drive')

# CSV file with the training incidents
train_path = '/content/drive/MyDrive/Data/incidents_train.csv'

# Read the CSV and discard the 'Unnamed: 0' column in a single expression
# (presumably a saved row index -- confirm against the file)
df = pd.read_csv(train_path).drop(columns=['Unnamed: 0'])


In [None]:
import pandas as pd
import re
import nltk
import xgboost as xgb
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score, classification_report
import torch
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
from tqdm import tqdm  # Import tqdm for progress bar

# Fetch the NLTK 'stopwords' corpus before it is read below
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stopwords held as a set; clean_text checks every token for
# membership in it, so a set keeps that check cheap
stop_words = set(stopwords.words('english'))

# Function to clean text (title or text) and remove stopwords
def clean_text(text, excluded_words=None):
    """Normalise a title/text value and drop stopwords.

    Steps: delete every character that is not an ASCII letter, digit or
    whitespace, lowercase the result, collapse whitespace runs to single
    spaces, and remove tokens found in ``excluded_words``.

    Args:
        text: Value to clean. ``None`` and float NaN (what pandas yields for
            a missing cell) give an empty string instead of raising inside
            ``re.sub``; any other non-string value is converted with ``str``.
        excluded_words: Container of lowercase tokens to remove. Defaults to
            the module-level ``stop_words`` set.

    Returns:
        The cleaned, space-joined string (possibly empty).
    """
    if excluded_words is None:
        excluded_words = stop_words

    # NaN is the only float that differs from itself
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Punctuation is deleted (not replaced by a space), then case is folded
    normalised = re.sub(r'[^a-zA-Z0-9\s]', '', str(text)).lower()

    # split() with no argument already collapses repeated whitespace, so one
    # pass both tidies spacing and filters the stopwords
    return ' '.join(
        word for word in normalised.split() if word not in excluded_words
    )

# Input feature columns and the label columns of each subtask
features = ['year', 'month', 'day', 'country']
targets_subtask1 = ['hazard-category', 'product-category']
targets_subtask2 = ['hazard', 'product']

# Run clean_text over both free-text columns of df, overwriting them
df['title'] = df['title'].map(clean_text)
df['text'] = df['text'].map(clean_text)

# Function to prepare data for both title and text
def prepare_data(text_column):
    """Build one train/test split per target for a given text column.

    Reads the module-level ``df``, ``features``, ``targets_subtask1`` and
    ``targets_subtask2``.

    Args:
        text_column: Name of the text column of ``df`` to use next to the
            columns listed in ``features``.

    Returns:
        dict mapping each target name to the tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = df[features + [text_column]]

    # Every call uses the same test_size and random_state, so only the
    # y-series differs between targets
    data_splits = {}
    for target in targets_subtask1 + targets_subtask2:
        data_splits[target] = tuple(
            train_test_split(X, df[target], test_size=0.2, random_state=42)
        )

    return data_splits

# Per-target train/test splits, first for the title column, then for text
title_splits, text_splits = (prepare_data(column) for column in ('title', 'text'))

# Empty results table that the title run fills in
f1_scores_title_df = pd.DataFrame(columns=['Task', 'F1-Score'])

# Pretrained 'bert-base-uncased' tokenizer and encoder used by BERTEmbedder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Transformer to extract BERT embeddings
class BERTEmbedder(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that turns texts into mean-pooled embeddings.

    Args:
        tokenizer: Callable tokenizer; it is called with a list of strings
            and ``return_tensors='pt'`` and must return a mapping that
            contains ``attention_mask`` and supports ``.to(device)``.
        model: Torch encoder whose output exposes ``last_hidden_state``.
        batch_size: Number of texts encoded per forward pass.
    """

    def __init__(self, tokenizer, model, batch_size=16):
        self.tokenizer = tokenizer
        self.model = model
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """No-op: the encoder is used as-is, nothing is learned here."""
        return self

    def transform(self, X):
        """Embed every text in ``X``.

        Args:
            X: Iterable of strings (e.g. the Series a ColumnTransformer passes
                for a single, scalar column selector).

        Returns:
            2-D NumPy array with one row per text, including when ``X`` holds
            a single text.
        """
        texts = list(X)
        if not texts:
            # assumes a Hugging Face style model exposing config.hidden_size
            return torch.empty((0, self.model.config.hidden_size)).numpy()

        device = next(self.model.parameters()).device
        self.model.eval()  # inference only: keep dropout disabled

        pooled_batches = []
        with torch.no_grad():
            for start in range(0, len(texts), self.batch_size):
                batch = texts[start:start + self.batch_size]
                encoded_input = self.tokenizer(
                    batch,
                    return_tensors='pt',
                    padding=True,
                    truncation=True,
                    max_length=512,
                ).to(device)
                hidden = self.model(**encoded_input).last_hidden_state

                # Average over real tokens only: padded positions are zeroed
                # out by the attention mask so batching does not change the
                # per-text mean
                mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden.dtype)
                token_counts = mask.sum(dim=1).clamp(min=1)
                pooled_batches.append(((hidden * mask).sum(dim=1) / token_counts).cpu())

        # Concatenate tensors first and convert once; the result stays 2-D
        return torch.cat(pooled_batches, dim=0).numpy()

# Define XGBoost pipeline with BERT embeddings
def build_xgb_pipeline_with_bert(text_column):
    """Create the preprocessing + XGBoost pipeline for one text column.

    Args:
        text_column: Name of the column handed to ``BERTEmbedder``. It is
            passed as a scalar selector, so the embedder receives a 1-D
            column of strings.

    Returns:
        An unfitted ``Pipeline`` with steps ``'preprocessor'`` and
        ``'classifier'``.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', BERTEmbedder(tokenizer, bert_model), text_column),  # BERT embeddings
            ('num', StandardScaler(), ['year', 'month', 'day']),
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['country'])
        ]
    )

    # num_class is deliberately not passed: XGBClassifier takes the class
    # count from the labels seen in fit(), while a fixed value of 3 would
    # only describe a three-class target
    classifier = xgb.XGBClassifier(
        objective='multi:softmax',
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])
    return pipeline

# Train and evaluate XGBoost for title
def train_and_evaluate_xgb_title(pipeline, data_splits, targets):
    """Fit and score ``pipeline`` once per target on the title splits.

    For each target the pipeline is refitted on that target's training split,
    the weighted F1-score on the test split is appended to the module-level
    ``f1_scores_title_df``, and a classification report is printed.

    Args:
        pipeline: Estimator with ``fit`` and ``predict``; it is refitted for
            every target.
        data_splits: Mapping target -> ``(X_train, X_test, y_train, y_test)``.
        targets: Iterable of target names to process.
    """
    global f1_scores_title_df
    from sklearn.preprocessing import LabelEncoder

    for target in tqdm(targets, desc="Training and evaluating models for Title (Targets)", unit="target"):
        print(f"\nTraining and evaluating model for {target} (Title)...")
        X_train, X_test, y_train, y_test = data_splits[target]

        # XGBClassifier needs class ids 0..k-1. Encoding from the training
        # labels guarantees that even for string labels or when a class is
        # absent from this split.
        label_encoder = LabelEncoder().fit(y_train)

        print(f"Training model for {target} (Title)...")
        pipeline.fit(X_train, label_encoder.transform(y_train))

        # Map predicted ids back to the original labels so the metrics below
        # compare like with like
        y_pred = label_encoder.inverse_transform(pipeline.predict(X_test).astype(int))

        # Calculate F1-Score
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        # Collect F1-score into DataFrame using pd.concat()
        f1_scores_title_df = pd.concat([f1_scores_title_df, pd.DataFrame({'Task': [f"{target} (Title)"], 'F1-Score': [f1]})], ignore_index=True)

        # Print the classification report
        print(f"Classification Report for {target} (Title):")
        print(classification_report(y_test, y_pred, zero_division=0))  # Handle zero division gracefully

# Title run: build the pipeline, then fit and score it for all four targets
print("XGBoost for Titles:")
title_pipeline = build_xgb_pipeline_with_bert('title')
train_and_evaluate_xgb_title(title_pipeline, title_splits, [*targets_subtask1, *targets_subtask2])

# Empty results table that the text run fills in
f1_scores_text_df = pd.DataFrame(columns=['Task', 'F1-Score'])

# Define XGBoost pipeline for text
def build_xgb_pipeline_with_bert_text():
    """Create the preprocessing + XGBoost pipeline for the ``'text'`` column.

    The column layout is: BERT embeddings of ``'text'``, standard-scaled
    ``year``/``month``/``day``, and one-hot encoded ``country``.

    Returns:
        An unfitted ``Pipeline`` with steps ``'preprocessor'`` and
        ``'classifier'``.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', BERTEmbedder(tokenizer, bert_model), 'text'),  # BERT embeddings
            ('num', StandardScaler(), ['year', 'month', 'day']),
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['country'])
        ]
    )

    # num_class is deliberately not passed: XGBClassifier takes the class
    # count from the labels seen in fit(), while a fixed value of 3 would
    # only describe a three-class target
    classifier = xgb.XGBClassifier(
        objective='multi:softmax',
        max_depth=6,
        learning_rate=0.1,
        n_estimators=100,
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])
    return pipeline

# Train and evaluate XGBoost for text
def train_and_evaluate_xgb_text(pipeline, data_splits, targets):
    """Fit and score ``pipeline`` once per target on the text splits.

    For each target the pipeline is refitted on that target's training split,
    the weighted F1-score on the test split is appended to the module-level
    ``f1_scores_text_df``, and a classification report is printed.

    Args:
        pipeline: Estimator with ``fit`` and ``predict``; it is refitted for
            every target.
        data_splits: Mapping target -> ``(X_train, X_test, y_train, y_test)``.
        targets: Iterable of target names to process.
    """
    global f1_scores_text_df
    from sklearn.preprocessing import LabelEncoder

    for target in tqdm(targets, desc="Training and evaluating models for Text (Targets)", unit="target"):
        print(f"\nTraining and evaluating model for {target} (Text)...")
        X_train, X_test, y_train, y_test = data_splits[target]

        # XGBClassifier needs class ids 0..k-1. Encoding from the training
        # labels guarantees that even for string labels or when a class is
        # absent from this split.
        label_encoder = LabelEncoder().fit(y_train)

        print(f"Training model for {target} (Text)...")
        pipeline.fit(X_train, label_encoder.transform(y_train))

        # Map predicted ids back to the original labels so the metrics below
        # compare like with like
        y_pred = label_encoder.inverse_transform(pipeline.predict(X_test).astype(int))

        # Calculate F1-Score
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        # Collect F1-score into DataFrame using pd.concat()
        f1_scores_text_df = pd.concat([f1_scores_text_df, pd.DataFrame({'Task': [f"{target} (Text)"], 'F1-Score': [f1]})], ignore_index=True)

        # Print the classification report
        print(f"Classification Report for {target} (Text):")
        print(classification_report(y_test, y_pred, zero_division=0))  # Handle zero division gracefully

# Text run: build the pipeline, then fit and score it for all four targets
print("\nXGBoost for Texts:")
text_pipeline = build_xgb_pipeline_with_bert_text()
train_and_evaluate_xgb_text(text_pipeline, text_splits, [*targets_subtask1, *targets_subtask2])

# Show both score tables, each under its heading
print("\nCollected F1-Scores for Title-Focused Classification:", f1_scores_title_df, sep="\n")
print("\nCollected F1-Scores for Text-Focused Classification:", f1_scores_text_df, sep="\n")

# One figure holding both result sets as bar series
plt.figure(figsize=(10, 6))
plt.bar(x=f1_scores_title_df['Task'], height=f1_scores_title_df['F1-Score'], label='Title-Focused')
plt.bar(x=f1_scores_text_df['Task'], height=f1_scores_text_df['F1-Score'], label='Text-Focused')

# Axis labels, heading, tilted tick labels and legend
plt.title('F1-Scores for Title-Focused vs Text-Focused Classification')
plt.xlabel('Task')
plt.ylabel('F1-Score')
plt.xticks(rotation=45)
plt.legend()

# Fit the layout to the figure, then render it
plt.tight_layout()
plt.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


XGBoost for Titles:


Training and evaluating models for Title (Targets):   0%|          | 0/4 [00:00<?, ?target/s]


Training and evaluating model for hazard-category (Title)...
Training model for hazard-category (Title)...


Training and evaluating models for Title (Targets):   0%|          | 0/4 [03:30<?, ?target/s]


KeyboardInterrupt: 