In [1]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\steli\OneDrive\Desktop\Stelios\DSAUEB\Trimester 1\PDS\A2\PDS-A2\Data\incidents_train.csv"
df = pd.read_csv(file_path)

# Initial inspection of the data
data_overview = {
    'Shape': df.shape,
    'Columns': df.columns.tolist(),
    'df Types': df.dtypes,
    'Missing Values': df.isnull().sum(),
}

print(data_overview)
# Drop the unnecessary index column
df = df.drop(columns=['Unnamed: 0'])


{'Shape': (5082, 11), 'Columns': ['Unnamed: 0', 'year', 'month', 'day', 'country', 'title', 'text', 'hazard-category', 'product-category', 'hazard', 'product'], 'df Types': Unnamed: 0           int64
year                 int64
month                int64
day                  int64
country             object
title               object
text                object
hazard-category     object
product-category    object
hazard              object
product             object
dtype: object, 'Missing Values': Unnamed: 0          0
year                0
month               0
day                 0
country             0
title               0
text                0
hazard-category     0
product-category    0
hazard              0
product             0
dtype: int64}


In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os
from shutil import make_archive
import re
from nltk.corpus import stopwords
import nltk

# Download stopwords from nltk (if you haven't already)
nltk.download('stopwords')

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Function to clean text (title or text) and remove stopwords
def clean_text(text):
    # Remove non-alphanumeric characters (excluding spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    text = ' '.join(text.split())
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Load test data
test_path = r"C:\Users\steli\OneDrive\Desktop\Stelios\DSAUEB\Trimester 1\PDS\A2\PDS-A2\Data\validation_data\incidents.csv"
test_df = pd.read_csv(test_path, index_col=0)

# Clean the 'text' column
test_df['text'] = test_df['text'].apply(clean_text)

# Define relevant features and targets
features = ['year', 'month', 'day', 'country']
targets_subtask1 = ['hazard-category', 'product-category']
targets_subtask2 = ['hazard', 'product']
all_targets = targets_subtask1 + targets_subtask2

# Prepare data function for test set
def prepare_test_data(text_column):
    X = test_df[features + [text_column]]  # Include cleaned text for prediction
    return X

# Define LightGBM pipeline for text
def build_lgb_pipeline_text():
    preprocessor = ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(), 'text'),  # Use TF-IDF for text
            ('num', StandardScaler(), ['year', 'month', 'day']),
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['country'])
        ]
    )
    
    # LightGBM classifier
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', lgb.LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=100, verbose=-1))
    ])
    return pipeline

# Train a model for each target
def train_lgb_model_for_target(target):
    text_pipeline = build_lgb_pipeline_text()
    
    # Split the data for training (use only the current target for y_train)
    X_train, _, y_train, _ = train_test_split(
        df[features + ['text']],  # Features
        df[target],  # Target for this specific task
        test_size=0.2, random_state=42
    )

    text_pipeline.fit(X_train, y_train)
    return text_pipeline

# Make predictions on the test data
def make_predictions_for_target(pipeline, X_test):
    return pipeline.predict(X_test)

# Prepare test data
test_X = prepare_test_data('text')

# Initialize a DataFrame to store all predictions
predictions_df = pd.DataFrame()

# Train models and make predictions for each target
for target in all_targets:
    print(f"Training and predicting for {target}...")
    
    # Train a separate model for each target
    target_pipeline = train_lgb_model_for_target(target)
    
    # Make predictions for the test set
    predictions_df[target] = make_predictions_for_target(target_pipeline, test_X)

# Step 2: Save predictions to a new folder
os.makedirs('./submission/', exist_ok=True)
predictions_df.to_csv('./submission/submission.csv', index=False)

# Step 3: Zip the folder for submission
make_archive('./submission', 'zip', './submission')

print("Predictions and submission.zip created successfully.")


Training and predicting for hazard-category...
Training and predicting for product-category...


In [None]:
predictions_df

Unnamed: 0,hazard-category,product-category,hazard,product
0,biological,"meat, egg and dairy products",listeria spp,cheese
1,biological,"meat, egg and dairy products",listeria spp,cheese
2,biological,"meat, egg and dairy products",salmonella,cheese
3,allergens,"nuts, nut products and seeds",peanuts and products thereof,cheese
4,biological,"meat, egg and dairy products",escherichia coli,cheese
...,...,...,...,...
560,allergens,fruits and vegetables,cereals containing gluten and products thereof,cheese
561,allergens,"dietetic foods, food supplements, fortified foods",milk and products thereof,cheese
562,foreign bodies,cereals and bakery products,plastic fragment,cheese
563,allergens,cereals and bakery products,peanuts and products thereof,cheese
