<a href="https://colab.research.google.com/github/strateg17/fake-news/blob/dev/Balaka_model_selection_Optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import optuna
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from transformers import BertTokenizer, BertModel
import torch
from typing import Tuple, Dict, Callable, Any

def load_and_prepare_data(
    train_path: str = '/content/english_fact_train.parquet',
    test_path: str = '/content/english_fact_test.parquet',
    test_size: float = 0.2,
    random_state: int = 42,
) -> Tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
    """
    Load the claim datasets, encode the labels and split into train/test sets.

    Both parquet files are read (only the 'claim' and 'label' columns),
    concatenated, and re-split, so the original file-level train/test
    partition is not preserved. Rows whose label is missing or is anything
    other than 'Supported' or 'Refuted' are dropped.

    Args:
        train_path: Path to the training parquet file.
        test_path: Path to the testing parquet file.
        test_size: Fraction of the combined rows held out for testing.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        X_train, X_test, y_train, y_test: Claim texts and integer labels
        (0 = 'Supported', 1 = 'Refuted').

    Raises:
        ValueError: If no row carries one of the two recognised labels.
    """
    columns = ['claim', 'label']
    df_test = pd.read_parquet(test_path, columns=columns)
    df_train = pd.read_parquet(train_path, columns=columns)

    # Same concatenation order as before (test first) so that the seeded
    # shuffle in train_test_split yields the same partition.
    df_combined = pd.concat([df_test, df_train], ignore_index=True)

    # Labels outside the mapping become NaN and are filtered out below.
    label_mapping = {'Supported': 0, 'Refuted': 1}
    encoded = df_combined['label'].map(label_mapping)
    has_label = encoded.notna()

    if not has_label.any():
        raise ValueError(
            "No rows labelled 'Supported' or 'Refuted' were found in "
            f"{train_path!r} or {test_path!r}."
        )

    # Select through a mask instead of assigning columns onto the result of
    # dropna(), which avoids writing into a derived frame.
    X = df_combined.loc[has_label, 'claim']
    y = encoded[has_label].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

def tfidf_transform(X_train: list[str], X_test: list[str]) -> Tuple[np.ndarray, np.ndarray]:
    """
    Vectorize raw texts with TF-IDF, learning the vocabulary from the training texts only.

    Args:
        X_train (list[str]): Training texts; the vectorizer is fitted on these.
        X_test (list[str]): Testing texts; only transformed with the fitted vectorizer.

    Returns:
        A pair (train_matrix, test_matrix) of TF-IDF features. Note that
        scikit-learn's TfidfVectorizer produces SciPy sparse matrices here,
        not dense NumPy arrays.
    """
    tfidf = TfidfVectorizer()
    # Fit and transform the training texts first; the test texts are then
    # projected onto the vocabulary and IDF weights learned from training.
    train_matrix = tfidf.fit_transform(X_train)
    return train_matrix, tfidf.transform(X_test)

def objective(trial) -> float:
    """
    Objective function for Optuna to optimize LightGBM hyperparameters.

    Reads the module-level ``X_train`` and ``y_train``, which must be bound
    before the study runs. Each trial is scored on a validation split carved
    out of the training data, so the test set stays untouched during the
    search and the final test accuracy is not biased by the tuning.

    Args:
        trial: An Optuna trial object.

    Returns:
        Validation accuracy of the model with the trial's hyperparameters.
    """
    # Define the hyperparameters to tune. suggest_float(..., log=True)
    # replaces the deprecated suggest_loguniform with the same ranges.
    param = {
        'objective': 'binary',
        # NOTE(review): 'accuracy' does not appear to be one of LightGBM's
        # documented metric names (the binary error rate is 'binary_error').
        # No eval_set is passed to fit() below, so confirm whether this entry
        # has any effect and rename or remove it.
        'metric': 'accuracy',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'num_boost_round': trial.suggest_int('num_boost_round', 50, 1000),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.1, 10.0, log=True),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0)
    }

    # Fixed seed: every trial sees the same fit/validation partition, so the
    # scores of different trials are comparable.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # The TF-IDF vocabulary is learned from the fitting portion only.
    X_fit_tfidf, X_val_tfidf = tfidf_transform(X_fit, X_val)

    # Train and evaluate the LightGBM model
    model = LGBMClassifier(**param)
    model.fit(X_fit_tfidf, y_fit)
    y_pred = model.predict(X_val_tfidf)

    return accuracy_score(y_val, y_pred)

def optimize_hyperparameters(n_trials: int = 100, seed: "int | None" = None) -> Dict[str, Any]:
    """
    Optimize LightGBM hyperparameters using Optuna and return the best parameters.

    Runs a maximizing study over ``objective`` and prints the best trial.

    Args:
        n_trials: Number of Optuna trials to run.
        seed: Optional seed for the TPE sampler. When None, Optuna's default
            (unseeded) sampler is used and results vary between runs.

    Returns:
        best_params: Best hyperparameters found by Optuna. Only the values
        suggested through the trial are included; constants that
        ``objective`` sets itself are not part of this dict.
    """
    # sampler=None lets Optuna pick its default sampler, which keeps the
    # behavior of a call without arguments unchanged.
    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None

    # Create an Optuna study
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    # Print the best trial results
    print('Best trial:')
    trial = study.best_trial
    print(f'  Value: {trial.value}')
    print('  Params: ')
    for key, value in trial.params.items():
        print(f'    {key}: {value}')

    return study.best_params

def train_and_evaluate_model(best_params: Dict[str, Any], X_train: list[str], X_test: list[str], y_train: pd.Series, y_test: pd.Series) -> None:
    """
    Fit a LightGBM classifier with the given hyperparameters and report its accuracy.

    The texts are vectorized with TF-IDF, the model is fitted on the training
    portion, accuracies for both portions are printed, and the corresponding
    error rates are shown as a bar chart.

    Args:
        best_params (Dict[str, Any]): Keyword arguments for LGBMClassifier.
        X_train (list[str]): Training texts.
        X_test (list[str]): Testing texts.
        y_train (pd.Series): Training labels.
        y_test (pd.Series): Testing labels.
    """
    # Vectorize: vocabulary comes from the training texts
    train_matrix, test_matrix = tfidf_transform(X_train, X_test)

    # Fit the classifier
    classifier = LGBMClassifier(**best_params)
    classifier.fit(train_matrix, y_train)

    # Score both portions (train first, then test)
    train_predictions = classifier.predict(train_matrix)
    test_predictions = classifier.predict(test_matrix)
    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)

    # Report accuracies
    print(f"Точність на навчальній вибірці: {train_accuracy:.4f}")
    print(f"Точність на тестовій вибірці: {test_accuracy:.4f}")

    # Bar chart of error rates (1 - accuracy) on a fixed 0..1 axis
    bar_names = ['Train Error', 'Test Error']
    error_rates = [1 - train_accuracy, 1 - test_accuracy]
    plt.figure(figsize=(10, 5))
    plt.bar(bar_names, error_rates, color=['blue', 'orange'])
    plt.title('Навчальні і тестові помилки для LightGBM з використанням TF-IDF')
    plt.ylabel('Помилка')
    plt.ylim(0, 1)
    plt.show()

def main() -> None:
    """
    Main function to execute the workflow: data loading, hyperparameter optimization,
    and model training and evaluation.
    """
    # `objective` looks up X_train / X_test / y_train / y_test at module
    # scope. Binding them only as locals here would make the first Optuna
    # trial fail with NameError, so publish the splits as module globals.
    global X_train, X_test, y_train, y_test

    # Load and prepare data
    X_train, X_test, y_train, y_test = load_and_prepare_data()

    # Optimize hyperparameters
    best_params = optimize_hyperparameters()

    # Train and evaluate the model with the best hyperparameters
    train_and_evaluate_model(best_params, X_train, X_test, y_train, y_test)

# Run the full workflow only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
