## Rain Classifier through Relevance-Filtered Data Selection

### Objective
The main goal of this project is to develop a rain classifier utilizing relevant Tweets. "Relevant" Tweets are those identified by the relevance classifier as containing adequate information. 

### Key Components
1. **Relevance Classifier Integration:** Implement the [07RelevanceClassifierForNewDataset](https://github.com/rajhaq/AP2-Social-media-data-for-better-local-forecasts/blob/4-train-relevance-classifier/07RelevanceClassifierForNewDataset.ipynb) notebook to filter and select "relevant" Tweets from the dataset. This step ensures that only informative Tweets are utilized for rain classification.
2. **Baseline Model Data:** Utilize a one-year dataset (2017) as the baseline for comparison. The initial dataset will undergo filtration by removing 'snow' related tweets.
3. **Rain Classifier Training (First Iteration):** Initially, train the rain classifier using the one-year dataset (2017), similar to the approach followed during Day 2 of the bootcamp.
4. **Rain Classifier Training (Second Iteration):** Train the rain classifier exclusively using the subset of Tweets marked as "relevant" by the relevance classifier. This step involves retraining the rain classifier to focus solely on informative Tweets.
5. **Performance Evaluation:** Evaluate the performance of both iterations of the rain classifier. Compare the results of the baseline model, trained on the entire (snow-filtered) 2017 Tweet dataset, with those of the model trained only on the relevance-filtered dataset. This comparative analysis will provide insights into the effectiveness of utilizing relevant Tweets for rain classification.


In [None]:
import copy
import functools
import logging
import re
import sys

logging.basicConfig(level=logging.INFO)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray

# Pytorch modules
import torch
import torch.nn.functional

# scikit-learn modules
import sklearn.metrics
import sklearn.model_selection

# "Hugging Face" modules
import datasets
import transformers

sys.path.append("/bootcamp/AP2/scripts")
import plotting

In [None]:
# list the name of every CUDA device visible to PyTorch (an empty list means no GPU is available)
[torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]

## First Iteration
The rain classifier trained on the one-year dataset (2017).

In [None]:
# load the 2017 tweet dataset from its `.nc` file into an xarray dataset
folder_path = "/p/project/deepacf/maelstrom/haque1/dataset/"
file_name = folder_path + "tweets_2017_era5_normed_filtered.nc"
ds_tweets = xarray.load_dataset(file_name)

In [None]:
# again define labels
# a tweet is labelled as "raining" when its `tp_h` value exceeds 1e-8
# (presumably `tp_h` is the hourly total precipitation -- TODO confirm)
key_tp = "tp_h"
ds_tweets["raining"] = (["index"], ds_tweets[key_tp].values > 1e-8)

In [None]:
# removing snow related tweets: keep only tweets whose normalized text does not contain "snow" (case-insensitive)
# NOTE(review): `where(..., drop=True)` masks before dropping, which may promote the dtype of
# non-float variables such as `raining` -- confirm the label dtype is still what the model expects
ds_tweets = ds_tweets.where(~ds_tweets.text_normalized.str.contains("snow", flags=re.IGNORECASE), drop=True)

In [None]:
# split the integer positions 0..N-1 of the tweets into 80% training and 20% test indices,
# shuffled with a fixed seed and stratified on the "raining" label
indices_train, indices_test = sklearn.model_selection.train_test_split(
    np.arange(ds_tweets["index"].shape[0]),
    random_state=42,
    shuffle=True,
    test_size=0.2,
    stratify=ds_tweets["raining"].values,
)

In [None]:
# load the pretrained tokenizer
model_nm = (
    "/p/project/deepacf/maelstrom/haque1/deberta-v3-small"  # model repo downloaded from Hugging Face to the cluster
)

tokenizer = transformers.AutoTokenizer.from_pretrained(model_nm)
# base model configuration for a two-label classification task ("not raining" / "raining")
db_config_base = transformers.AutoConfig.from_pretrained(model_nm, num_labels=2)

In [None]:
def tok_func(x, tokenizer):
    """
    applies `tokenizer` with padding enabled to the texts stored under the key 'inputs' of x
    """
    texts = x["inputs"]
    return tokenizer(texts, padding=True)


def get_dataset(ds, tok_func, tokenizer, indices_train, indices_test, train=True):
    """
    converts dataset to the tokenized 'dataset' format required by Hugging Face

    Parameters:
    ----------
    ds: dataset providing the variables "text_normalized" and "raining"
    tok_func: function used for tokenization, called as tok_func(batch, tokenizer=tokenizer)
    tokenizer: tokenizer handed to `tok_func`
    indices_train: indices corresponding to the training set (only used if `train` is True)
    indices_test: indices corresponding to the test set (only used if `train` is True)
    train: if used for training

    Returns
    -------
    if `train`: `datasets.DatasetDict` with the tokenized splits "train" and "test",
    otherwise the complete tokenized dataset
    """
    # Hugging Face datasets can be built directly from a pandas dataframe, so go through pandas;
    # the text column becomes "inputs" and the target column becomes "label"
    frame = ds[["text_normalized", "raining"]].to_pandas()
    frame = frame.rename(columns={"text_normalized": "inputs", "raining": "label"})
    tokenize = functools.partial(tok_func, tokenizer=tokenizer)
    tokenized = datasets.Dataset.from_pandas(frame).map(tokenize, batched=True)
    if not train:
        return tokenized
    return datasets.DatasetDict(
        {"train": tokenized.select(indices_train), "test": tokenized.select(indices_test)}
    )

In [None]:
# create Hugging Face 'dataset' with the tokenized "train" and "test" splits
dataset = get_dataset(ds_tweets, tok_func, tokenizer, indices_train, indices_test)

In [None]:
# we define our hyper-parameters in a dictionary `parameters`
parameters = {
    "learning_rate": 8e-5,
    "batch_size": 16,
    "weight_decay": 0.01,
    "epochs": 1,
    "warmup_ratio": 0.1,
    "cls_dropout": 0.3,
    "lr_scheduler_type": "cosine",
}

# folder passed to the trainer as its output location
FOLDER_TO_OUTPUT = "/p/project/deepacf/maelstrom/haque1/model/"

In [None]:
def get_model(params, db_config_base, model_nm):
    """
    function to retrieve model, format follows Hugging Face convention (parameter -> 'params')

    Parameters:
    ----------
    params: dictionary providing "cls_dropout", or None to keep the dropout of the base configuration
    db_config_base: default model configuration (left unmodified)
    model_nm: model folder

    Returns
    -------
    sequence classification model with two labels loaded from `model_nm`
    """
    # work on a private copy: updating the shared base configuration in place would leak
    # settings of one call into every later model built from the same configuration
    db_config = copy.deepcopy(db_config_base)
    if params is not None:
        db_config.update({"cls_dropout": params["cls_dropout"]})
    db_config.update({"num_labels": 2})
    model = transformers.AutoModelForSequenceClassification.from_pretrained(model_nm, config=db_config)
    return model


def compute_metrics(eval_pred):
    """
    compute f1 metrics of both labels, format follows Hugging Face convention

    Parameters:
    ----------
    eval_pred: pair (predictions, labels) of the evaluation/test set, where predictions holds one score per class

    Returns
    -------
    dictionary returning labeled f1 score of "not raining" and "raining"
    """
    predictions, labels = eval_pred
    # the predicted class is the one with the highest score
    predictions = predictions.argmax(axis=-1)
    # pin the label set so the report always has both classes; without it the call fails
    # whenever only a single class occurs in the labels and predictions
    classification_report = sklearn.metrics.classification_report(
        labels, predictions, labels=[0, 1], target_names=["not raining", "raining"], output_dict=True
    )
    f1_not_raining = classification_report["not raining"]["f1-score"]
    f1_raining = classification_report["raining"]["f1-score"]
    return {"f1_not_raining": f1_not_raining, "f1_raining": f1_raining}


def get_trainer(dataset, db_config_base, model_nm, FOLDER_TO_OUTPUT, parameters):
    """
    builds the `transformers.Trainer` used to train and evaluate a model with Hugging Face

    The hyper-parameters in `parameters` are forwarded to `transformers.TrainingArguments`.
    Note: `tokenizer` is not a parameter of this function, it is read from the enclosing scope.

    Parameters:
    ----------
    dataset: dataset in format required by Hugging Face, with the splits "train" and "test"
    db_config_base: default model configurations
    model_nm: model folder
    FOLDER_TO_OUTPUT: folder where trained model, tokenizer,... will be saved
    parameters: dictionary of hyper-parameters

    Returns
    -------
    trainer with assigned parameters used for training
    """
    # bind configuration and model folder up front, leaving only `params` of `get_model` open,
    # so that the function can be handed over as `model_init`
    model_factory = functools.partial(get_model, db_config_base=db_config_base, model_nm=model_nm)
    training_arguments = transformers.TrainingArguments(
        output_dir=FOLDER_TO_OUTPUT,
        # optimization schedule
        learning_rate=parameters["learning_rate"],
        weight_decay=parameters["weight_decay"],
        warmup_ratio=parameters["warmup_ratio"],
        lr_scheduler_type=parameters["lr_scheduler_type"],
        num_train_epochs=parameters["epochs"],
        # batching and precision
        per_device_train_batch_size=parameters["batch_size"],
        per_device_eval_batch_size=parameters["batch_size"],
        fp16=True,
        # evaluate and save once per epoch, keep the best model when training finishes
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # progress bar on, no external experiment tracking
        disable_tqdm=False,
        report_to="none",
    )
    return transformers.Trainer(
        model_init=model_factory,
        args=training_arguments,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [None]:
# trainer of the first iteration: uses the train/test splits of the full (snow-filtered) dataset
trainer = get_trainer(dataset, db_config_base, model_nm, FOLDER_TO_OUTPUT, parameters)

In [None]:
# train the first-iteration rain classifier
trainer.train()

In [None]:
# this is a selection of our xarray dataset that corresponds to the tweets that are part of the test set
# (isel() instead of sel() because `indices_test` holds integer positions)
ds_test = ds_tweets.isel(index=indices_test)
# tokenized version of the same test tweets; the index arguments are not used when train=False
test_ds = get_dataset(ds_test, tok_func, tokenizer, indices_train, indices_test, train=False)

In [None]:
# convert the predicted scores of the test set to class probabilities; the class axis is the last one
# and is given explicitly, as relying on the implicit softmax dimension is deprecated
preds = torch.nn.functional.softmax(torch.Tensor(trainer.predict(test_ds).predictions), dim=-1).numpy()
# probability of class 1 ("raining") and the predicted class per tweet
prediction_probability = preds[:, 1]
predictions = preds.argmax(axis=-1)
truth = ds_test.raining.values
plotting.analysis.classification_report(labels=truth, predictions=predictions)
plotting.analysis.plot_roc(truth=truth, prediction_probability=prediction_probability)
# NOTE(review): accessed via `plotting.plotting.analysis`, unlike the two calls above -- confirm the module path
plotting.plotting.analysis.check_prediction(truth=truth, prediction=predictions);

## Second Iteration
Rain classifier using only the relevant Tweets identified by the relevance classifier.
- Load the relevance classifier. Specify the path (`checkpoint_folder`) to the desired checkpoint in the output folder.
- Filter the dataset `ds_tweets` by the relevance classifier.
- Train the model using the new filtered dataset.
- Test the model's performance and plot the ROC curve and confusion matrix. Compare the results with the main classifier.

In [None]:
# checkpoint folder of the trained relevance classifier
# replace with relevance classifier checkpoint folder
checkpoint_folder = "/p/project/deepacf/maelstrom/haque1/AP2-Social-media-data-for-better-local-forecasts/tests/outputs/checkpoint-268"

In [None]:
# load the trained relevance classifier from `checkpoint_folder`
# NOTE(review): `load_saved_trained_model` is not defined in the visible part of this notebook --
# confirm it is defined or imported before this cell runs
trainer_evaluate = load_saved_trained_model(
    ds_tweets,
    checkpoint_folder,
    db_config_base,
    model_nm,
    parameters,
)
# obtain the complete dataset in Hugging Face format
# (with train=False the two index arguments are not used by `get_dataset`)
trainer_ds = get_dataset(
    ds_tweets,
    tok_func,
    tokenizer,
    [],
    ds_tweets["index"],
    train=False,  # not training anymore
)

In [None]:
# make predictions: class probabilities of the relevance classifier for every tweet
# (the class axis is given explicitly, as relying on the implicit softmax dimension is deprecated)
preds = torch.nn.functional.softmax(torch.Tensor(trainer_evaluate.predict(trainer_ds).predictions), dim=-1).numpy()

In [None]:
# histogram (100 bins) of the predicted probability of class 1 over all tweets
plt.hist(preds[:, 1], bins=100);

In [None]:
# keep only the tweets whose predicted probability of class 1 exceeds 0.6
# (presumably class 1 means "relevant" -- verify against the relevance classifier's label definition)
filtered_ds_tweets = ds_tweets.sel(index=ds_tweets["index"][preds[:, 1] > 0.6])

In [None]:
# preview of the first 11 predictions: position, class probabilities,
# whether the tweet passes the 0.6 threshold and the highest class probability
for i, pred in enumerate(preds[:11]):
    print(i, pred, pred[1] > 0.6, pred.max())

In [None]:
# same 80/20 stratified split as in the first iteration (identical seed),
# now on the integer positions of the relevance-filtered tweets
filtered_indices_train, filtered_indices_test = sklearn.model_selection.train_test_split(
    np.arange(filtered_ds_tweets["index"].shape[0]),
    random_state=42,
    shuffle=True,
    test_size=0.2,
    stratify=filtered_ds_tweets["raining"].values,
)

In [None]:
# create Hugging Face 'dataset' with the tokenized "train" and "test" splits of the filtered tweets
filtered_dataset = get_dataset(filtered_ds_tweets, tok_func, tokenizer, filtered_indices_train, filtered_indices_test)

In [None]:
# trainer of the second iteration, same hyper-parameters as the first one
# NOTE(review): receives the same FOLDER_TO_OUTPUT as the first trainer -- confirm that sharing the output folder is intended
filtered_trainer = get_trainer(filtered_dataset, db_config_base, model_nm, FOLDER_TO_OUTPUT, parameters)

In [None]:
# train the second-iteration rain classifier on the relevance-filtered tweets
filtered_trainer.train()

In [None]:
# tokenized test tweets of the filtered dataset
# (with train=False the two index arguments are not used by `get_dataset`)
filtered_test_ds = get_dataset(
    filtered_ds_tweets.isel(index=filtered_indices_test),  # Use isel() instead of sel() for integer indexing
    tok_func,
    tokenizer,
    filtered_indices_train,
    filtered_indices_test,
    train=False,  # not training anymore
)

In [None]:
# this is a selection of the filtered xarray dataset that corresponds to the tweets that are part of the test set
# (selected by integer position, hence isel())
filtered_ds_test = filtered_ds_tweets.isel(index=filtered_indices_test)

In [None]:
# convert the predicted scores of the filtered test set to class probabilities; the class axis is the
# last one and is given explicitly, as relying on the implicit softmax dimension is deprecated
preds = torch.nn.functional.softmax(
    torch.Tensor(filtered_trainer.predict(filtered_test_ds).predictions), dim=-1
).numpy()
# probability of class 1 ("raining") and the predicted class per tweet
prediction_probability = preds[:, 1]
predictions = preds.argmax(axis=-1)
truth = filtered_ds_test.raining.values

In [None]:
# same evaluation as for the first iteration so that both classifiers can be compared
plotting.analysis.classification_report(labels=truth, predictions=predictions)
plotting.analysis.plot_roc(truth=truth, prediction_probability=prediction_probability)
# NOTE(review): accessed via `plotting.plotting.analysis`, unlike the two calls above -- confirm the module path
plotting.plotting.analysis.check_prediction(truth=truth, prediction=predictions)