In [None]:
!jupyter kernelspec list

In [None]:
# Import necessary libraries
import sys
import pathlib
import string
import re
import os
import logging
import functools

# Add the scripts folder to the system path
sys.path.append("../scripts")
import normalize_text_bootcamp
import utils_bootcamp
import plotting

logging.basicConfig(level=logging.INFO)

# Import data handling and visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray

# PyTorch for deep learning
import torch
import torch.nn.functional

# Scikit-learn for data processing and metrics
import sklearn.metrics
import sklearn.model_selection

# Hugging Face for DeBERTa model
import datasets
import transformers

In [None]:
# Seed both PyTorch and NumPy with the same value so repeated runs draw the same random numbers.
random_seed = 42
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# Load the tweet dataset from its NetCDF file.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
file_path = "/p/project/deepacf/maelstrom/haque1/dataset/tweets_2017_01_era5_normed_filtered.nc"
ds_tweets = xarray.load_dataset(file_path)

In [None]:
ds_tweets

In [None]:
# A tweet is labelled "raining" when its precipitation variable exceeds a small threshold.
key_tp = "tp_h"
ds_tweets["raining"] = (["index"], ds_tweets[key_tp].values > 1e-8)

# Stratified 80/20 split over positional row indices. The notebook-wide
# `random_seed` is reused instead of a second hard-coded literal so the
# split follows the seed if that seed is ever changed.
indices_train, indices_test = sklearn.model_selection.train_test_split(
    np.arange(ds_tweets["index"].shape[0]),
    test_size=0.2,
    random_state=random_seed,
    shuffle=True,
    stratify=ds_tweets["raining"].values,
)

In [None]:
ds_tweets

In [None]:
# plt.hist(indices_train, bins=np.unique(indices_train), alpha=0.5, label='Training Set')
# plt.hist(indices_test, bins=np.unique(indices_test), alpha=0.5, label='Test Set')
# plt.xlabel('Label')
# plt.ylabel('Count')
# plt.title('Label Distribution in Training and Test Set')
# plt.legend()
# plt.show()

In [None]:
# Load the pretrained tokenizer
model_nm = "/p/project/deepacf/maelstrom/haque1/deberta-v3-small"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_nm)
db_config_base = transformers.AutoConfig.from_pretrained(model_nm, num_labels=2)


# Function to tokenize text
def tokenize_function(examples):
    """Tokenize the ``"inputs"`` field of ``examples`` with the notebook-level ``tokenizer``.

    Padding and truncation are both enabled. The tokenizer is read from the
    enclosing (notebook) scope, not passed in.
    """
    texts = examples["inputs"]
    return tokenizer(texts, truncation=True, padding=True)


# Prepare the dataset for the Hugging Face model
def get_dataset(ds, tokenizer, indices_train, indices_test):
    """Convert the xarray dataset into tokenized Hugging Face train/test splits.

    Args:
        ds: dataset providing the ``text_normalized`` and ``raining`` variables.
        tokenizer: accepted for signature compatibility; tokenization is done by
            ``tokenize_function``, which uses the notebook-level tokenizer.
        indices_train: row positions forming the ``"train"`` split.
        indices_test: row positions forming the ``"test"`` split.

    Returns:
        A ``datasets.DatasetDict`` with the keys ``"train"`` and ``"test"``.
    """
    column_map = {"text_normalized": "inputs", "raining": "label"}
    frame = ds[list(column_map)].to_dataframe().rename(columns=column_map)
    tokenized = datasets.Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    return datasets.DatasetDict(
        {
            "train": tokenized.select(indices_train),
            "test": tokenized.select(indices_test),
        }
    )


# Create the dataset
dataset = get_dataset(ds_tweets, tokenizer, indices_train, indices_test)

In [None]:
# Define hyperparameters
parameters = {
    "learning_rate": 8e-5,
    "batch_size": 16,
    "weight_decay": 0.01,
    "epochs": 1,
    "warmup_ratio": 0.1,
    "cls_dropout": 0.3,
    "lr_scheduler_type": "cosine",
}

# Specify your personal folder for model outputs
FOLDER_TO_OUTPUT = "/p/project/deepacf/maelstrom/your_user/model/"


# Function to get the model
def get_model(params, db_config_base, model_nm):
    """Instantiate a two-label sequence-classification model from ``model_nm``.

    Args:
        params: mapping providing ``"cls_dropout"``, or ``None`` to leave the
            dropout of the base configuration untouched.
        db_config_base: base model configuration; it is copied, never modified.
        model_nm: model name or path given to ``from_pretrained``.

    Returns:
        The model built by ``AutoModelForSequenceClassification.from_pretrained``.
    """
    import copy

    # Work on a private copy: updating the caller's config in place would leak
    # this call's dropout setting into every later use of `db_config_base`.
    db_config = copy.deepcopy(db_config_base)
    if params is not None:
        db_config.update({"cls_dropout": params["cls_dropout"]})
    db_config.update({"num_labels": 2})
    return transformers.AutoModelForSequenceClassification.from_pretrained(model_nm, config=db_config)


# Function to compute metrics
def compute_metrics(eval_pred):
    """Return the F1 score of the arg-max class predictions.

    Args:
        eval_pred: pair ``(predictions, labels)``; the arg-max is taken along
            axis 1 of ``predictions``.

    Returns:
        ``{"f1": <score>}``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    f1 = sklearn.metrics.f1_score(references, predicted_classes)
    return {"f1": f1}


# Function to get the trainer
def get_trainer(dataset, parameters):
    """Build a ``transformers.Trainer`` for the tokenized train/test splits.

    Args:
        dataset: mapping with ``"train"`` and ``"test"`` tokenized datasets.
        parameters: hyperparameters; reads ``learning_rate``, ``batch_size``,
            ``epochs``, ``weight_decay``, ``warmup_ratio``, ``cls_dropout`` (via
            ``get_model``) and, when present, ``lr_scheduler_type``.

    Returns:
        A ``transformers.Trainer`` that evaluates and saves once per epoch.

    Uses the notebook-level ``FOLDER_TO_OUTPUT``, ``db_config_base``, ``model_nm``
    and ``tokenizer``.
    """
    training_args = transformers.TrainingArguments(
        output_dir=FOLDER_TO_OUTPUT,
        learning_rate=parameters["learning_rate"],
        per_device_train_batch_size=parameters["batch_size"],
        per_device_eval_batch_size=parameters["batch_size"],
        num_train_epochs=parameters["epochs"],
        weight_decay=parameters["weight_decay"],
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        save_strategy="epoch",
        warmup_ratio=parameters["warmup_ratio"],
        # Honour the configured scheduler; "linear" is the library default and
        # keeps the previous behaviour when the key is absent.
        lr_scheduler_type=parameters.get("lr_scheduler_type", "linear"),
    )

    return transformers.Trainer(
        model_init=lambda: get_model(parameters, db_config_base, model_nm),
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        # Tokenization pads only within each `map` batch, so rows can differ in
        # length; pad every training/eval batch to a common length on the fly.
        data_collator=transformers.DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

In [None]:
# Initialize the trainer
trainer = get_trainer(dataset, parameters)

In [None]:
# Start training
trainer.train()

In [None]:
# Evaluate the model on the test set
results = trainer.evaluate()

# The test split was already tokenized when `dataset` was built, so reuse it
# (the previously referenced `prepare_dataset` helper does not exist in this notebook).
test_dataset = dataset["test"]

# Make predictions: keep the raw scores for the ROC curve and derive hard labels from them
logits = trainer.predict(test_dataset).predictions
prediction_probability = torch.nn.functional.softmax(torch.Tensor(logits), dim=1).numpy()[:, 1]
predictions = np.argmax(logits, axis=1)

# True labels (cast to int so they share a dtype with `predictions`)
true_labels = np.asarray(test_dataset["label"]).astype(int)

# Confusion Matrix
conf_matrix = sklearn.metrics.confusion_matrix(true_labels, predictions)
print(conf_matrix)

# ROC Curve and AUC: scored with the class-1 probability, because hard 0/1
# labels would collapse the curve to a single operating point
fpr, tpr, thresholds = sklearn.metrics.roc_curve(true_labels, prediction_probability)
auc = sklearn.metrics.auc(fpr, tpr)

# Plotting ROC Curve
plt.figure()
plt.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % auc)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Prepare the full dataset with the notebook's own helper
# (`prepare_dataset` is not defined anywhere in this notebook).
full_dataset = get_dataset(ds_tweets, tokenizer, indices_train, indices_test)

# Initialize the trainer with the full dataset
full_trainer = get_trainer(full_dataset, parameters)

# Train on the full dataset
full_trainer.train()

# Evaluate and analyze the results as done in Exercise 7

## 2

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    DebertaTokenizer,
    DebertaForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from torch.utils.data import Dataset
import torch

# Load your dataset (replace this with your actual loading code)
# For example, if your xarray dataset is named 'dataset':
# df = dataset.to_dataframe()
# Assuming a simple DataFrame with columns 'tweet' and 'label'
df = pd.DataFrame(
    {
        "tweet": [
            "It is raining today",
            "What a sunny day",
            "Raining again!",
            "No rain today",
        ],
        "label": [1, 0, 1, 0],  # 1 for 'Raining', 0 for 'Not Raining'
    }
)

# Text Preprocessing
# Add any specific text preprocessing here if needed

# Splitting the dataset (seeded so the toy split is identical on every run)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["tweet"], df["label"], test_size=0.2, random_state=42
)

# Load DeBERTa tokenizer
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# Tokenize the texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)


# Create a Dataset object
class TweetDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-sample sequences; labels: one entry per sample
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of samples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of sample `idx` to a tensor, then attach its label.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample


train_dataset = TweetDataset(train_encodings, train_labels.tolist())
val_dataset = TweetDataset(val_encodings, val_labels.tolist())

# Load DeBERTa model for sequence classification
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
evaluation_result = trainer.evaluate()

# Print evaluation results
print("Evaluation results:", evaluation_result)

# Save the model
model.save_pretrained("./deberta_tweet_classifier")

In [None]:
import matplotlib.pyplot as plt

# Assuming 'evaluation_result' contains the results from the trainer.evaluate()
# Example: evaluation_result = {'eval_loss': 0.123, 'eval_accuracy': 0.95, ...}

# Extract metrics: one bar per key of the evaluation dictionary, in dictionary order
metrics = evaluation_result.keys()
values = [evaluation_result[metric] for metric in metrics]

# Plotting
# NOTE(review): all entries share one y-axis; if the dictionary mixes quantities of
# very different magnitude the small ones will be hard to read. Confirm the contents.
plt.figure(figsize=(10, 6))
plt.bar(metrics, values, color="blue")
plt.xlabel("Metrics")
plt.ylabel("Values")
plt.title("Model Evaluation Results")
plt.xticks(rotation=45)
plt.show()

# 3

In [None]:
import pandas as pd
import numpy as np
import xarray
import torch
from torch.utils.data import Dataset
from transformers import (
    DebertaTokenizer,
    DebertaForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset
FOLDER_DATA = "/p/project/deepacf/maelstrom/haque1/dataset/"
FOLDER_TWEET = FOLDER_DATA + "tweets_2017_01_era5_normed_filtered.nc"
ds_tweets = xarray.load_dataset(FOLDER_TWEET)

# Define labels based on a condition (e.g., a certain threshold)
key_tp = "tp_h"  # Replace with your key
ds_tweets["raining"] = (["index"], ds_tweets[key_tp].values > 1e-8)

# Split the dataset.
# The values of the "index" coordinate are split (not 0..N-1 positions), because the
# selections below (`isin` on the "index" column) compare against those values.
# The split is stratified on the label so both parts keep the same rain ratio.
indices_train, indices_test = train_test_split(
    ds_tweets["index"].values,
    test_size=0.2,
    random_state=42,
    stratify=ds_tweets["raining"].values,
)

# Convert the dataset to pandas DataFrame
df = ds_tweets.to_dataframe().reset_index()
df_train = df.loc[df["index"].isin(indices_train)]
df_test = df.loc[df["index"].isin(indices_test)]

train_texts = df_train["text_normalized"].tolist()
train_labels = df_train["raining"].astype(int).tolist()
val_texts = df_test["text_normalized"].tolist()
val_labels = df_test["raining"].astype(int).tolist()

In [None]:
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [None]:
class TweetDataset(Dataset):
    """Dataset wrapper that serves tokenizer encodings together with their labels."""

    def __init__(self, encodings, labels):
        # Store both containers as given; nothing is copied or converted here.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Gather all per-sample fields (label last), then tensorize entry `idx` of each.
        fields = dict(self.encodings.items())
        fields["labels"] = self.labels
        return {name: torch.tensor(values[idx]) for name, values in fields.items()}


train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)

In [None]:
# NOTE(review): this cell repeats the TweetDataset definition of the preceding cell;
# one of the two cells can be dropped.
class TweetDataset(Dataset):
    """Torch dataset yielding one dict of tensors (encodings plus ``"labels"``) per sample."""

    def __init__(self, encodings, labels):
        self.encodings = encodings  # mapping: field name -> per-sample sequences
        self.labels = labels  # sequence with one label per sample

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Tensorize each encoding field for sample `idx`, then append the label tensor.
        label = torch.tensor(self.labels[idx])
        features = {key: torch.tensor(self.encodings[key][idx]) for key in self.encodings.keys()}
        return {**features, "labels": label}


train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)

In [None]:
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
trainer.train()

In [None]:
evaluation_result = trainer.evaluate()
print("Evaluation results:", evaluation_result)

In [None]:
model.save_pretrained("./deberta_tweet_classifier")

In [None]:
import sys

sys.path.append("../helpers/plotting")
import test_training_distribution
import dataset_length_distribution

In [None]:
test_training_distribution.plot_label_distribution(
    df,
    "raining",
    title="Distribution of Tweets (Raining vs Not Raining)",
    x_label="Raining",
    y_label="Number of Tweets",
)

In [None]:
dataset_length_distribution.plot_numeric_distribution(
    df,
    "text_normalized",
    bins=30,
    title="Distribution of Tweet Lengths",
    x_label="Tweet Length",
    y_label="Frequency",
)

In [None]:
# seaborn is first imported in a later cell of this notebook; import it here so
# that this cell also works on a fresh kernel run from top to bottom.
import seaborn as sns

# Character count per tweet (vectorized string length)
df["tweet_length"] = df["text_normalized"].str.len()
sns.histplot(df["tweet_length"], bins=30)
plt.title("Distribution of Tweet Lengths")
plt.xlabel("Tweet Length")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Assuming 'test_ds' is your test dataset in the format expected by Hugging Face
# test_ds = get_dataset(
#     ds_tweets.sel(index=indices_test),
#     tok_func,
#     tokenizer,
#     indices_train,
#     indices_test,
#     train=False
# )
test_ds = val_dataset
# Make predictions
preds_output = trainer.predict(test_ds)

In [None]:
preds = torch.nn.functional.softmax(torch.Tensor(preds_output.predictions), dim=1).numpy()
prediction_probability = preds[:, 1]  # Probability of 'Raining'
predictions = preds.argmax(axis=-1)  # Predicted class (0 or 1)

In [None]:
# This is a selection of your xarray dataset corresponding to the test set.
# It is built with a membership mask so the rows stay in dataset order, exactly like
# `df_test`; selecting with the shuffled `indices_test` directly would reorder them.
is_test_row = np.isin(ds_tweets["index"].values, indices_test)
ds_test = ds_tweets.isel(index=np.flatnonzero(is_test_row))

# Actual labels, taken from the list that built `val_dataset`, so that entry i of
# `truth` belongs to the same tweet as entry i of the predictions.
truth = np.asarray(val_labels)
assert len(truth) == len(predictions), "truth and predictions must cover the same tweets"

In [None]:
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'predictions' and 'truth' are your model's predictions and the true labels
report = classification_report(truth, predictions, target_names=["Not Raining", "Raining"], output_dict=True)

# Plotting the classification report
sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True)
plt.title("Classification Report")
plt.show()

In [None]:
import roc

roc.plot_roc_curve(
    truth,
    prediction_probability,
    title="My Custom ROC Title",
    color="red",
    linestyle="-.",
    linewidth=2.5,
    legend_loc="upper left",
    figsize=(5, 4),
)

In [None]:
from sklearn.metrics import roc_curve, auc

fpr, tpr, _ = roc_curve(truth, prediction_probability)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC)")
plt.legend(loc="lower right")
plt.show()

In [None]:
import confusion_matrix

confusion_matrix.plot_confusion_matrix(
    truth,
    predictions,
    labels=["Class 0", "Class 1"],
    title="My Custom Confusion Matrix",
    cmap="Blues",
    figsize=(5, 4),
)

In [None]:
import sklearn.metrics

# Call the function through its module: importing it as a bare `confusion_matrix`
# would rebind the name of the local plotting helper module imported in the
# previous cell and break that cell when it is run again.
cm = sklearn.metrics.confusion_matrix(truth, predictions)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# 4

In [None]:
import xarray as xr
import sys
import transformers
import datasets
import functools
import xarray as xr
import os
from sklearn.model_selection import train_test_split

sys.path.append("../helpers/plotting")
import test_training_distribution

sys.path.append("../helpers")
from transformer_trainer import get_trainer

ds_raw = xr.open_dataset("/p/project/deepacf/maelstrom/haque1/dataset/tweets_2017_01_era5_normed_filtered.nc")

In [None]:
# again define labels
key_tp = "tp_h"
ds_raw["raining"] = (["index"], ds_raw[key_tp].values > 1e-8)

In [None]:
from sklearn.model_selection import train_test_split

labels = ds_raw["raining"]

# Stratified 80/20 split of the index values. Plain NumPy arrays are passed and the
# seed is fixed so that the same tweets land in the test set on every run.
indices_train, indices_test = train_test_split(
    ds_raw.index.values,
    test_size=0.20,
    random_state=42,
    stratify=labels.values,
)

In [None]:
# ds_raw

In [None]:
# test_training_distribution.plot_label_distribution_split(
#     ds_raw.index, [indices_train, indices_test],
#     column='raining',
#     titles=['Training Set Label Distribution',
#             'Test Set Label Distribution'],
#     x_label='Label',
#     y_label='Frequency',
#     figsize=(12, 6)
# )

In [None]:
# Load the pretrained tokenizer and model configuration
model_nm = "/p/project/deepacf/maelstrom/haque1/deberta-v3-small"  # Path to model
tokenizer = transformers.AutoTokenizer.from_pretrained(model_nm)
db_config_base = transformers.AutoConfig.from_pretrained(model_nm, num_labels=2)


# Define function to tokenize the field 'inputs' stored in x
def tok_func(x, tokenizer):
    """Tokenize ``x["inputs"]`` with ``tokenizer``.

    Padding and truncation are enabled and sequences are capped at 512 tokens.
    """
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 512}
    texts = x["inputs"]
    return tokenizer(texts, **tokenizer_options)


# Function to convert the dataset to a format used by Hugging Face
def get_dataset(ds, tok_func, tokenizer, indices_train, indices_test, train=True):
    """Tokenize the tweets of ``ds`` and return them in Hugging Face format.

    Args:
        ds: dataset providing the ``text_normalized`` and ``raining`` variables.
        tok_func: callable ``tok_func(batch, tokenizer=...)`` applied batch-wise.
        tokenizer: tokenizer handed to ``tok_func``.
        indices_train: rows selected for the ``"train"`` split (used when ``train`` is true).
        indices_test: rows selected for the ``"test"`` split (used when ``train`` is true).
        train: when true return a ``DatasetDict`` with both splits, otherwise
            return the whole tokenized dataset unsplit.
    """
    renamed = (
        ds[["text_normalized", "raining"]]
        .to_pandas()
        .rename(columns={"text_normalized": "inputs", "raining": "labels"})
    )
    tokenize_batch = functools.partial(tok_func, tokenizer=tokenizer)
    tokenized = datasets.Dataset.from_pandas(renamed).map(tokenize_batch, batched=True)
    if not train:
        return tokenized
    splits = {"train": tokenized.select(indices_train), "test": tokenized.select(indices_test)}
    return datasets.DatasetDict(splits)

In [None]:
dataset = get_dataset(ds_raw, tok_func, tokenizer, indices_train, indices_test)

In [None]:
# dataset

In [None]:
FOLDER_TO_OUTPUT = "./outputs"

In [None]:
parameters = {
    "learning_rate": 8e-5,
    "batch_size": 16,
    "weight_decay": 0.01,
    "epochs": 1,
    "warmup_ratio": 0.1,
    "cls_dropout": 0.3,
    "lr_scheduler_type": "cosine",
}

os.makedirs(FOLDER_TO_OUTPUT, exist_ok=True)

In [None]:
trainer = get_trainer(dataset, db_config_base, model_nm, FOLDER_TO_OUTPUT, parameters)

# Start training
trainer.train()

In [None]:
import transformers
import datasets
import sklearn.metrics
import functools
import numpy as np
import os


def get_model(params, db_config_base, model_nm):
    """Create a two-label sequence-classification model from ``model_nm``.

    Args:
        params: mapping providing ``"cls_dropout"``, or ``None`` to keep the
            dropout of the base configuration.
        db_config_base: base model configuration; left unmodified.
        model_nm: model name or path passed to ``from_pretrained``.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``.
    """
    import copy

    # Copy first: `update` changes the config in place, and the base config is a
    # shared notebook-level object that later calls must see unchanged.
    db_config = copy.deepcopy(db_config_base)
    if params is not None:
        db_config.update({"cls_dropout": params["cls_dropout"]})
    db_config.update({"num_labels": 2})
    return transformers.AutoModelForSequenceClassification.from_pretrained(model_nm, config=db_config)


def compute_metrics(eval_pred):
    """Compute the per-class F1 scores of the arg-max predictions.

    Args:
        eval_pred: pair ``(predictions, labels)``; the arg-max is taken along the
            last axis of ``predictions``.

    Returns:
        Dict with the keys ``"f1_not_raining"`` and ``"f1_raining"``.
    """
    scores, references = eval_pred
    report = sklearn.metrics.classification_report(
        references,
        scores.argmax(axis=-1),
        target_names=["not raining", "raining"],
        output_dict=True,
    )
    return {
        "f1_not_raining": report["not raining"]["f1-score"],
        "f1_raining": report["raining"]["f1-score"],
    }


def get_trainer(dataset, db_config_base, model_nm, FOLDER_TO_OUTPUT, parameters):
    """Build a ``transformers.Trainer`` for the tokenized train/test splits.

    Args:
        dataset: mapping with ``"train"`` and ``"test"`` tokenized datasets.
        db_config_base: base model configuration forwarded to ``get_model``.
        model_nm: model name or path forwarded to ``get_model``.
        FOLDER_TO_OUTPUT: output directory for ``TrainingArguments``.
        parameters: hyperparameters; reads ``learning_rate``, ``warmup_ratio``,
            ``lr_scheduler_type``, ``batch_size``, ``epochs``, ``weight_decay``
            and (through ``get_model``) ``cls_dropout``.

    Returns:
        A ``transformers.Trainer`` that evaluates and saves once per epoch.

    The tokenizer is not a parameter: the notebook-level ``tokenizer`` is used.
    """
    args = transformers.TrainingArguments(
        FOLDER_TO_OUTPUT,
        learning_rate=parameters["learning_rate"],
        warmup_ratio=parameters["warmup_ratio"],
        lr_scheduler_type=parameters["lr_scheduler_type"],
        disable_tqdm=False,
        fp16=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=parameters["batch_size"],
        per_device_eval_batch_size=parameters["batch_size"],
        num_train_epochs=parameters["epochs"],
        weight_decay=parameters["weight_decay"],
        report_to="none",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    def model_init(trial=None):
        # Trainer calls a one-argument model_init with the hyperparameter-search
        # trial, which is None in a plain training run. Fall back to `parameters`
        # in that case so the configured "cls_dropout" is actually applied.
        params = trial if trial is not None else parameters
        return get_model(params, db_config_base=db_config_base, model_nm=model_nm)

    return transformers.Trainer(
        model_init=model_init,
        args=args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )


parameters = {
    "learning_rate": 8e-5,
    "batch_size": 16,
    "weight_decay": 0.01,
    "epochs": 1,
    "warmup_ratio": 0.1,
    "cls_dropout": 0.3,
    "lr_scheduler_type": "cosine",
}


db_config_base = transformers.AutoConfig.from_pretrained(model_nm)


os.makedirs(FOLDER_TO_OUTPUT, exist_ok=True)

trainer = get_trainer(dataset, db_config_base, model_nm, FOLDER_TO_OUTPUT, parameters)
trainer.train()

In [None]:
# this is the test dataset in the format expected by Hugging Face
test_ds = get_dataset(
    ds_raw.sel(index=indices_test),
    tok_func,
    tokenizer,
    indices_train,
    indices_test,
    train=False,  # not training anymore
)
# this is a selection of our xarray dataset that corresponds to the tweets that are part of the test set
ds_test = ds_raw.sel(index=indices_test)

In [None]:
import sys

sys.path.append("../scripts")
import plotting

# Turn the raw model outputs into class probabilities; `dim=1` makes the class
# axis explicit instead of relying on softmax's deprecated implicit choice.
logits = torch.Tensor(trainer.predict(test_ds).predictions)
preds = torch.nn.functional.softmax(logits, dim=1).numpy()
prediction_probability = preds[:, 1]
predictions = preds.argmax(axis=-1)
truth = ds_test.raining.values
plotting.analysis.classification_report(labels=truth, predictions=predictions)
plotting.analysis.plot_roc(truth=truth, prediction_probability=prediction_probability)
# Same `plotting.analysis` namespace as the two calls above.
plotting.analysis.check_prediction(truth=truth, prediction=predictions);

In [None]:
def load_saved_trained_model(ds, folder_to_model, db_config_base, model_nm, parameters):
    """Recreate a trainer from the tokenizer and configuration saved in ``folder_to_model``.

    Args:
        ds: dataset handed to ``get_dataset``.
        folder_to_model: folder used both as model source and as trainer output folder.
        db_config_base: not used; the configuration is reloaded from ``folder_to_model``.
        model_nm: not used by this function.
        parameters: hyperparameters forwarded to ``get_trainer``.

    Returns:
        The trainer built by ``get_trainer``.

    Relies on the notebook-level ``tok_func``, ``indices_train`` and ``indices_test``.
    """
    # Tokenizer and two-label configuration both come from the saved folder.
    saved_tokenizer = transformers.AutoTokenizer.from_pretrained(folder_to_model)
    saved_config = transformers.AutoConfig.from_pretrained(folder_to_model, num_labels=2)
    tokenized_splits = get_dataset(ds, tok_func, saved_tokenizer, indices_train, indices_test)
    return get_trainer(tokenized_splits, saved_config, folder_to_model, folder_to_model, parameters)


import glob
import os

# Concatenating the output folder with itself never yields an existing path.
# The Trainer stores its saved states in "checkpoint-<step>" sub-folders of the
# output folder, so pick the one with the highest step number.
checkpoint_folders = sorted(
    (
        folder
        for folder in glob.glob(os.path.join(FOLDER_TO_OUTPUT, "checkpoint-*"))
        if folder.rsplit("-", 1)[-1].isdigit()
    ),
    key=lambda folder: int(folder.rsplit("-", 1)[-1]),
)
if not checkpoint_folders:
    raise FileNotFoundError(f"No checkpoint-* folder found in {FOLDER_TO_OUTPUT!r}; train the model first.")

trainer_evaluate = load_saved_trained_model(
    ds_raw,
    checkpoint_folders[-1],
    db_config_base,
    model_nm,
    parameters,
)