In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, http

# Mount Drive


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/CMSC723
CROWD = "project_materials/umd_reddit_suicidewatch_dataset_v2/crowd"
EXPERT = "project_materials/umd_reddit_suicidewatch_dataset_v2/expert"

# Read Data

In [None]:
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
from datasets import Dataset
import pandas as pd
from os.path import join
import matplotlib.pyplot as plt
from datasets.load import load_metric
import numpy as np

In [None]:
crowd_train = pd.read_csv(join(CROWD, "train", "crowd_train.csv"))
crowd_train_posts = pd.read_csv(join(CROWD, "train", "shared_task_posts.csv"))
crowd_test = pd.read_csv(join(CROWD, "test", "crowd_test.csv"))
crowd_test_posts = pd.read_csv(join(CROWD, "test", "shared_task_posts_test.csv"))

# Replace NaNs

In [None]:
crowd_train.dropna(inplace=True)
crowd_test.dropna(inplace=True)

crowd_train_posts["post_title"].fillna(value="", inplace=True)
crowd_test_posts["post_title"].fillna(value="", inplace=True)
crowd_train_posts["post_body"].fillna(value="", inplace=True)
crowd_test_posts["post_body"].fillna(value="", inplace=True)

# Labels

In [None]:
crowd_train["high_severity"] = ((((crowd_train["label"] == "c") | (crowd_train["label"] == "d"))).astype("int"))
crowd_test["high_severity"] = ((((crowd_test["raw_label"] == "c") | (crowd_test["raw_label"] == "d"))).astype("int"))

# Join Crowd Posts and Crowd Training Labels

In [None]:
crowd_train_posts["post_text"] = crowd_train_posts["post_title"] + " " + crowd_train_posts["post_body"]
crowd_test_posts["post_text"] = crowd_test_posts["post_title"] + " " + crowd_test_posts["post_body"]

At first, we tried combining all of the post text for one user into a single input to feed into our LLM. However, the input size for these models is restricted, and since we do not want to truncate our input, we will instead use the suicidality assignment for each Reddit user as a weakly supervised label for the Reddit posts of that user in order to fine-tune our LLM.

In [None]:
crowd_train_df = pd.merge(left=crowd_train_posts, right=crowd_train, on="user_id")
crowd_test_df = pd.merge(left=crowd_test_posts, right=crowd_test, on="user_id")

# Use DistilBERT Tokenizer on Post Text Data

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

def tokenize_function(row):
  return tokenizer(row["post_text"], padding="max_length", truncation=True)

crowd_train_df["post_text_tokenized"] = crowd_train_df.apply(func=lambda row: tokenize_function(row), axis=1)
crowd_test_df["post_text_tokenized"] = crowd_test_df.apply(func=lambda row: tokenize_function(row), axis=1)

In [None]:
crowd_train_df["input_ids"] = crowd_train_df.apply(func=lambda row: row["post_text_tokenized"]["input_ids"], axis=1)
crowd_train_df["attention_mask"] = crowd_train_df.apply(func=lambda row: row["post_text_tokenized"]["attention_mask"], axis=1)
crowd_test_df["input_ids"] = crowd_test_df.apply(func=lambda row: row["post_text_tokenized"]["input_ids"], axis=1)
crowd_test_df["attention_mask"] = crowd_test_df.apply(func=lambda row: row["post_text_tokenized"]["attention_mask"], axis=1)

In [None]:
crowd_train_df = crowd_train_df.rename(columns={"label": "labels"})
crowd_test_df = crowd_test_df.rename(columns={"raw_label": "labels"})

# Train the Model

In [None]:
num_categories = 2
epochs = 1

print("Using %i categories" % num_categories)

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-cased", num_labels=num_categories)

In [None]:
args = TrainingArguments(
          output_dir= "distilbert-base-cased-checkpoint",
          do_train=True,
          do_eval=True,
          num_train_epochs=epochs,
          evaluation_strategy='epoch'
        )

In [None]:
crowd_train_df.drop(labels=['post_text_tokenized'], inplace=True, axis=1)
crowd_test_df.drop(labels=['post_text_tokenized'], inplace=True, axis=1)

In [None]:
crowd_train_dataset = Dataset.from_pandas(crowd_train_df[:10])
crowd_test_dataset = Dataset.from_pandas(crowd_test_df[:10])

In [None]:
from datasets.load import load_metric

def f1_and_accuracy(eval_pred):
    """
    Compute glue_mrpc for the classification task using the
    load_metric function.  This function is needed for the
    compute_metrics argument of the Trainer.

    You shouldn't need to modify this function.

    Keyword args:
    eval_pred -- Output from a classifier with the logits and labels.
    """

    metric_f1 = load_metric("f1")
    metric_accuracy = load_metric("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    print({"f1": metric_f1.compute(predictions=predictions, references=labels)["f1"], "accuracy": metric_accuracy.compute(predictions=predictions, references=labels)["accuracy"]})
    return {"f1": metric_f1.compute(predictions=predictions, references=labels)["f1"], "accuracy": metric_accuracy.compute(predictions=predictions, references=labels)["accuracy"]}

In [None]:
trainer = Trainer(
          model=model,
          args=args,
          train_dataset=crowd_train_dataset,
          eval_dataset=crowd_test_dataset,
          tokenizer=tokenizer,
          compute_metrics=f1_and_accuracy
        )

In [None]:
trainer.train()

# Predict the label for each Reddit User

In [None]:
crowd_train_predictions = trainer.predict(crowd_train_dataset)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: subreddit, timestamp, post_text, post_body, __index_level_0__, post_id, user_id, post_title. If subreddit, timestamp, post_text, post_body, __index_level_0__, post_id, user_id, post_title are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10
  Batch size = 8


{'f1': 1.0, 'accuracy': 1.0}


In [None]:
crowd_train_df["suicidality_probs"] = softmax(crowd_train_predictions.predictions, axis=1)[:,1]

In [None]:
user_post_groups_crowd_train = crowd_train_df.groupby("user_id")
X_train = crowd_train_df.groupby("user_id")['suicidality_probs'].mean().to_numpy().reshape(user_post_groups_crowd_train.ngroups, 1)
y_train = crowd_train_df.groupby("user_id")['labels'].agg(pd.Series.mode).to_numpy()

In [None]:
user_post_groups_crowd_test = crowd_test_df.groupby("user_id")
X_test = crowd_test_df.groupby("user_id")['suicidality_probs'].mean().to_numpy().reshape(user_post_groups_crowd_test.ngroups, 1)
y_test = crowd_test_df.groupby("user_id")['labels'].agg(pd.Series.mode).to_numpy()

In [None]:
clf = DecisionTreeClassifier(max_depth=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shortened_combined_expert_df["suicidality_probs"] = probabilities[:,1]


In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("F1:", metrics.f1_score(y_test, y_pred))

# Save the Model

In [None]:
model.save_pretrained("saved_model")
filename = 'finalized_decision_tree.sav'
pickle.dump(clf, open(filename, 'wb'))