<a href="https://colab.research.google.com/github/ricardoskewes/68610finalprojectgroup48/blob/colab_branch/truly_final_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
%%bash
# Exit early (skipping the clone) when the Colab package directory cannot be stat'ed,
# i.e. when the notebook is not running on Colab.
# NOTE(review): this relies on `!(...)` being parsed as a negated subshell — confirm with the bash used by the cell.
!(stat -t /usr/local/lib/*/dist-packages/google/colab > /dev/null 2>&1) && exit
# Remove any previous checkout so re-running the cell does not fail on an existing directory.
rm -rf 68610finalprojectgroup48
git clone https://github.com/ricardoskewes/68610finalprojectgroup48.git


Cloning into '68610finalprojectgroup48'...


In [91]:
# Runtime installation of the third-party packages imported by the setup cell.
# NOTE(review): no versions are pinned, so a fresh run may install different releases than the recorded run.
!pip install sacrebleu simpletransformers
!pip install wordfreq
!pip install wandb




# Setup and imports

In [107]:

# Append the cloned repository to sys.path

import sys
# The repository root is appended (lowest priority) ...
sys.path.append("/content/68610finalprojectgroup48")
projectDir = "/content/68610finalprojectgroup48/tweepfake"
# ... while the tweepfake sub-project is inserted first so that `from DataHandler import DataHandler` resolves there.
sys.path.insert(0, projectDir)
# Directory name for results; not referenced again in the cells visible here.
resultsDir = projectDir+"/data/results"

# Standard libraries
import os
import csv
import logging
import math
import datetime

# Data manipulation libraries
import numpy as np
import pandas as pd

# NLP libraries
import spacy
from textblob import TextBlob
from wordfreq import word_frequency

#  confusion matrix stuff
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
import torch
from transformers import set_seed
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# wandb library
import wandb

#DataHandler
from DataHandler import DataHandler

from datasets import Dataset

# Set random seed for reproducibility
# `random_state` is reused later as the Simple Transformers `manual_seed` and stored in the wandb artifact metadata.
random_state = 523
set_seed(random_state)


# Data loading

In [108]:
# Data loading code (unchanged)
# NOTE: DataHandler is already imported by the setup cell; the repeated import below is redundant but harmless.
from DataHandler import DataHandler
dh = DataHandler()

# Paths to datasets
csvTrainDataset = projectDir + "/data/splits/train.csv"
csvValDataset = projectDir + "/data/splits/validation.csv"
csvTestDataset = projectDir + "/data/splits/test.csv"

# Load datasets
# NOTE(review): readCSVData is project code not shown here; it is assumed to return a pandas DataFrame.
dfTrain = dh.readCSVData(csvTrainDataset)
dfVal = dh.readCSVData(csvValDataset)
dfTest = dh.readCSVData(csvTestDataset)

# Select interesting columns for this study
dfTrainDataset = dfTrain[["screen_name", "text", "account.type"]]
dfValDataset = dfVal[["screen_name", "text", "account.type"]]
dfTestDataset = dfTest[["screen_name", "text", "account.type"]]

# Prepare training data
# `drop` returns a new frame, so renaming the columns below does not touch df*Dataset.
# After the rename "account.type" is called "label".
X_train_all = dfTrainDataset.drop(columns=['screen_name'])
X_train_all.columns = ["text", "label"]

X_val_all = dfValDataset.drop(columns=['screen_name'])
X_val_all.columns = ["text", "label"]

X_test_all = dfTestDataset.drop(columns=['screen_name'])
X_test_all.columns = ["text", "label"]

# Map labels to integers
dictLabels = {"human": 0, "bot": 1}

# The dictionary lookup raises KeyError for any account type other than "human" or "bot".
X_train_all["label"] = X_train_all["label"].apply(lambda x: dictLabels[x])
X_val_all["label"] = X_val_all["label"].apply(lambda x: dictLabels[x])
X_test_all["label"] = X_test_all["label"].apply(lambda x: dictLabels[x])

# Extract labels
y_train = X_train_all["label"]
y_val = X_val_all["label"]
y_test = X_test_all["label"]

# Plain-list copies of the integer labels.
train_labels = y_train.tolist()
val_labels = y_val.tolist()
test_labels = y_test.tolist()


# Feature Functions

In [109]:
# Feature creating functions (unchanged)
# Shared spaCy pipeline, loaded once; get_spacy_doc runs it for every spaCy-based feature.
spacy_nlp = spacy.load("en_core_web_sm")

def get_spacy_doc(text):
    """Run the shared ``spacy_nlp`` pipeline on ``text`` and return the resulting doc."""
    parsed = spacy_nlp(text)
    return parsed

def switch_spacy_to_text(func):
    """
    Decorator that lets a function written against a spaCy doc be called with raw text.

    The wrapper parses its first positional argument with ``get_spacy_doc`` and calls
    ``func`` with the parsed doc in its place.

    Args:
        func (callable): Function whose first parameter is a spaCy doc.

    Returns:
        callable: Wrapper taking the raw text as its first positional argument; any
        further positional and keyword arguments are forwarded to ``func`` unchanged.
    """
    import functools

    @functools.wraps(func)  # keep the wrapped function's name/docstring for debugging
    def inner1(*args, **kwargs):
        spacy_doc = get_spacy_doc(args[0])
        # Forward the remaining positional arguments too; they used to be dropped
        # silently, so e.g. a positional `lang` never reached the wrapped function.
        return func(spacy_doc, *args[1:], **kwargs)
    return inner1

def count_words(text):
    """C1: number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

def avg_word_length(text):
    """
    C2: mean length of the whitespace-separated words in ``text``.

    Args:
        text (str): Text to measure.

    Returns:
        float: Mean word length, or 0.0 when ``text`` contains no words (empty or
        whitespace-only), instead of the ``nan`` that ``np.mean([])`` would give.
    """
    words = text.split()  # C2
    if not words:
        # np.mean([]) yields nan plus a RuntimeWarning; report a neutral 0.0 instead.
        return 0.0
    return np.mean([len(w) for w in words])

@switch_spacy_to_text
def get_ANP(spacy_doc):
    """
    Collect adjectives, nouns and pronouns and their densities.

    Punctuation and whitespace tokens are ignored, both when collecting the words
    and when counting the tokens used as the density denominator.

    Returns:
        tuple: (adjectives, nouns, pronouns, adj_density, noun_density, pronoun_density);
        the three densities are 0 when no countable token is left.
    """
    content_tokens = [tok for tok in spacy_doc if not (tok.is_punct or tok.is_space)]

    def texts_tagged(pos_tag):
        # Surface forms of the countable tokens carrying the given coarse POS tag.
        return [tok.text for tok in content_tokens if tok.pos_ == pos_tag]

    adjectives = texts_tagged("ADJ")
    nouns = texts_tagged("NOUN")
    pronouns = texts_tagged("PRON")

    token_count = len(content_tokens)
    if token_count:
        densities = tuple(len(group) / token_count for group in (adjectives, nouns, pronouns))
    else:
        # Nothing to count: report zero densities rather than dividing by zero.
        densities = (0, 0, 0)

    return (adjectives, nouns, pronouns, *densities)

def get_ANP_clean(text):
    """C3: (adjective, noun, pronoun) densities of ``text``, without the word lists."""
    anp = get_ANP(text)
    adj_density, noun_density, pronoun_density = anp[3], anp[4], anp[5]
    return adj_density, noun_density, pronoun_density  # C3

@switch_spacy_to_text
def get_capitalizations(spacy_doc):
    """
    Find the fully upper-case tokens of a doc.

    Returns:
        tuple: (list of token texts for which ``str.isupper()`` is true,
        that count divided by the total number of tokens). The ratio is 0.0 for a
        doc without tokens, mirroring the zero-token guard in ``get_ANP``.
    """
    capitalizations = [token.text for token in spacy_doc if token.text.isupper()]
    total_tokens = len(spacy_doc)
    if total_tokens == 0:
        # Empty text produces an empty doc; avoid ZeroDivisionError.
        return capitalizations, 0.0
    return capitalizations, len(capitalizations) / total_tokens

def get_capitalizations_clean(text):
    """C4: share of fully upper-case tokens in ``text``, rounded to two decimals."""
    upper_tokens, upper_ratio = get_capitalizations(text)
    rounded_ratio = np.round(upper_ratio, 2)  # C4
    return rounded_ratio

@switch_spacy_to_text
def get_sentiment(spacy_doc):
    """C5: TextBlob (polarity, subjectivity) of the doc's text, each rounded to three decimals."""
    sentiment = TextBlob(spacy_doc.text).sentiment
    polarity, subjectivity = sentiment
    rounded_polarity = np.round(polarity, 3)
    rounded_subjectivity = np.round(subjectivity, 3)
    return rounded_polarity, rounded_subjectivity  # C5

@switch_spacy_to_text
def calculate_rarity_scores(spacy_doc, lang='en'):
    """
    C6: median rarity of the nouns and of the adjectives in a doc.

    A word's rarity is ``-log(word_frequency(lowercased word, lang))``; a word whose
    frequency is 0 is scored 0.

    Returns:
        tuple: (median noun rarity, median adjective rarity), rounded to three
        decimals; a component stays 0 when the doc has no word of that class.
    """
    rarity_by_pos = {"NOUN": [], "ADJ": []}

    for token in spacy_doc:
        scores = rarity_by_pos.get(token.pos_)
        if scores is None:
            # Only nouns and adjectives contribute.
            continue
        freq = word_frequency(token.text.lower(), lang)
        scores.append(-math.log(freq) if freq > 0 else 0)

    # Nouns first, adjectives second.
    medians = [
        np.median(scores) if scores else 0
        for scores in (rarity_by_pos["NOUN"], rarity_by_pos["ADJ"])
    ]
    return tuple(np.round(medians, 3))

@switch_spacy_to_text
def get_punctuation(spacy_doc):
    """
    Find the punctuation tokens of a doc.

    Returns:
        tuple: (list of token texts whose coarse POS tag is "PUNCT",
        that count divided by the total number of tokens). The ratio is 0.0 for a
        doc without tokens, mirroring the zero-token guard in ``get_ANP``.
    """
    punctuation = [token.text for token in spacy_doc if token.pos_ == "PUNCT"]
    total_tokens = len(spacy_doc)
    if total_tokens == 0:
        # Empty text produces an empty doc; avoid ZeroDivisionError.
        return punctuation, 0.0
    return punctuation, len(punctuation) / total_tokens

def get_punctuation_clean(text):
    """C7: share of punctuation tokens in ``text``, rounded to two decimals."""
    punct_tokens, punct_share = get_punctuation(text)
    rounded_share = np.round(punct_share, 2)  # C7
    return rounded_share

# Dictionary of classical features: maps the integer ids listed under "C_ids" to the
# function that computes the feature from the raw tweet text.
C = {
    1: count_words,             # C1: Number of whitespace-separated words
    2: avg_word_length,         # C2: Average word length
    3: get_ANP_clean,           # C3: Densities of adjectives, nouns, and pronouns (in that order)
    4: get_capitalizations_clean,  # C4: Fraction of tokens that are fully upper-case
    5: get_sentiment,           # C5: Sentiment (polarity, subjectivity)
    6: calculate_rarity_scores, # C6: Median noun and adjective rarity scores (in that order)
    7: get_punctuation_clean    # C7: Fraction of tokens tagged as punctuation
}




# Model definitions

In [110]:
# Model descriptions
# Human-readable summary per model id; used as the wandb run notes and artifact description.
# NOTE(review): encode_with_tokens emits the feature tokens before the tweet tokens, which
# differs from the order written in these descriptions — confirm which one is intended.
model_descriptions = {
    1: 'BERT*(tweet)',
    2: 'BERT*({tweet <SEP> C1 <SEP> C2 ...})',
    3: 'BERT*({tweet <FEAT> C1 <FEAT> C2 ...})',
}

# Model-specific arguments (excluding hyperparameters). THESE ARE NOT MEANT TO BE SWEPT OVER, BUT JUST DEFINED.
# 'output_dir' given here is later overridden with "outputs/model_<id>" by initialize_model_with_special_tokens.
# NOTE(review): model 3 registers the literal string 'FEAT' as its special token, while its
# description writes '<FEAT>' — confirm the intended token text.
model_args_dict = {
    1: {'transformer_type': 'bert', 'transformer_name': 'bert-base-cased', 'output_dir': 'type1'},
    2: {'transformer_type': 'bert', 'transformer_name': 'bert-base-cased', 'output_dir': 'type2'},
    3: {'separator_token_name': 'FEAT', 'transformer_type': 'bert', 'transformer_name': 'bert-base-cased', 'output_dir': 'type3'},  # Additional arguments can be added here
    # Add arguments for models 4 to 9
}

# Define which hyperparameters are applicable to each model. THESE ARE MEANT TO BE ABLE TO BE SWEPT OVER. OVERRIDE DEFAULTS USING WANDB SWEEPS
# "C_ids" lists the keys of the feature dictionary C whose values are encoded alongside the tweet.
model_hyperparameters = {
    1: {"learning_rate": 1e-5, "train_batch_size": 16, "num_train_epochs": 3,},
    2: {"learning_rate": 1e-5, "train_batch_size": 16, "num_train_epochs": 3, "C_ids": [1,2]},
    3: {"learning_rate": 1e-5, "train_batch_size": 16, "num_train_epochs": 3, "C_ids": [3,4,5]},
}

# Initialize wandb


In [111]:
# Initialize wandb
# wandb is already imported by the setup cell; the repeated import is redundant but harmless.
import wandb
wandb.login()

# Set the wandb project name
# Passed to wandb.sweep and to train_model/evaluate_model further down.
WANDB_PROJECT_NAME = "6-861-finalproj"


# Prepare data

In [112]:



# all this thing outputs is the dictionary needed to append special tokens
def create_special_tokens_dict(model_id, model_args_dict):
    """
    Build the special-token dictionary for a model.

    A model gets one additional special token when its arguments define
    'separator_token_name' (the same condition encode_with_tokens_based_on_model_id
    uses to pick the feature separator).

    Args:
        model_id (int): The ID of the model to determine special tokens for.
        model_args_dict (dict): Either the full mapping {model_id: model args} or,
            for backward compatibility, the argument dict of a single model.

    Returns:
        dict: {'additional_special_tokens': [...]}; the list is empty when the model
        defines no separator token.
    """
    # The mapping is keyed by model id; looking up 'separator_token_name' directly on
    # it (as before) raised KeyError. Fall back to the dict itself when the caller
    # already passed one model's arguments.
    model_args = model_args_dict.get(model_id, model_args_dict)

    additional_tokens = []
    separator_token = model_args.get('separator_token_name')
    if separator_token is not None:
        additional_tokens.append(separator_token)
    return {'additional_special_tokens': additional_tokens}




def initialize_model_with_special_tokens (model_id, model_args_dict, hyperparameters, special_tokens_dict, model_descriptions):
    """
    Build a Simple Transformers ClassificationModel for ``model_id`` and register
    the extra special tokens on its tokenizer.

    Parameters:
        model_id (int): The ID of the model; used as key into the dictionaries below.
        model_args_dict (dict): Mapping model_id -> model-specific arguments. Must
            provide 'transformer_type' and 'transformer_name'; every entry is also
            copied onto the ClassificationArgs object.
        hyperparameters (dict): Mapping model_id -> hyperparameters (learning_rate,
            train_batch_size, ..., and non-training entries such as C_ids); every
            entry is copied onto the ClassificationArgs object.
        special_tokens_dict (dict): Dictionary of special tokens to add, in the form
            accepted by ``tokenizer.add_special_tokens``.
        model_descriptions (dict): Mapping model_id -> human-readable description.

    Returns:
        model: The initialized model.
        tokenizer: The model's tokenizer with the special tokens added.
    """

    # Initialize model arguments
    model_args = ClassificationArgs()
    model_args.preprocessed = True
    # model_args.dataset_format = "arrow"

    # hyperparam
    # Applied first, so a same-named model-specific argument or fixed setting below wins.
    for param, value in hyperparameters[model_id].items():
          setattr(model_args, param, value)

    # Apply model-specific args.
    for param, value in model_args_dict[model_id].items():
        # Convert any parameter keys if needed
        setattr(model_args, param, value)

    # for param in model_args_dict[model_id]:
    #     setattr(model_args, param, model_args_dict[model_id][param])


    model_args.model_description = model_descriptions[model_id]

    # Key settings for using HF datasets
    # NOTE(review): confirm that `preprocessed` and `preprocess_inputs` are options
    # ClassificationArgs actually honours; they are set here as plain attributes.
    model_args.reprocess_input_data = False
    model_args.preprocessed = True
    model_args.preprocess_inputs = False
    model_args.sliding_window = False
    model_args.use_hf_datasets = False
    model_args.manual_seed = random_state
    model_args.num_labels = 2
    model_args.overwrite_output_dir = True
    # Overrides any 'output_dir' copied from model_args_dict above.
    model_args.output_dir = f"outputs/model_{model_id}"



    model = ClassificationModel(model_args_dict[model_id]['transformer_type'],
                                model_args_dict[model_id]['transformer_name'],
                                args=model_args,
                                use_cuda=torch.cuda.is_available())
    tokenizer = model.tokenizer
    # The encoding helpers read tokenizer.model_max_length as the sequence budget.
    if model_args_dict[model_id]['transformer_type'] in ['bert', 'roberta']:
      tokenizer.model_max_length = 512


    # Add new special tokens
    # A dict such as {'additional_special_tokens': []} is still truthy, so this branch
    # also runs when there is nothing to add.
    if special_tokens_dict:
        tokenizer.add_special_tokens(special_tokens_dict)
        # Resize the embedding matrix to the tokenizer's current vocabulary size.
        model.model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer


def encode_with_tokens(
    tweet,
    tokenizer,
    feat_sep_token_id=None,
    C_functions=None,
    initial_token_id=None,
    max_feature_tokens=None,
    max_text_tokens=None
):
    """
    Encode a text and additional features with separation tokens, providing more flexible
    control over how much space features and text occupy.

    The produced layout is: initial token, then for every feature function a separator
    followed by the encoded feature value, then the encoded tweet, then one final
    separator. NOTE(review): no separator is inserted between the last feature and
    the tweet — confirm that this is the intended layout.

    Parameters:
        tweet (str): The main text to encode.
        tokenizer: The Hugging Face tokenizer to use for encoding.
        feat_sep_token_id (int): Token ID for the feature separator token. If None,
            defaults to tokenizer.sep_token_id, or tokenizer.eos_token_id when that is None.
        C_functions (list): List of feature functions that take the tweet as input and
            return a feature value (string, number, or tuple). Tuples are joined with spaces.
        initial_token_id (int): The initial token ID (e.g., CLS or BOS). If None,
            defaults to tokenizer.cls_token_id, or tokenizer.bos_token_id when that is None.
        max_feature_tokens (int): Maximum total number of tokens allocated to features
            (separators included). If None, no specific limit other than model_max_length
            is enforced.
        max_text_tokens (int): Maximum number of tokens allocated to the main text.
            If None, no specific limit other than model_max_length is enforced.

    Returns:
        list: The final encoded sequence of token IDs, at most tokenizer.model_max_length long.
    """
    # Test against None explicitly: 0 is a valid token id, and `a or b` would wrongly
    # discard it in favour of the fallback.
    if initial_token_id is None:
        initial_token_id = tokenizer.cls_token_id
        if initial_token_id is None:
            initial_token_id = tokenizer.bos_token_id

    if feat_sep_token_id is None:
        feat_sep_token_id = tokenizer.sep_token_id
        if feat_sep_token_id is None:
            feat_sep_token_id = tokenizer.eos_token_id

    model_max_length = tokenizer.model_max_length

    # Per-feature truncation length: three times the even share of the feature budget
    # (at least 1); without a budget only model_max_length applies.
    if C_functions and max_feature_tokens is not None:
        feature_max_length = int(max(1, (3 * max_feature_tokens) / len(C_functions)))
    else:
        feature_max_length = model_max_length

    # Encode features, each one preceded by the separator token
    feature_tokens = []
    for func in C_functions or []:
        feature_tokens.append(feat_sep_token_id)
        feature_value = func(tweet)
        if isinstance(feature_value, tuple):
            feature_text = ' '.join(map(str, feature_value))
        else:
            feature_text = str(feature_value)

        feature_tokens.extend(
            tokenizer.encode(
                feature_text,
                add_special_tokens=False,
                truncation=True,
                max_length=feature_max_length
            )
        )

    # Enforce the overall feature budget
    if max_feature_tokens is not None and len(feature_tokens) > max_feature_tokens:
        feature_tokens = feature_tokens[:max_feature_tokens]

    encoded_sequence = [initial_token_id] + feature_tokens

    # Space left for the text, reserving one token for the final separator
    available_for_text = model_max_length - len(encoded_sequence) - 1
    if max_text_tokens is not None:
        available_for_text = min(available_for_text, max_text_tokens)
    available_for_text = max(1, available_for_text)

    # The tweet is encoded with the tokenizer's own special tokens ...
    encoded_tweet = tokenizer.encode(
        tweet,
        add_special_tokens=True,
        truncation=True,
        max_length=available_for_text
    )

    # ... so drop its leading token when it duplicates the initial token already emitted.
    if encoded_tweet and encoded_tweet[0] == initial_token_id:
        encoded_tweet = encoded_tweet[1:]

    encoded_sequence.extend(encoded_tweet)

    # Final separator token at the end
    encoded_sequence.append(feat_sep_token_id)

    # Final safety truncation if the budget was still somehow exceeded
    return encoded_sequence[:model_max_length]


def encode_with_tokens_based_on_model_id(tweet, tokenizer, model_id, model_args_dict, model_hyperparameters):
    """
    Encode one tweet with the feature set and separator configured for ``model_id``.

    Parameters:
        tweet (str): The text to encode.
        tokenizer: Tokenizer of the model (special tokens already added).
        model_id (int): Key into ``model_args_dict`` and ``model_hyperparameters``.
        model_args_dict (dict): Mapping model_id -> model-specific arguments. An optional
            'separator_token_name' selects the feature separator token.
        model_hyperparameters (dict): Mapping model_id -> hyperparameters. 'C_ids' and
            'max_feature_tokens' are read from here first, then from the model arguments.

    Returns:
        list: Token ids produced by ``encode_with_tokens``.
    """
    model_args = model_args_dict[model_id]
    hyperparameters = model_hyperparameters[model_id]

    # The separator name lives in this model's arguments; reading it from the outer
    # mapping (as before) raised KeyError for every model that defines one.
    if 'separator_token_name' in model_args:
        feat_sep_token_id = tokenizer.convert_tokens_to_ids(model_args['separator_token_name'])
    else:
        feat_sep_token_id = None  # encode_with_tokens falls back to the tokenizer's sep/eos token

    # Hyperparameters take precedence over model arguments for the feature selection.
    if 'C_ids' in hyperparameters:
        C_functions = [C[i] for i in hyperparameters['C_ids']]
    elif 'C_ids' in model_args:
        C_functions = [C[i] for i in model_args['C_ids']]
    else:
        C_functions = None

    max_feature_tokens = hyperparameters.get(
        'max_feature_tokens', model_args.get('max_feature_tokens', None)
    )

    return encode_with_tokens(
        tweet,
        tokenizer,
        feat_sep_token_id=feat_sep_token_id,
        C_functions=C_functions,
        initial_token_id=None,
        max_feature_tokens=max_feature_tokens,
        max_text_tokens=None
    )

def prepare_hf_dataset(X, model_id, model_args_dict, model_hyperparameters, tokenizer):
    """
    Tokenize a dataframe with ['text', 'label'] columns into fixed-length model inputs.

    Every row is encoded with ``encode_with_tokens_based_on_model_id`` and then
    padded or truncated to ``tokenizer.model_max_length``.

    NOTE(review): the training run recorded in this notebook failed inside
    ``model.train_model`` — confirm that ClassificationModel accepts a pre-tokenized
    frame of this shape.

    Parameters:
        X (DataFrame): Frame with a 'text' column and an integer-like 'label' column.
        model_id (int): Model whose encoding configuration is used.
        model_args_dict (dict): Mapping model_id -> model-specific arguments.
        model_hyperparameters (dict): Mapping model_id -> hyperparameters.
        tokenizer: Tokenizer providing ``model_max_length`` and ``pad_token_id``.

    Returns:
        DataFrame: One row per input row with columns 'input_ids', 'attention_mask'
        (1 for real tokens, 0 for padding) and 'labels' (Python ints).
    """
    max_length = tokenizer.model_max_length
    pad_token_id = tokenizer.pad_token_id
    input_ids_list = []
    attention_masks = []
    labels_list = []

    for text, label in zip(X['text'], X['label']):
        encoded_ids = encode_with_tokens_based_on_model_id(text, tokenizer, model_id, model_args_dict, model_hyperparameters)

        # Truncate, then pad up to the fixed length
        encoded_ids = encoded_ids[:max_length]
        real_length = len(encoded_ids)
        padding_length = max_length - real_length

        # The mask is built from positions rather than token values, so a genuine
        # token that happens to equal the pad id is still attended to.
        input_ids_list.append(encoded_ids + [pad_token_id] * padding_length)
        attention_masks.append([1] * real_length + [0] * padding_length)
        labels_list.append(int(label))

    return pd.DataFrame({
        'input_ids': input_ids_list,
        'attention_mask': attention_masks,
        'labels': labels_list
    })



# Training loop

In [113]:

def train_model(X_train, X_val, model_id, model_hyperparameters, model_args_dict, wandb_project_name):
    """
    Train the model identified by ``model_id`` and log the run to wandb.

    Args:
        X_train (DataFrame): Training data with 'text' and 'label' columns.
        X_val (DataFrame): Validation data with 'text' and 'label' columns.
        model_id (int): The ID of the model to train.
        model_hyperparameters (dict): Mapping model_id -> hyperparameters.
        model_args_dict (dict): Mapping model_id -> model-specific arguments.
        wandb_project_name (str): wandb project used when a new run has to be started.

    Returns:
        tuple: (model, tokenizer) — the trained model and its tokenizer.
    """
    # Describe the run
    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    run_name = f"Model_{model_id}_{current_time}"
    config_stuff = {**model_args_dict[model_id], **model_hyperparameters[model_id]}
    tags = [
        f"model_{model_id}",
        f"date_{current_time.split('_')[0]}",
    ]
    # One tag per configuration entry
    tags.extend(f"{param}_{value}" for param, value in config_stuff.items())

    # Reuse the run opened by the caller (sweep_train initialises exactly one run per
    # trial); only start a new run when none is active.
    run = wandb.run
    if run is None:
        run = wandb.init(
            project=wandb_project_name,
            name=run_name,
            notes=model_descriptions[model_id],
            tags=tags,
            config=config_stuff
        )

    special_tokens_dict = create_special_tokens_dict(model_id, model_args_dict)
    model, tokenizer = initialize_model_with_special_tokens(model_id, model_args_dict, model_hyperparameters, special_tokens_dict, model_descriptions)

    train_dataset = prepare_hf_dataset(X_train, model_id, model_args_dict, model_hyperparameters, tokenizer)
    val_dataset = prepare_hf_dataset(X_val, model_id, model_args_dict, model_hyperparameters, tokenizer)

    # Log the model architecture
    wandb.watch(model.model, log='all')

    # Train the model
    # NOTE(review): the run recorded in this notebook failed inside this call — confirm
    # the input format expected by ClassificationModel.train_model.
    model.train_model(train_dataset, eval_data = val_dataset)

    # Evaluate the model on validation data
    result, model_outputs, wrong_predictions = model.eval_model(val_dataset)

    # Log evaluation metrics
    wandb.log(result)

    # Log an example input (first input ids)
    wandb.log({'example_input_ids': train_dataset.iloc[0]['input_ids']})

    # Log the model as an artifact with all necessary data for replication
    artifact = wandb.Artifact(
        name=f"model_{model_id}_{run.id}",
        type='model',
        description=model_descriptions[model_id],
        metadata={
            'model_id': model_id,
            'model_hyperparams': model_hyperparameters[model_id],
            'model_args': model_args_dict[model_id],
            'random_state': random_state,
        }
    )
    # The output directory is a property of the model's arguments; `model_args_dict`
    # is a plain dict keyed by model id and has no `output_dir` attribute.
    artifact.add_dir(model.args.output_dir)
    run.log_artifact(artifact)

    # The run is left open on purpose so the caller can keep logging to it.
    return model, tokenizer


# Evaluation


In [114]:
def evaluate_model(X_test, model_id, model, tokenizer, model_hyperparameters, model_args_dict, wandb_project_name):
    """
    Evaluate a trained model on test data and log the outcome to wandb.

    Args:
        X_test (DataFrame): Test data with 'text' and 'label' columns.
        model_id (int): ID of the model; selects the encoding configuration.
        model: The trained model.
        tokenizer: Tokenizer belonging to ``model``.
        model_hyperparameters (dict): Mapping model_id -> hyperparameters.
        model_args_dict (dict): Mapping model_id -> model-specific arguments.
        wandb_project_name (str): Accepted for signature symmetry; not used here.

    Returns:
        None
    """
    # Tokenize the test split the same way as the training data
    test_dataset = prepare_hf_dataset(X_test, model_id, model_args_dict, model_hyperparameters, tokenizer)

    # Run the evaluation and push its metrics to wandb
    result, model_outputs, wrong_predictions = model.eval_model(test_dataset)
    wandb.log(result)

    # Predicted class = highest-scoring output column
    predictions = np.argmax(model_outputs, axis=1)
    true_labels = np.array(test_dataset['labels'])
    cm = confusion_matrix(true_labels, predictions)

    # Draw the confusion matrix on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(6,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')

    # Log the current figure, then release it
    wandb.log({"confusion_matrix": wandb.Image(plt)})
    plt.close()

In [None]:
# model, tokenizer = train_model(
#         X_train_all,
#         X_val_all,
#         chosen_model_id,
#         model_hyperparameters,
#         model_args_dict,
#         WANDB_PROJECT_NAME
#     )

# Sweep config and training function

In [115]:
def sweep_train():
    """
    Entry point executed by the wandb sweep agent for one trial.

    Reads the trial's configuration, overlays it on a private copy of the default
    hyperparameters, trains the chosen model, evaluates it on the test split and
    finishes the wandb run.
    """
    import copy

    # Initialize the run for this set of hyperparameters
    run = wandb.init()  # Do not remove this, this is the only init call

    config = wandb.config
    chosen_model_id = config.get('model_id', 1)

    # Work on a copy: writing into the module-level defaults would leak one trial's
    # values (e.g. C_ids) into every later trial and notebook cell.
    run_hyperparameters = copy.deepcopy(model_hyperparameters)

    # Overlay the swept values that are present in this trial's config
    for name in ('learning_rate', 'train_batch_size', 'num_train_epochs', 'C_ids'):
        if name in config:
            run_hyperparameters[chosen_model_id][name] = config[name]

    # Train the model
    model, tokenizer = train_model(
        X_train_all,
        X_val_all,
        chosen_model_id,
        run_hyperparameters,
        model_args_dict,
        WANDB_PROJECT_NAME
    )

    # Evaluate the model
    evaluate_model(
        X_test_all,
        chosen_model_id,
        model,
        tokenizer,
        run_hyperparameters,
        model_args_dict,
        WANDB_PROJECT_NAME
    )

    # Finish the run once everything is done
    wandb.finish()



In [116]:
# Grid sweep definition: 1 model x 3 learning rates x 2 batch sizes x 2 feature sets = 12 combinations.
# 'num_train_epochs' is not swept, so the default from model_hyperparameters applies.
sweep_config = {
    'method': 'grid',
    # NOTE(review): 'eval_loss' must match a key of the result dict logged via wandb.log(result) — confirm.
    'metric': {'name': 'eval_loss', 'goal': 'minimize'},
    'parameters': {
        'model_id': {
            'values': [2]
        },
        'learning_rate': {
            'values': [1e-5, 3e-5, 5e-5]
        },
        'train_batch_size': {
            'values': [16, 32]
        },
        # Each value is a list of keys into the feature dictionary C.
        'C_ids': {
            'values': [[1,2,3], [1,2,3,4,5,6,7]]
        }
    }
}


In [117]:
# Register the sweep, then run the agent in this process.
sweep_id = wandb.sweep(sweep_config, project=WANDB_PROJECT_NAME)
# count=1 limits the agent to a single trial of the grid.
wandb.agent(sweep_id, sweep_train, count= 1 )

Create sweep with ID: tv5336b2
Sweep URL: https://wandb.ai/rskewes-harvard-university/6-861-finalproj/sweeps/tv5336b2


[34m[1mwandb[0m: Agent Starting Run: ql5y35xg with config:
[34m[1mwandb[0m: 	C_ids: [1, 2, 3]
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	model_id: 2
[34m[1mwandb[0m: 	train_batch_size: 16




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  data = torch.load(cached_features_file)


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 3:   0%|          | 0/1295 [00:00<?, ?it/s]

  with amp.autocast():


Run ql5y35xg errored:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "<ipython-input-115-7f700197e6a1>", line 19, in sweep_train
    model, tokenizer = train_model(
  File "<ipython-input-113-5452b898e201>", line 47, in train_model
    model.train_model(train_dataset, eval_data = val_dataset)
  File "/usr/local/lib/python3.10/dist-packages/simpletransformers/classification/classification_model.py", line 630, in train_model
    global_step, training_details = self.train(
  File "/usr/local/lib/python3.10/dist-packages/simpletransformers/classification/classification_model.py", line 906, in train
    loss, *_ = self._calculate_loss(
  File "/usr/local/lib/python3.10/dist-packages/simpletransformers/classification/classification_model.py", line 2340, in _calculate_loss
    outputs = model(**inputs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line