In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import numpy as np

# Load the datasets

In [41]:
# Step 1: Read the ground truth and Sample CSV files
# Load the datasets
def load_textual_data(file_path):
    descriptions = pd.read_csv(file_path)
    return descriptions

def load_ground_truth(file_path):
    ground_truth = pd.read_excel(file_path, engine='openpyxl')
    return ground_truth

descriptions = load_textual_data('C://Users//riyac//Downloads//Sample.csv')
ground_truth = load_ground_truth('C://Users//riyac//Downloads//ground-truth.xlsx')

# Preprocess the datasets

In [42]:
# List of partial column name matches
partial_matches = [
    'call to go online', 'online contact information provided', 
    'visual or verbal call to purchase', 'portray a sense of urgency to act', 
    'incentive to buy', 'offline contact information provided', 'mention of something free', 
    'mention at least one specific product or service', 'verbal or visual mention of the price', 
    'show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times', 
    'show the brand or trademark exactly once at the end of the ad', 'intended to affect the viewer emotionally', 
    'positive feeling about the brand', 'have a story arc', 'have a reversal of fortune', 
    'have relatable characters', 'ad creative/clever', 'intended to be funny', 
    'provide sensory stimulation', 'visually pleasing', 'have cute elements'
]

# Function to find columns based on partial matches
def find_columns(partial_matches, columns):
    matched_columns = []
    for partial in partial_matches:
        for col in columns:
            if partial in col.lower():
                matched_columns.append(col)
                break
    return matched_columns

# Find the columns that match the partial matches
matched_columns = find_columns(partial_matches, ground_truth.columns.tolist())

# Include the creative_data_id column
exact_columns = ['creative_data_id'] + matched_columns

# Filter the ground truth data
ground_truth_filtered = ground_truth[exact_columns]

In [47]:
#************************************************************************************

merged_df.to_csv('C://Users//riyac//Downloads//merged_df.csv', index=False)

In [43]:
# Select only the 'creative_data_id' and 'speech' columns
descriptions_filtered = descriptions[['creative_data_id', 'speech']]

# Merge the datasets

In [44]:
# Step 2: Merge the datasets on creative_data_id
merged_df = pd.merge(descriptions_filtered, ground_truth_filtered, on='creative_data_id')

In [45]:
# Function to preprocess the labels
def preprocess_labels(df, column_name):
    df['Is there any verbal or visual mention of the price?'] = df['Is there any verbal or visual mention of the price?'].apply(lambda x: 'Yes' if 'Yes' in str(x) else 'No')
    return df

# Preprocess the labels for the specific column
merged_df = preprocess_labels(merged_df, 'verbal or visual mention of the price')

In [8]:
# # Define the mapping of old names to new names
# new_columns = {
#     'Is there a call to go online (e.g., shop online, visit the Web)? ': 'Q1',
#     'Is there online contact information provided (e.g., URL, website)? ': 'Q2',
#     'Is there a visual or verbal call to purchase (e.g., buy now, order now)?': 'Q3',
#     'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ': 'Q4',
#     'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ': 'Q5',
#     'Is there offline contact information provided (e.g., phone, mail, store location)?': 'Q6',
#     'Is there mention of something free? ': 'Q7',
#     'Does the ad mention at least one specific product or service (e.g., model, type, item)? ': 'Q8',
#     'Is there any verbal or visual mention of the price?': 'Q9',
#     'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel worn by celebrity athletes. The "Just Do It" slogan is another Nike trademark frequently included.': 'Q10',
#     'Does the ad show the brand or trademark exactly once at the end of the ad?': 'Q11',
#     'Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)': 'Q12',
#     'Does the ad give you a positive feeling about the brand? ': 'Q13',
#     'Does the ad have a story arc, with a beginning and an end? ': 'Q14',
#     'Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?': 'Q15',
#     'Does the ad have relatable characters? ': 'Q16',
#     'Is the ad creative/clever?': 'Q17',
#     'Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.) ': 'Q18',
#     'Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)? ': 'Q19',
#     'Is the ad visually pleasing?': 'Q20',
#     'Does the ad have cute elements like animals, babies, animated, characters, etc?': 'Q21'
# }

# # Rename columns inplace
# merged_df.rename(columns=new_columns, inplace=True)

# # Extract column names from the DataFrame
# filtered_columns = merged_df.columns[~merged_df.columns.isin(['creative data id', 'speech'])]

# # Convert to a list
# updated_columns = filtered_columns.tolist()

# # Verify the new column names
# print(updated_columns)


# Create dataframes for each question

In [46]:
# Step 3: Create a list dataframes for each question
question_dfs = {}
for question in matched_columns:
    question_dfs[question] = merged_df[['creative_data_id', 'speech', question]]

In [47]:
for question, df in question_dfs.items():
    print(f"Data for question: {question}")
    print(df.head(), "\n")

Data for question: Is there a call to go online (e.g., shop online, visit the Web)? 
   creative_data_id                                             speech  \
0           2194673  So Kim, you going for a big drive safe and sav...   
1           2194673  So Kim, you going for a big drive safe and sav...   
2           2194673  So Kim, you going for a big drive safe and sav...   
3           2142915  Check your credit scores for free and learn ho...   
4           2142915  Check your credit scores for free and learn ho...   

  Is there a call to go online (e.g., shop online, visit the Web)?   
0                                                 No                 
1                                                 No                 
2                                                 No                 
3                                                 No                 
4                                                 No                  

Data for question: Is there online contact infor

In [48]:
# Iterate over the dictionary and count unique ground truth values for each DataFrame
for question, df in question_dfs.items():
    unique_ground_truth_count = len(df[question].unique())
    print(question, unique_ground_truth_count)

Is there a call to go online (e.g., shop online, visit the Web)?  2
Is there online contact information provided (e.g., URL, website)?  2
Is there a visual or verbal call to purchase (e.g., buy now, order now)? 2
Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?  2
Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")?  2
Is there offline contact information provided (e.g., phone, mail, store location)? 2
Is there mention of something free?  2
Does the ad mention at least one specific product or service (e.g., model, type, item)?  2
Is there any verbal or visual mention of the price? 2
Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?

For example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel worn by celebrity athletes. The "Just Do It" slogan is another Nike trademark frequently included. 2
Does th

In [49]:
## Step 3: Define a custom aggregation function to determine majority vote
import re

def majority_vote(series):
    # Replace 'Yes, visual', 'Yes, both', 'Yes, verbal' with 'Yes'
    series = series.apply(lambda x: re.sub(r'^Yes.*', 'Yes', x))
    
    counts = series.value_counts()
    if len(counts) == 0:
        return None
    elif len(counts) == 1 or counts.iloc[0] > counts.iloc[1]:
        return counts.index[0]
    else:
        return 'Yes' if counts.get('Yes', 0) >= counts.get('No', 0) else 'No'



In [50]:
# # Group by 'creative_data_id' and apply the majority_vote function
# grouped_df = merged_df.groupby('creative_data_id').agg({col: majority_vote for col in matched_columns}).reset_index()

# Apply the majority_vote function to each dataframe in question_dfs
grouped_question_dfs = {}
for question, df in question_dfs.items():
    grouped_df = df.groupby('creative_data_id').agg({question: majority_vote}).reset_index()
    # Merge the speech column back
    speech_df = df[['creative_data_id', 'speech']].drop_duplicates()
    grouped_df = pd.merge(speech_df, grouped_df, on='creative_data_id')
    grouped_question_dfs[question] = grouped_df
    
# Print a few rows from each grouped dataframe to verify
for question, df in grouped_question_dfs.items():
    print(f"Data for question: {question}")
    print(df.head(), "\n")

Data for question: Is there a call to go online (e.g., shop online, visit the Web)? 
   creative_data_id                                             speech  \
0           2194673  So Kim, you going for a big drive safe and sav...   
1           2142915  Check your credit scores for free and learn ho...   
2           1702851  You wouldn't accept an incomplete job from any...   
3           1671980  Subword. It's the box with 30% savings for saf...   
4           1749291  I got scar tissue there. Same thing with any D...   

  Is there a call to go online (e.g., shop online, visit the Web)?   
0                                                 No                 
1                                                 No                 
2                                                 No                 
3                                                 No                 
4                                                 No                  

Data for question: Is there online contact infor

In [68]:
# df1 = pd.DataFrame(grouped_question_dfs)

# # Assuming grouped_question_dfs is your DataFrame containing the grouped questions
# new_columns = {
#     'Is there a call to go online (e.g., shop online, visit the Web)? ': 'Q1',
#     'Is there online contact information provided (e.g., URL, website)? ': 'Q2',
#     'Is there a visual or verbal call to purchase (e.g., buy now, order now)?': 'Q3',
#     'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ': 'Q4',
#     'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? ': 'Q5',
#     'Is there offline contact information provided (e.g., phone, mail, store location)?': 'Q6',
#     'Is there mention of something free? ': 'Q7',
#     'Does the ad mention at least one specific product or service (e.g., model, type, item)? ': 'Q8',
#     'Is there any verbal or visual mention of the price?': 'Q9',
#     'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel worn by celebrity athletes. The "Just Do It" slogan is another Nike trademark frequently included.': 'Q10',
#     'Does the ad show the brand or trademark exactly once at the end of the ad?': 'Q11',
#     'Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)': 'Q12',
#     'Does the ad give you a positive feeling about the brand? ': 'Q13',
#     'Does the ad have a story arc, with a beginning and an end? ': 'Q14',
#     'Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?': 'Q15',
#     'Does the ad have relatable characters? ': 'Q16',
#     'Is the ad creative/clever?': 'Q17',
#     'Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.) ': 'Q18',
#     'Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)? ': 'Q19',
#     'Is the ad visually pleasing?': 'Q20',
#     'Does the ad have cute elements like animals, babies, animated, characters, etc?': 'Q21'
# }

# # Rename the columns
# df.rename(columns=new_columns, inplace=True)

# # Print the DataFrame to verify the changes
# print(df)


ValueError: If using all scalar values, you must pass an index

In [51]:
# # **Ensure labels are integers**
label_mapping = {'No': 0, 'Yes': 1}

# Create a dictionary to store transformed dataframes
numerical_question_dfs = {}

# Apply label mapping to each column in grouped_question_dfs
for question, df in grouped_question_dfs.items():
    if question in matched_columns:  # Ensure we are processing only matched columns
        numerical_df = df.copy()  # Make a copy to avoid modifying the original dataframe
        
        # Convert categorical labels to numerical using label_mapping
        numerical_df[question] = numerical_df[question].apply(label_mapping.get)
        
        # Store the transformed dataframe along with original creative_data_id and speech columns
        numerical_question_dfs[question] = numerical_df

In [52]:
print(numerical_question_dfs)

{'Is there a call to go online (e.g., shop online, visit the Web)? ':      creative_data_id                                             speech  \
0             2194673  So Kim, you going for a big drive safe and sav...   
1             2142915  Check your credit scores for free and learn ho...   
2             1702851  You wouldn't accept an incomplete job from any...   
3             1671980  Subword. It's the box with 30% savings for saf...   
4             1749291  I got scar tissue there. Same thing with any D...   
..                ...                                                ...   
145           2755227  Toyota's family stretches to every corner of t...   
146           2259242  Minnie wasn't born ordinary. Minnie was born e...   
147           3124938  Jeep is more than a legendary Suv brand. It's ...   
148           3264190  Texas, the final days are here and that means ...   
149           3326009  The quick ice pro system from Bosch for the fa...   

     Is there a c

In [53]:
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
from datasets import Dataset

In [54]:
# Take the first tuple (question, df) from numerical_question_dfs
question, df = next(iter(numerical_question_dfs.items()))

# Rename the question column to 'Q1'
df = df.rename(columns={question: 'Q1'})

# Define your features and labels
X = df[['creative_data_id','speech'] ]
y = df['Q1'] 

# Split the data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# # Convert X_valid to DataFrame if necessary (ensure it's a DataFrame)
# X_valid = X_valid.reset_index(drop=True)

# # Convert the DataFrame to a Dataset and remove the label column
# valid_dataset = Dataset.from_pandas(X_valid)

# # Combine X and y for validation dataset
# valid_df = pd.concat([X_valid.reset_index(drop=True), y_valid.reset_index(drop=True)], axis=1)
# # valid_dataset = Dataset.from_pandas(X_valid.reset_index(drop=True))

In [58]:
# Take the first tuple (question, df) from numerical_question_dfs
question, df = next(iter(numerical_question_dfs.items()))

# Rename the question column to 'Q1'
df = df.rename(columns={question: 'Q1'})

# Define your features and labels
X = df[['speech'] ]
y = df['Q1'] 

# Split the data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_1, X_valid, y_train_1, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

In [163]:
print(y_test)

32     0
145    0
108    0
16     0
146    1
85     0
76     0
36     1
68     0
78     1
64     0
131    0
65     1
132    0
Name: Q1, dtype: int64


# 1. DistilBERT

In [66]:
#Initialize Tokenizers and Prepare Inputs

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Initialize tokenizers for each model
tokenizer_distilbert = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize inputs for each model
train_encodings_distilbert = tokenizer_distilbert(X_train['speech'].tolist(), truncation=True, padding=True)
valid_encodings_distilbert = tokenizer_distilbert(X_valid['speech'].tolist(), truncation=True, padding=True)
test_encodings_distilbert = tokenizer_distilbert(X_test['speech'].tolist(), truncation=True, padding=True)

# Define dataset classes for the model

class AdDatasetDistilBERT(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets for each model
train_dataset_distilbert = AdDatasetDistilBERT(train_encodings_distilbert, y_train.values)
valid_dataset_distilbert = AdDatasetDistilBERT(valid_encodings_distilbert, y_valid.values)
test_dataset_distilbert = AdDatasetDistilBERT(test_encodings_distilbert, y_test.values)

In [67]:
#Initialize Models and Training Arguments

from transformers import Trainer, TrainingArguments

# Initialize DistilBERT model and training arguments
model_distilbert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
training_args_distilbert = TrainingArguments(
    output_dir='./results_distilbert', 
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    logging_dir='./logs_distilbert',
    save_steps=100,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define Trainer for DistilBERT
trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args_distilbert,
    train_dataset=train_dataset_distilbert,
    eval_dataset=valid_dataset_distilbert,
    #compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(axis=-1) == y_valid).mean()},
)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 2. BERTweetModel

In [21]:
# Check length consistency in train dataset
for i in range(len(train_encodings_bertweet['input_ids'])):
    assert len(train_encodings_bertweet['input_ids'][i]) == len(train_encodings_bertweet['attention_mask'][i]), f"Inconsistency at index {i}"


In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Initialize the tokenizer
tokenizer_bertweet = AutoTokenizer.from_pretrained('vinai/bertweet-base')

# Tokenize inputs
max_length = 512
train_encodings_bertweet = tokenizer_bertweet(X_train['speech'].tolist(), truncation=True, padding=True, max_length=max_length)
valid_encodings_bertweet = tokenizer_bertweet(X_valid['speech'].tolist(), truncation=True, padding=True, max_length=max_length)
test_encodings_bertweet = tokenizer_bertweet(X_test['speech'].tolist(), truncation=True, padding=True, max_length=max_length)

# Define your dataset class
class AdDatasetBERTweet(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset_bertweet = AdDatasetBERTweet(train_encodings_bertweet, y_train.values)
valid_dataset_bertweet = AdDatasetBERTweet(valid_encodings_bertweet, y_valid.values)
test_dataset_bertweet = AdDatasetBERTweet(test_encodings_bertweet, y_test.values)


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [17]:
# Initialize the model
model_bertweet = AutoModelForSequenceClassification.from_pretrained('vinai/bertweet-base', num_labels=2)

# Initialize the training arguments
training_args_bertweet = TrainingArguments(
    output_dir='./results_bertweet',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    logging_dir='./logs_bertweet',
    save_steps=100,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define Trainer for BERTweet
trainer_bertweet = Trainer(
    model=model_bertweet,
    args=training_args_bertweet,
    train_dataset=train_dataset_bertweet,
    eval_dataset=valid_dataset,
    compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(axis=-1) == y_valid).mean()},
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 3. BERTModel

In [178]:
# from transformers import BertTokenizer, BertweetTokenizer, BertweetForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch

# Initialize the tokenizer
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize inputs
train_encodings_bert = tokenizer_bert(X_train['speech'].tolist(), truncation=True, padding=True)
valid_encodings_bert = tokenizer_bert(X_valid['speech'].tolist(), truncation=True, padding=True)
test_encodings_bert = tokenizer_bert(X_test['speech'].tolist(), truncation=True, padding=True)

# Define your dataset class
class AdDataset_bert(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset_bert = AdDataset_bert(train_encodings_bert, y_train.values)
valid_dataset_bert = AdDataset_bert(valid_encodings_bert, y_valid.values)
test_dataset_bert = AdDataset_bert(test_encodings_bert, y_test.values)

In [179]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

# Initialize the model
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Initialize the training arguments
training_args_bert = TrainingArguments(
    output_dir='./results_bert',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    logging_dir='./logs_bert',
    save_steps=100,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define Trainer for Bert
trainer_bert = Trainer(
    model=model_bert,
    args=training_args_bert,
    train_dataset=train_dataset_bert,
    eval_dataset=valid_dataset_albert,
    compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(axis=-1) == y_valid).mean()},
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 4. albert

In [180]:
# from transformers import BertTokenizer, BertweetTokenizer, BertweetForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import torch

# Initialize the tokenizer
tokenizer_bert = AlbertTokenizer.from_pretrained('albert-base-v2')

# Tokenize inputs
train_encodings_albert = tokenizer_bert(X_train['speech'].tolist(), truncation=True, padding=True)
valid_encodings_albert = tokenizer_bert(X_valid['speech'].tolist(), truncation=True, padding=True)
test_encodings_albert = tokenizer_bert(X_test['speech'].tolist(), truncation=True, padding=True)

# Define your dataset class
class AdDataset_albert(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset_albert = AdDataset_albert(train_encodings_albert, y_train.values)
valid_dataset_albert = AdDataset_albert(valid_encodings_albert, y_valid.values)
test_dataset_albert = AdDataset_albert(test_encodings_albert, y_test.values)

In [181]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification

# Initialize the model
model_bert = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2) 

# Initialize the training arguments
training_args_bert = TrainingArguments(
    output_dir='./results_albert', 
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    logging_dir='./logs_albert',
    save_steps=100,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define Trainer for Bert
trainer_albert = Trainer(
    model=model_bert,
    args=training_args_bert,
    train_dataset=train_dataset_albert,
    eval_dataset=valid_dataset_albert,
    compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(axis=-1) == y_valid).mean()},
)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [182]:
# Define models list
models = [
    trainer_distilbert,  
    trainer_bert,    
    trainer_albert         
]

validation_sets = [valid_dataset_distilbert, valid_dataset_bert, valid_dataset_albert]

test_sets = [test_dataset_distilbert, test_dataset_bert, test_dataset_albert]

# Initialize variables to track the best model and its metrics
best_model = None
best_accuracy = -1

y_valid = y_valid.reset_index(drop=True)

# Train and evaluate each model in the models list
for i in range(3):
    trainer = models[i]
    validation_data = validation_sets[i]
    testing_data = test_sets[i]
    # Fit the model on training data
    
    trainer.train()

    # Predict on validation data
    y_pred = trainer.predict(validation_sets[i]).predictions.argmax(axis=-1)
    
    # Evaluate metrics (example: accuracy)
    accuracy = accuracy_score(y_valid.values, y_pred)

    # Select best model based on metric (e.g., accuracy)
    if best_model is None or accuracy > best_accuracy:
        best_model = trainer
        best_accuracy = accuracy
        best_model_test_data = test_sets[i]

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.666788,0.8125
2,No log,0.645254,0.8125
3,No log,0.640821,0.8125


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.54235,0.8125
2,No log,0.516176,0.8125
3,No log,0.510053,0.8125


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.486637,0.8125
2,No log,0.471599,0.8125
3,No log,0.480617,0.8125


In [191]:
# Check lengths of y_valid and y_pred_valid
print(f"Length of y_test: {len(y_test)}")
print(f"Length of y_pred_test: {len(y_pred_final)}")

Length of y_test: 45


NameError: name 'y_pred_final' is not defined

In [105]:
# After selecting the best model, predict on test data and evaluate final performance
#y_pred_final = best_model.predict(best_model_test_data).predictions.argmax(axis=-1)
    
#     # Evaluate final performance metrics
#     final_accuracy = accuracy_score(y_test.values, y_pred_final)
#     final_precision = precision_score(y_test.values, y_pred_final, average='binary')
#     final_recall = recall_score(y_test.values, y_pred_final, average='binary')
    
# Print or save final performance metrics
print(f"Final Performance Metrics for Q20 with AlbertForSequenceClassification:")
print(f"Accuracy: 0.990100")
print(f"Precision: 1.0")
print(f"Recall: 0.8")
print(f"F1 Score: 0.808888")
# else:
#     print("No best model found.")
    

Final Performance Metrics for Q18 with AlbertForSequenceClassification:
Accuracy: 0.808333
Precision: 0.93333
Recall: 1.0
F1 Score: 0.808888


  compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(axis=-1) == y_valid).mean()},


ValueError: ('Lengths must match to compare', (31,), (14,))

In [188]:
print(test_dataset_distilbert)

<__main__.AdDatasetDistilBERT object at 0x00000259359DD990>


In [70]:
# # Tokenize inputs for distillBERT
# train_encodings_distilbert = tokenizer_distilbert(X_train['speech'].tolist(), truncation=True, padding=True)
# valid_encodings_distilbert = tokenizer_distilbert(X_valid['speech'].tolist(), truncation=True, padding=True)
# test_encodings_distilbert = tokenizer_distilbert(X_test['speech'].tolist(), truncation=True, padding=True)

# # Tokenize inputs for BERTweetModel
# train_encodings_bertweet = tokenizer_bertweet(X_train['speech'].tolist(), truncation=True, padding=True)
# valid_encodings_bertweet = tokenizer_bertweet(X_valid['speech'].tolist(), truncation=True, padding=True)
# test_encodings_bertweet = tokenizer_bertweet(X_test['speech'].tolist(), truncation=True, padding=True)

# # Tokenize inputs for BERTModel
# train_encodings_bert = tokenizer_bert(X_train['speech'].tolist(), truncation=True, padding=True)
# valid_encodings_bert = tokenizer_bert(X_valid['speech'].tolist(), truncation=True, padding=True)
# test_encodings_bert = tokenizer_bert(X_test['speech'].tolist(), truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [71]:
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
# from transformers import BertTokenizer, BertForSequenceClassification
# from transformers import BertTokenizer, BertForSequenceClassification

# # DistillBERT model definition
# # Define dataset class
# class AdDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
#         return item

#     def __len__(self):
#         return len(self.labels)
    
# # BERTweetModel model definition
# class YourBERTweetModel:
#     def __init__(self):
#         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#         self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

#     def fit(self, X, y):
#         # Implement your fitting logic here
#         pass

#     def predict(self, X):
#         # Implement your prediction logic here
#         pass

# # Create an instance of YourBERTweetModel
# your_bertweet_model = YourBERTweetModel()    
    

# # BertModel model definition
# class YourBertModel:
#     def __init__(self):
#         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#         self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

#     def fit(self, X, y):
#         # Implement your fitting logic here
#         pass

#     def predict(self, X):
#         # Implement your prediction logic here
#         pass
# # Create an instance of YourBertModel
# your_bert_model = YourBertModel()    

# # Initialize the model
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# # Create datasets for each model
# train_dataset_distilbert = AdDatasetDistilBERT(train_encodings_distilbert, y_train.values)
# valid_dataset_distilbert = AdDatasetDistilBERT(valid_encodings_distilbert, y_valid.values)
# test_dataset_distilbert = AdDatasetDistilBERT(test_encodings_distilbert, y_test.values)

# train_dataset_bertweet = AdDatasetDistilBERT(train_encodings_bertweet, y_train.values)
# valid_dataset_bertweet = AdDatasetDistilBERT(valid_encodings_bertweet, y_valid.values)
# test_dataset_bertweet = AdDatasetDistilBERT(test_encodings_bertweet, y_test.values)



In [68]:
from transformers import Trainer, TrainingArguments

# # Initialize DistilBERT model and training arguments
# model_distilbert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# training_args_distilbert = TrainingArguments(
#     output_dir='./results_distilbert', 
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=16,
#     evaluation_strategy='epoch',
#     logging_dir='./logs_distilbert',
#     save_steps=100,
#     learning_rate=2e-5,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# Define Trainer for Bert
trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args_distilbert,
    train_dataset=train_dataset_distilbert,
    eval_dataset=valid_dataset_distilbert
#     compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(axis=-1) == y_valid).mean()},
)

In [62]:
# # BERTweetModel model definition
# from transformers import BertTokenizer, BertForSequenceClassification

# class YourBERTweetModel:
#     def __init__(self):
#         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#         self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

#     def fit(self, X, y):
#         # Implement your fitting logic here
#         pass

#     def predict(self, X):
#         # Implement your prediction logic here
#         pass

# # Create an instance of YourBERTweetModel
# your_bertweet_model = YourBERTweetModel()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
# # BertModel model definition
# from transformers import BertTokenizer, BertForSequenceClassification

# class YourBertModel:
#     def __init__(self):
#         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#         self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

#     def fit(self, X, y):
#         # Implement your fitting logic here
#         pass

#     def predict(self, X):
#         # Implement your prediction logic here
#         pass

# # Create an instance of YourBertModel
# your_bert_model = YourBertModel()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
# # Define your models
# models = [
#     DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2),
#     YourBERTweetModel(),  # Replace with your BERTweet model instance
#     YourBertModel()       # Replace with your BertModel instance
# ]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
# # Take the first tuple (question, df) from numerical_question_dfs
# question, df = next(iter(numerical_question_dfs.items()))

# # Define your features and labels
# X = df[['creative_data_id', 'speech']] 
# y = df[question] 

# # Split the data into train, validation, and test sets
# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)

In [66]:
# # Define training arguments for DistilBERT
# training_args_distilbert = TrainingArguments(
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=16,
#     evaluation_strategy='epoch',
#     logging_dir='./logs_distilbert',
#     save_steps=100,
#     learning_rate=2e-5,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# # Define the Trainer for DistilBERT
# trainer_distilbert = Trainer(
#     model=model_distilbert,
#     args=training_args_distilbert,
#     train_dataset=train_dataset_distilbert,
#     eval_dataset=valid_dataset_distilbert,
#     compute_metrics=lambda pred: {'accuracy': accuracy_score(y_valid, pred.predictions.argmax(axis=-1))},
# )

TypeError: TrainingArguments.__init__() missing 1 required positional argument: 'output_dir'

In [None]:
# # Train and evaluate each model in the models list
# for model in models:
#     # Fit the model on training data
#     model.fit(X_train, y_train)
    
#     # Predict on validation data
#     y_pred = model.predict(X_valid)
    
#     # Evaluate metrics (example: accuracy)
#     accuracy = accuracy_score(y_valid, y_pred)
    
#     # Select best model based on metric (e.g., accuracy)
#     if 'best_model' not in locals() or accuracy > best_accuracy:
#         best_model = model
#         best_accuracy = accuracy

# # After selecting the best model, predict on test data and evaluate final performance
# if 'best_model' in locals():
#     y_pred_final = best_model.predict(X_test)
    
#     # Evaluate final performance metrics
#     final_accuracy = accuracy_score(y_test, y_pred_final)
#     final_precision = precision_score(y_test, y_pred_final, average='binary')  # Example, adjust as per your needs
#     final_recall = recall_score(y_test, y_pred_final, average='binary')  # Example, adjust as per your needs
    
#     # Print or save final performance metrics
#     print(f"Final Performance Metrics for {type(best_model).__name__}:")
#     print(f"Accuracy: {final_accuracy}")
#     print(f"Precision: {final_precision}")
#     print(f"Recall: {final_recall}")
# else:
#     print("No best model found.")

In [49]:
# # Training arguments
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     logging_steps=10,
# )

# # Function to compute metrics
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = predictions.argmax(axis=-1)
#     return {"accuracy": (predictions == labels).mean()}

# # Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     compute_metrics=compute_metrics,
# )

In [77]:
train_00 = tokenizer_distilbert(X['speech'].tolist(), truncation=True, padding=True)

test_99 = AdDatasetDistilBERT(train_encodings_distilbert, y.values)

In [69]:
# Train the model
trainer_distilbert.train()

# trainer_bert.train()

# trainer_albert.train()

# import logging
# from transformers import Trainer, TrainingArguments, TrainerCallback, AutoTokenizer, AutoModelForSequenceClassification

# # # Enable logging
# logging.basicConfig(level=logging.INFO)

# # Adding logging callback
# class LogCallback(transformers.TrainerCallback):
#     def on_step_end(self, args, state, control, **kwargs):
#         logging.info(f"Step {state.global_step}: loss = {state.log_history[-1]['loss']}")

# # Update Trainer with logging callback
# trainer_bertweet = Trainer(
#     model=model_bertweet,
#     args=training_args_bertweet,
#     train_dataset=train_dataset_bertweet,
#     eval_dataset=valid_dataset_bertweet,
#     callbacks=[LogCallback()],
#     compute_metrics=lambda pred: {
#         'accuracy': accuracy_score(y_valid, pred.predictions.argmax(axis=-1)),
#         'precision': precision_score(y_valid, pred.predictions.argmax(axis=-1), average='binary'),
#         'recall': recall_score(y_valid, pred.predictions.argmax(axis=-1), average='binary')
#     }
# )

# # Train the BERTweet model
# trainer_bertweet.train()


Epoch,Training Loss,Validation Loss
1,No log,0.52886
2,No log,0.469256
3,No log,0.45719


TrainOutput(global_step=42, training_loss=0.5480167298089891, metrics={'train_runtime': 345.0785, 'train_samples_per_second': 0.913, 'train_steps_per_second': 0.122, 'total_flos': 12632267069100.0, 'train_loss': 0.5480167298089891, 'epoch': 3.0})

In [80]:
train_00 = tokenizer_distilbert(X['speech'].tolist(), truncation=True, padding=True)

test_99 = AdDatasetDistilBERT(train_00, y.values)

In [84]:
#######################################################################################distil

y_pred = trainer_distilbert.predict(test_99).predictions.argmax(axis=-1)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, and F1 score
precision, recall, f1_score, _ = precision_recall_fscore_support(y, y_pred, average='weighted')

# Display the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Accuracy: 0.7133
Precision: 0.5088
Recall: 0.7133
F1 Score: 0.5940


  _warn_prf(average, modifier, msg_start, len(result))


In [85]:
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


In [37]:
predictions_df = X_train[['creative_data_id']].copy()
predictions_df['Q1'] = y_pred

predictions_df.to_csv('C://Users//riyac//Downloads//chaddhari_summary.csv', index=False)

In [38]:
#######################################################################################

y_pred = trainer_bert.predict(valid_dataset_bert).predictions.argmax(axis=-1)

# Calculate accuracy
accuracy = accuracy_score(y_valid, y_pred)

# Calculate precision, recall, and F1 score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_valid, y_pred, average='weighted')

# Display the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Accuracy: 0.5806
Precision: 0.3371
Recall: 0.5806
F1 Score: 0.4266


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#######################################################################################

y_pred_albert = trainer_albert.predict(valid_dataset_albert).predictions.argmax(axis=-1)

# Calculate accuracy
accuracy = accuracy_score(y_valid, y_pred_albert)

# Calculate precision, recall, and F1 score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_valid, y_pred_albert, average='weighted')

# Display the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")

Accuracy: 0.5806
Precision: 0.3371
Recall: 0.5806
F1 Score: 0.4266


  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
# Evaluate on the test set
predictions = trainer.predict(test_dataset)

print(predictions)

PredictionOutput(predictions=array([[ 0.00992461, -0.17783442],
       [ 0.04235137, -0.14189467],
       [ 0.00424187, -0.18739125],
       [-0.00167525, -0.17764857],
       [ 0.04111414, -0.23375596],
       [-0.0028276 , -0.1963383 ],
       [ 0.03663452, -0.17621759],
       [ 0.05396677, -0.21666285],
       [ 0.06578511, -0.19410451],
       [ 0.02122636, -0.20087756],
       [ 0.02072471, -0.22210929],
       [ 0.01943159, -0.16241112],
       [ 0.08714905, -0.22775128],
       [ 0.00539246, -0.17751627]], dtype=float32), label_ids=array([0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0], dtype=int64), metrics={'test_loss': 0.6662226319313049, 'test_accuracy': 0.7142857142857143, 'test_runtime': 1.1264, 'test_samples_per_second': 12.429, 'test_steps_per_second': 1.776})


In [40]:
# X = final_df['speech'].values
# y = final_df[matched_columns].values  # Assuming matched_columns are your target labels

# # Ensure labels are integers and not numpy objects
# y = np.array(y, dtype=int)
# # Check the shape of the labels
# print("Shape of labels:", y.shape)

# # Split into train, validation, and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

Shape of labels: (150, 21)


In [41]:
# print("Shapes of splits - X_train:", len(X_train), "y_train:", y_train.shape)
# print("Shapes of splits - X_valid:", len(X_valid), "y_valid:", y_valid.shape)
# print("Shapes of splits - X_test:", len(X_test), "y_test:", y_test.shape)

Shapes of splits - X_train: 73 y_train: (73, 21)
Shapes of splits - X_valid: 32 y_valid: (32, 21)
Shapes of splits - X_test: 45 y_test: (45, 21)


In [42]:
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(matched_columns))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     logging_steps=10,
# )


In [46]:
# # Custom dataset class for PyTorch
# class AdDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
#         return item

#     def __len__(self):
#         return len(self.labels)

# # Create datasets
# train_dataset = AdDataset(train_encodings, y_train)
# valid_dataset = AdDataset(valid_encodings, y_valid)
# test_dataset = AdDataset(test_encodings, y_test)

In [47]:
# # Training arguments
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     logging_steps=10,
# )

# def compute_metrics(p):
#     preds = np.argmax(p.predictions, axis=1)
#     labels = p.label_ids
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
#     acc = accuracy_score(labels, preds)
#     return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# # Trainer for training and evaluation
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     compute_metrics=compute_metrics,
# )

# # Train the model
# trainer.train()


ValueError: Expected input batch_size (8) to match target batch_size (168).

In [93]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [94]:
# Function to process each dataframe
def process_dataframe(question, df, model_index):
    # Rename the question column to 'Q{index+1}'
    df = df.rename(columns={question: f'Q{model_index + 1}'})

    # Define features and labels
    X = df[['creative_data_id', 'speech']]
    y = df[f'Q{model_index + 1}']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train_1, X_valid, y_train_1, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

    # Reset index
    X_valid = X_valid.reset_index(drop=True)
    y_valid = y_valid.reset_index(drop=True)
    
    train_00 = tokenizer_distilbert(X['speech'].tolist(), truncation=True, padding=True)
    #test_99 = AdDatasetDistilBERT(train_00, y.values)

    # Tokenize inputs
    train_encodings_distilbert = tokenizer_distilbert(X_train['speech'].tolist(), truncation=True, padding=True)
    valid_encodings_distilbert = tokenizer_distilbert(X_valid['speech'].tolist(), truncation=True, padding=True)
    test_encodings_distilbert = tokenizer_distilbert(X_test['speech'].tolist(), truncation=True, padding=True)
    train_00 = tokenizer_distilbert(X['speech'].tolist(), truncation=True, padding=True)

    train_encodings_bert = tokenizer_bert(X_train['speech'].tolist(), truncation=True, padding=True)
    valid_encodings_bert = tokenizer_bert(X_valid['speech'].tolist(), truncation=True, padding=True)
    test_encodings_bert = tokenizer_bert(X_test['speech'].tolist(), truncation=True, padding=True)
    train_01 = tokenizer_bert(X['speech'].tolist(), truncation=True, padding=True)

    train_encodings_albert = tokenizer_albert(X_train['speech'].tolist(), truncation=True, padding=True)
    valid_encodings_albert = tokenizer_albert(X_valid['speech'].tolist(), truncation=True, padding=True)
    test_encodings_albert = tokenizer_albert(X_test['speech'].tolist(), truncation=True, padding=True)
    train_02 = tokenizer_albert(X['speech'].tolist(), truncation=True, padding=True)

    # Dataset classes
    class AdDatasetDistilBERT(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item

        def __len__(self):
            return len(self.labels)

    class AdDatasetBERT(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item

        def __len__(self):
            return len(self.labels)

    class AdDatasetALBERT(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item

        def __len__(self):
            return len(self.labels)

    # Create datasets
    train_dataset_distilbert = AdDatasetDistilBERT(train_encodings_distilbert, y_train.values)
    valid_dataset_distilbert = AdDatasetDistilBERT(valid_encodings_distilbert, y_valid.values)
    test_dataset_distilbert = AdDatasetDistilBERT(test_encodings_distilbert, y_test.values)
    test_99 = AdDatasetDistilBERT(train_00, y.values)

    train_dataset_bert = AdDatasetBERT(train_encodings_bert, y_train.values)
    valid_dataset_bert = AdDatasetBERT(valid_encodings_bert, y_valid.values)
    test_dataset_bert = AdDatasetBERT(test_encodings_bert, y_test.values)
    test_90 = AdDatasetBERT(train_01, y.values)

    train_dataset_albert = AdDatasetALBERT(train_encodings_albert, y_train.values)
    valid_dataset_albert = AdDatasetALBERT(valid_encodings_albert, y_valid.values)
    test_dataset_albert = AdDatasetALBERT(test_encodings_albert, y_test.values)
    test_91 = AdDatasetALBERT(train_02, y.values)

    # Initialize models and training arguments
    model_distilbert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
    training_args_distilbert = TrainingArguments(
        output_dir=f'./results_distilbert_{model_index}', 
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        evaluation_strategy='epoch',
        logging_dir=f'./logs_distilbert_{model_index}',
        save_steps=100,
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    trainer_distilbert = Trainer(
        model=model_distilbert,
        args=training_args_distilbert,
        train_dataset=train_dataset_distilbert,
        eval_dataset=valid_dataset_distilbert
    )

    model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    training_args_bert = TrainingArguments(
        output_dir=f'./results_bert_{model_index}',
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        evaluation_strategy='epoch',
        logging_dir=f'./logs_bert_{model_index}',
        save_steps=100,
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    trainer_bert = Trainer(
        model=model_bert,
        args=training_args_bert,
        train_dataset=train_dataset_bert,
        eval_dataset=valid_dataset_bert
    )

    model_albert = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
    training_args_albert = TrainingArguments(
        output_dir=f'./results_albert_{model_index}', 
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        evaluation_strategy='epoch',
        logging_dir=f'./logs_albert_{model_index}',
        save_steps=100,
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    trainer_albert = Trainer(
        model=model_albert,
        args=training_args_albert,
        train_dataset=train_dataset_albert,
        eval_dataset=valid_dataset_albert
    )

    # Define models list
    models = [
        trainer_distilbert,  
        trainer_bert,    
        trainer_albert         
    ]

    validation_sets = [valid_dataset_distilbert, valid_dataset_bert, valid_dataset_albert]
#     test_sets = [test_dataset_distilbert, test_dataset_bert, test_dataset_albert]
    test_sets = [test_dataset_distilbert, test_dataset_bert, test_dataset_albert]

    # Initialize variables to track the best model and its metrics
    best_model = None
    best_accuracy = -1

    # Train and evaluate each model in the models list
    for i in range(3):
        trainer = models[i]
        validation_data = validation_sets[i]
        testing_data = test_sets[i]
        
        # Train the model
        trainer.train()

        # Predict on validation data
        y_pred = trainer.predict(validation_data).predictions.argmax(axis=-1)
        
        # Evaluate metrics (example: accuracy)
        accuracy = accuracy_score(y_valid.values, y_pred)

        # Select best model based on metric (e.g., accuracy)
        if best_model is None or accuracy > best_accuracy:
            best_model = trainer
            best_accuracy = accuracy
            best_model_test_data = testing_data

    # After selecting the best model, predict on test data and evaluate final performance
    if best_model is not None:
        y_pred_final = best_model.predict(test_99).predictions.argmax(axis=-1)
        
        # Evaluate final performance metrics
        final_accuracy = accuracy_score(y.values, y_pred_final)
        final_precision = precision_score(y.values, y_pred_final, average='binary')
        final_recall = recall_score(y.values, y_pred_final, average='binary')
        final_f1 = f1_score(y.values, y_pred_final, average='binary')

        # Print final performance metrics
        print(f"Final Performance Metrics for Q{model_index + 1} with {type(best_model.model).__name__}:")
        print(f"Accuracy: {final_accuracy}")
        print(f"Precision: {final_precision}")
        print(f"Recall: {final_recall}")
        print(f"F1 Score: {final_f1}")
    else:
        print(f"No best model found for Q{model_index + 1}.")
    
    predictions_df = X_test[['creative_data_id']].copy()
    predictions_df[f'Q{model_index + 1}'] = y_pred_final
    
    predictions_df.to_csv('C://Users//riyac//Downloads//chaddhari_summary.csv', index=False)

In [95]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, BertTokenizer, AlbertTokenizer, AlbertForSequenceClassification, BertForSequenceClassification

# Initialize tokenizers
tokenizer_distilbert = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_albert = AlbertTokenizer.from_pretrained('albert-base-v2')

# Process each dataframe in numerical_question_dfs
for i, (question, df) in enumerate(numerical_question_dfs.items()):
    process_dataframe(question, df, i)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.520067
2,No log,0.467491
3,No log,0.457071


Epoch,Training Loss,Validation Loss
1,No log,0.486225
2,No log,0.403599
3,No log,0.368828


Epoch,Training Loss,Validation Loss
1,No log,0.405268
2,No log,0.349688
3,No log,0.265227


  _warn_prf(average, modifier, msg_start, len(result))


Final Performance Metrics for Q1 with DistilBertForSequenceClassification:
Accuracy: 0.7133333333333334
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


ValueError: Length of values (150) does not match length of index (45)