In [None]:
import pandas as pd

# Read the raw export; the cell below expects the columns
# 'user_content', 'assistant_content' and 'is_liked'.
file_path = 'generated_questions_product.csv'
data = pd.read_csv(
    file_path,
)

# Function to split multiple questions in `user_content` into separate rows
def split_user_content(row):
    """Expand one input row into one record per non-blank question line.

    Args:
        row: Mapping-like row (for example a pandas Series or a dict) that
            provides the keys 'user_content', 'assistant_content' and
            'is_liked'.

    Returns:
        A list of dicts, one per non-blank line of ``row['user_content']``
        (split on newlines, whitespace-stripped). Each dict carries the row's
        'assistant_content' and 'is_liked' values unchanged. The list is empty
        when 'user_content' is not a string or holds only blank lines.
    """
    content = row['user_content']
    # An empty CSV field is read back as float NaN, which has no .split();
    # treat any non-string value as "no questions" instead of crashing.
    if not isinstance(content, str):
        return []

    stripped_lines = (line.strip() for line in content.split('\n'))
    return [
        {
            'user_content': question,
            'assistant_content': row['assistant_content'],
            'is_liked': row['is_liked'],
        }
        for question in stripped_lines
        if question
    ]

# Expand every input row and flatten the per-row record lists in one pass
expanded_data = [
    record
    for _, source_row in data.iterrows()
    for record in split_user_content(source_row)
]

# Rebuild a DataFrame from the flat list of records
expanded_df = pd.DataFrame(expanded_data)

# Write the one-question-per-row dataset next to the input file
expanded_file_path = 'expanded_generated_questions_product.csv'
expanded_df.to_csv(expanded_file_path, index=False)

print(f"Expanded dataset saved to {expanded_file_path}")

Expanded dataset saved to expanded_generated_questions_product.csv


In [None]:
# Ensure you have necessary packages installed
!pip install transformers pandas scikit-learn datasets

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import Trainer, TrainingArguments
from datasets import Dataset

# Load the CSV file
file_path = 'expanded_generated_questions_product.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe to understand its structure
print(data.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data.info()

# Aggregate data: Combine user_content for the same assistant_content and is_liked
# NOTE: groupby drops rows whose key is NaN by default, so rows without an
# is_liked value do not appear in aggregated_data.
aggregated_data = (
    data.groupby(['assistant_content', 'is_liked'])['user_content']
    .apply(lambda x: ' '.join(x))
    .reset_index()
)

# Load the pretrained BERT model
# NOTE: this checkpoint has no trained classification head (transformers warns
# that classifier.weight / classifier.bias are newly initialized), so
# predictions made before fine-tuning are not meaningful.
model_name = "bert-base-uncased"  # Placeholder model name, replace with "ollama/lama3" if available
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Function to evaluate the model
def evaluate_model(data, model, tokenizer, label_column='is_liked', text_column='assistant_content'):
    """Score a text classifier against the labelled rows of ``data``.

    Args:
        data: DataFrame containing ``text_column`` and ``label_column``.
        model: Callable invoked as ``model(text)``. It must return a sequence
            whose first item is a dict with a 'label' key (the shape a
            text-classification pipeline returns); 'LABEL_1' counts as the
            positive class.
        tokenizer: Unused; kept so existing call sites keep working.
        label_column: Name of the label column. Rows where it is missing are
            skipped.
        text_column: Name of the column holding the text to classify.

    Returns:
        The result of ``accuracy_score(labels, predictions)``, i.e. the
        fraction of rows whose prediction matches the label.
    """
    labelled = data.dropna(subset=[label_column])
    texts = labelled[text_column].tolist()

    # Depending on how the CSV column was parsed, a liked row may hold the
    # boolean True rather than the string 'True'. Comparing against the literal
    # 'True' only would label every boolean row as 0, so normalise through str().
    labels = [int(str(value).strip().lower() == 'true') for value in labelled[label_column]]

    predictions = [int(model(text)[0]['label'] == 'LABEL_1') for text in texts]

    return accuracy_score(labels, predictions)

# Baseline: score the not-yet-fine-tuned classifier on every aggregated row
pretrained_accuracy = evaluate_model(aggregated_data, classifier, tokenizer)
print(f"Pretrained Model Accuracy: {pretrained_accuracy}")

# Hold out 20% of the labelled rows for testing (fixed seed for repeatability)
train_data, test_data = train_test_split(
    aggregated_data.dropna(subset=['is_liked']),
    test_size=0.2,
    random_state=42,
)

# Function to train custom model (simplified example for demonstration purposes)
def train_custom_model(train_data, model_name="bert-base-uncased"):  # Placeholder model name, replace with "ollama/lama3" if available
    """Fine-tune a two-label sequence classifier on ``train_data``.

    Args:
        train_data: DataFrame with an 'assistant_content' text column and an
            'is_liked' label column.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        The model object after ``Trainer.train()`` has run on it.
    """
    # Function-scope imports keep this helper usable on its own.
    from transformers import Trainer, TrainingArguments
    from datasets import Dataset

    num_epochs = 3
    batch_size = 8

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Prepare dataset
    train_texts = train_data['assistant_content'].tolist()
    # Accept both boolean True and the strings 'True'/'true'. Comparing against
    # the literal 'True' only would turn every boolean label into 0.
    train_labels = [int(str(value).strip().lower() == 'true') for value in train_data['is_liked']]
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    train_dataset = Dataset.from_dict({
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': train_labels,
    })

    # A fixed 500-step warmup is longer than the whole run on a small dataset,
    # which would leave the learning rate stuck near zero. Cap it at roughly
    # 10% of the estimated step count (single device, no gradient accumulation).
    steps_per_epoch = (len(train_labels) + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * num_epochs
    warmup_steps = min(500, total_steps // 10)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    trainer.train()
    return model

# Train custom model
custom_model = train_custom_model(train_data)

# Evaluate custom model
# evaluate_model calls its second argument with raw text and reads
# result[0]['label'], so the fine-tuned model has to be wrapped in a pipeline.
# Passing `classifier` here would score the un-tuned pretrained model again.
custom_classifier = pipeline("text-classification", model=custom_model, tokenizer=tokenizer)
custom_model_accuracy = evaluate_model(test_data, custom_classifier, tokenizer)
print(f"Custom Model Accuracy: {custom_model_accuracy}")

# Determine minimum data needed for accuracy
def minimum_data_for_accuracy(data, model, tokenizer, target_accuracy=0.8):
    """Find the smallest sample of ``data`` on which ``model`` reaches ``target_accuracy``.

    For each size from 1 to ``len(data)``, draws that many rows with
    ``data.sample(n=size, random_state=42)`` and scores them with
    ``evaluate_model``. The model is not retrained here; only the size of the
    evaluated sample changes.

    Args:
        data: DataFrame to sample rows from.
        model: Classifier callable forwarded to ``evaluate_model``.
        tokenizer: Forwarded to ``evaluate_model``.
        target_accuracy: Accuracy threshold that stops the search.

    Returns:
        ``(size, accuracy)`` for the first sample size whose accuracy is at
        least ``target_accuracy``; otherwise ``(len(data), accuracy)`` with the
        accuracy measured on the largest sample.

    Raises:
        ValueError: If ``data`` has no rows (previously this surfaced as an
            unbound-variable error at the final return).
    """
    total_rows = len(data)
    if total_rows == 0:
        raise ValueError("minimum_data_for_accuracy needs at least one row of data")

    for size in range(1, total_rows + 1):
        subset = data.sample(n=size, random_state=42)
        accuracy = evaluate_model(subset, model, tokenizer)
        if accuracy >= target_accuracy:
            return size, accuracy
    return total_rows, accuracy

# Measure the fine-tuned model, not the un-tuned `classifier`. evaluate_model
# calls its argument with raw text, so wrap custom_model in a pipeline first.
custom_classifier = pipeline("text-classification", model=custom_model, tokenizer=tokenizer)
min_data_needed, min_data_accuracy = minimum_data_for_accuracy(train_data, custom_classifier, tokenizer)

# Print results
results = {
    'Pretrained Model Accuracy': pretrained_accuracy,
    'Custom Model Accuracy': custom_model_accuracy,
    'Minimum Data Needed for Target Accuracy': min_data_needed,
    'Achieved Accuracy with Minimum Data': min_data_accuracy
}

print(results)


In [None]:
# Ensure you have necessary packages installed
!pip install transformers pandas scikit-learn datasets

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import Trainer, TrainingArguments
from datasets import Dataset

# Load the CSV file
file_path = 'expanded_generated_questions_product.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe to understand its structure
print(data.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data.info()

# Aggregate data: Combine user_content for the same assistant_content and is_liked
# NOTE: groupby drops rows whose key is NaN by default, so rows without an
# is_liked value do not appear in aggregated_data.
aggregated_data = (
    data.groupby(['assistant_content', 'is_liked'])['user_content']
    .apply(lambda x: ' '.join(x))
    .reset_index()
)

# Load the pretrained BERT model
# NOTE: this checkpoint has no trained classification head (transformers warns
# that classifier.weight / classifier.bias are newly initialized), so
# predictions made before fine-tuning are not meaningful.
model_name = "bert-base-uncased"  # Placeholder model name, replace with "ollama/lama3" if available
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Function to evaluate the model
def evaluate_model(data, classifier, label_column='is_liked', text_column='assistant_content'):
    """Score a text classifier against the labelled rows of ``data``.

    Args:
        data: DataFrame containing ``text_column`` and ``label_column``.
        classifier: Callable invoked as ``classifier(text)``. It must return a
            sequence whose first item is a dict with a 'label' key (the shape
            a text-classification pipeline returns); 'LABEL_1' counts as the
            positive class.
        label_column: Name of the label column. Rows where it is missing are
            skipped.
        text_column: Name of the column holding the text to classify.

    Returns:
        The result of ``accuracy_score(labels, predictions)``, i.e. the
        fraction of rows whose prediction matches the label.
    """
    labelled = data.dropna(subset=[label_column])
    texts = labelled[text_column].tolist()

    # Depending on how the CSV column was parsed, a liked row may hold the
    # boolean True rather than the string 'True'. Comparing against the literal
    # 'True' only would label every boolean row as 0, so normalise through str().
    labels = [int(str(value).strip().lower() == 'true') for value in labelled[label_column]]

    predictions = [int(classifier(text)[0]['label'] == 'LABEL_1') for text in texts]

    return accuracy_score(labels, predictions)

# Baseline: score the not-yet-fine-tuned classifier on every aggregated row
pretrained_accuracy = evaluate_model(
    aggregated_data,
    classifier,
    label_column='is_liked',
    text_column='assistant_content',
)
print(f"Pretrained Model Accuracy: {pretrained_accuracy}")

# Hold out 20% of the labelled rows for testing (fixed seed for repeatability)
train_data, test_data = train_test_split(
    aggregated_data.dropna(subset=['is_liked']),
    test_size=0.2,
    random_state=42,
)

# Function to train custom model (simplified example for demonstration purposes)
def train_custom_model(train_data, model_name="bert-base-uncased"):  # Placeholder model name, replace with "ollama/lama3" if available
    """Fine-tune a two-label sequence classifier on ``train_data``.

    Args:
        train_data: DataFrame with an 'assistant_content' text column and an
            'is_liked' label column.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        The model object after ``Trainer.train()`` has run on it.
    """
    # Function-scope imports keep this helper usable on its own.
    from transformers import Trainer, TrainingArguments
    from datasets import Dataset

    num_epochs = 3
    batch_size = 8

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Prepare dataset
    train_texts = train_data['assistant_content'].tolist()
    # Accept both boolean True and the strings 'True'/'true'. Comparing against
    # the literal 'True' only would turn every boolean label into 0.
    train_labels = [int(str(value).strip().lower() == 'true') for value in train_data['is_liked']]
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    train_dataset = Dataset.from_dict({
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': train_labels,
    })

    # A fixed 500-step warmup is longer than the whole run on a small dataset,
    # which would leave the learning rate stuck near zero. Cap it at roughly
    # 10% of the estimated step count (single device, no gradient accumulation).
    steps_per_epoch = (len(train_labels) + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * num_epochs
    warmup_steps = min(500, total_steps // 10)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    trainer.train()
    return model

# Fine-tune a fresh model on the training split
custom_model = train_custom_model(train_data)

# Wrap the fine-tuned model in a pipeline and score it on the held-out split
custom_classifier = pipeline(
    "text-classification",
    model=custom_model,
    tokenizer=tokenizer,
)
custom_model_accuracy = evaluate_model(
    test_data,
    custom_classifier,
    label_column='is_liked',
    text_column='assistant_content',
)
print(f"Custom Model Accuracy: {custom_model_accuracy}")

# Determine minimum data needed for accuracy
def minimum_data_for_accuracy(data, classifier, target_accuracy=0.8, label_column='is_liked', text_column='assistant_content'):
    """Find the smallest sample of ``data`` on which ``classifier`` reaches ``target_accuracy``.

    For each size from 1 to ``len(data)``, draws that many rows with
    ``data.sample(n=size, random_state=42)`` and scores them with
    ``evaluate_model``. The classifier is not retrained here; only the size of
    the evaluated sample changes.

    Args:
        data: DataFrame to sample rows from.
        classifier: Classifier callable forwarded to ``evaluate_model``.
        target_accuracy: Accuracy threshold that stops the search.
        label_column: Label column name forwarded to ``evaluate_model``.
        text_column: Text column name forwarded to ``evaluate_model``.

    Returns:
        ``(size, accuracy)`` for the first sample size whose accuracy is at
        least ``target_accuracy``; otherwise ``(len(data), accuracy)`` with the
        accuracy measured on the largest sample.

    Raises:
        ValueError: If ``data`` has no rows (previously this surfaced as an
            unbound-variable error at the final return).
    """
    total_rows = len(data)
    if total_rows == 0:
        raise ValueError("minimum_data_for_accuracy needs at least one row of data")

    for size in range(1, total_rows + 1):
        subset = data.sample(n=size, random_state=42)
        accuracy = evaluate_model(subset, classifier, label_column=label_column, text_column=text_column)
        if accuracy >= target_accuracy:
            return size, accuracy
    return total_rows, accuracy

# Smallest training-split sample on which the fine-tuned classifier hits the default target
min_data_needed, min_data_accuracy = minimum_data_for_accuracy(
    train_data,
    custom_classifier,
)

# Collect the headline numbers in display order and print them
results = {}
results['Pretrained Model Accuracy'] = pretrained_accuracy
results['Custom Model Accuracy'] = custom_model_accuracy
results['Minimum Data Needed for Target Accuracy'] = min_data_needed
results['Achieved Accuracy with Minimum Data'] = min_data_accuracy

print(results)


                                        user_content  \
0  1. Why was the Product Inventory Operation Att...   
1  2. Can you explain the rationale behind leavin...   
2  3. Was the omission of Product Inventory Opera...   
3  4. What factors led to the exclusion of Produc...   
4  5. Can you provide insight into the reason beh...   

                                   assistant_content is_liked  
0  I apologize for the mistake! You are correct, ...      NaN  
1  I apologize for the mistake! You are correct, ...      NaN  
2  I apologize for the mistake! You are correct, ...      NaN  
3  I apologize for the mistake! You are correct, ...      NaN  
4  I apologize for the mistake! You are correct, ...      NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_content       481 non-null    object
 1   assistant_content  481 non-nu

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pretrained Model Accuracy: 0.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Custom Model Accuracy: 0.3333333333333333
{'Pretrained Model Accuracy': 0.0, 'Custom Model Accuracy': 0.3333333333333333, 'Minimum Data Needed for Target Accuracy': 24, 'Achieved Accuracy with Minimum Data': 0.041666666666666664}


In [None]:
!pip install --upgrade pyarrow datasets



In [None]:
import pandas as pd

# Read the expanded dataset
data = pd.read_csv('expanded_generated_questions_product.csv')

# Pick out the numeric columns and double their values
numeric_columns = data.select_dtypes(include='number')
modified_data = numeric_columns * 2

# Write the doubled values back over the matching columns of `data` (in place)
data.update(modified_data)

# Persist the result under a separate file name
data.to_csv('modified_data.csv', index=False)

In [None]:
# Ensure you have necessary packages installed
!pip install transformers pandas scikit-learn datasets

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import Trainer, TrainingArguments
from datasets import Dataset

# Load the CSV file
file_path = 'expanded_generated_questions_product.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe to understand its structure
print(data.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data.info()

# Aggregate data: Combine user_content for the same assistant_content and is_liked
# NOTE: groupby drops rows whose key is NaN by default, so rows without an
# is_liked value do not appear in aggregated_data.
aggregated_data = (
    data.groupby(['assistant_content', 'is_liked'])['user_content']
    .apply(lambda x: ' '.join(x))
    .reset_index()
)

# Load the pretrained BERT model
# NOTE: this checkpoint has no trained classification head (transformers warns
# that classifier.weight / classifier.bias are newly initialized), so
# predictions made before fine-tuning are not meaningful.
model_name = "bert-base-uncased"  # Placeholder model name, replace with "ollama/lama3" if available
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Function to evaluate the model
def evaluate_model(data, classifier, label_column='is_liked', text_column='assistant_content'):
    """Score a text classifier against the labelled rows of ``data``.

    Args:
        data: DataFrame containing ``text_column`` and ``label_column``.
        classifier: Callable invoked as ``classifier(text)``. It must return a
            sequence whose first item is a dict with a 'label' key (the shape
            a text-classification pipeline returns); 'LABEL_1' counts as the
            positive class.
        label_column: Name of the label column. Rows where it is missing are
            skipped.
        text_column: Name of the column holding the text to classify.

    Returns:
        The result of ``accuracy_score(labels, predictions)``, i.e. the
        fraction of rows whose prediction matches the label.
    """
    labelled = data.dropna(subset=[label_column])
    texts = labelled[text_column].tolist()

    # Depending on how the CSV column was parsed, a liked row may hold the
    # boolean True rather than the string 'True'. Comparing against the literal
    # 'True' only would label every boolean row as 0, so normalise through str().
    labels = [int(str(value).strip().lower() == 'true') for value in labelled[label_column]]

    predictions = [int(classifier(text)[0]['label'] == 'LABEL_1') for text in texts]

    return accuracy_score(labels, predictions)

# Baseline: score the not-yet-fine-tuned classifier on every aggregated row
pretrained_accuracy = evaluate_model(
    aggregated_data,
    classifier,
    label_column='is_liked',
    text_column='assistant_content',
)
print(f"Pretrained Model Accuracy: {pretrained_accuracy}")

# Hold out 20% of the labelled rows for testing (fixed seed for repeatability)
train_data, test_data = train_test_split(
    aggregated_data.dropna(subset=['is_liked']),
    test_size=0.2,
    random_state=42,
)

# Function to train custom model (simplified example for demonstration purposes)
def train_custom_model(train_data, model_name="bert-base-uncased"):  # Placeholder model name, replace with "ollama/lama3" if available
    """Fine-tune a two-label sequence classifier on ``train_data``.

    Args:
        train_data: DataFrame with an 'assistant_content' text column and an
            'is_liked' label column.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        The model object after ``Trainer.train()`` has run on it.
    """
    # Function-scope imports keep this helper usable on its own.
    from transformers import Trainer, TrainingArguments
    from datasets import Dataset

    num_epochs = 3
    batch_size = 8

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Prepare dataset
    train_texts = train_data['assistant_content'].tolist()
    # Accept both boolean True and the strings 'True'/'true'. Comparing against
    # the literal 'True' only would turn every boolean label into 0.
    train_labels = [int(str(value).strip().lower() == 'true') for value in train_data['is_liked']]
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    train_dataset = Dataset.from_dict({
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': train_labels,
    })

    # A fixed 500-step warmup is longer than the whole run on a small dataset,
    # which would leave the learning rate stuck near zero. Cap it at roughly
    # 10% of the estimated step count (single device, no gradient accumulation).
    steps_per_epoch = (len(train_labels) + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * num_epochs
    warmup_steps = min(500, total_steps // 10)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    trainer.train()
    return model

# Fine-tune a fresh model on the training split
custom_model = train_custom_model(train_data)

# Wrap the fine-tuned model in a pipeline and score it on the held-out split
custom_classifier = pipeline(
    "text-classification",
    model=custom_model,
    tokenizer=tokenizer,
)
custom_model_accuracy = evaluate_model(
    test_data,
    custom_classifier,
    label_column='is_liked',
    text_column='assistant_content',
)
print(f"Custom Model Accuracy: {custom_model_accuracy}")

# Determine minimum data needed for accuracy
def minimum_data_for_accuracy(data, classifier, target_accuracy=0.8, label_column='is_liked', text_column='assistant_content'):
    """Find the smallest sample of ``data`` on which ``classifier`` reaches ``target_accuracy``.

    For each size from 1 to ``len(data)``, draws that many rows with
    ``data.sample(n=size, random_state=42)`` and scores them with
    ``evaluate_model``. The classifier is not retrained here; only the size of
    the evaluated sample changes.

    Args:
        data: DataFrame to sample rows from.
        classifier: Classifier callable forwarded to ``evaluate_model``.
        target_accuracy: Accuracy threshold that stops the search.
        label_column: Label column name forwarded to ``evaluate_model``.
        text_column: Text column name forwarded to ``evaluate_model``.

    Returns:
        ``(size, accuracy)`` for the first sample size whose accuracy is at
        least ``target_accuracy``; otherwise ``(len(data), accuracy)`` with the
        accuracy measured on the largest sample.

    Raises:
        ValueError: If ``data`` has no rows (previously this surfaced as an
            unbound-variable error at the final return).
    """
    total_rows = len(data)
    if total_rows == 0:
        raise ValueError("minimum_data_for_accuracy needs at least one row of data")

    for size in range(1, total_rows + 1):
        subset = data.sample(n=size, random_state=42)
        accuracy = evaluate_model(subset, classifier, label_column=label_column, text_column=text_column)
        if accuracy >= target_accuracy:
            return size, accuracy
    return total_rows, accuracy

# Smallest training-split sample on which the fine-tuned classifier hits the default target
min_data_needed, min_data_accuracy = minimum_data_for_accuracy(
    train_data,
    custom_classifier,
)

# Collect the headline numbers in display order and print them
results = {}
results['Pretrained Model Accuracy'] = pretrained_accuracy
results['Custom Model Accuracy'] = custom_model_accuracy
results['Minimum Data Needed for Target Accuracy'] = min_data_needed
results['Achieved Accuracy with Minimum Data'] = min_data_accuracy

print(results)


                                        user_content  \
0  1. Why was the Product Inventory Operation Att...   
1  2. Can you explain the rationale behind leavin...   
2  3. Was the omission of Product Inventory Opera...   
3  4. What factors led to the exclusion of Produc...   
4  5. Can you provide insight into the reason beh...   

                                   assistant_content is_liked  
0  I apologize for the mistake! You are correct, ...      NaN  
1  I apologize for the mistake! You are correct, ...      NaN  
2  I apologize for the mistake! You are correct, ...      NaN  
3  I apologize for the mistake! You are correct, ...      NaN  
4  I apologize for the mistake! You are correct, ...      NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_content       962 non-null    object
 1   assistant_content  962 non-nu

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pretrained Model Accuracy: 1.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Custom Model Accuracy: 0.5
{'Pretrained Model Accuracy': 1.0, 'Custom Model Accuracy': 0.5, 'Minimum Data Needed for Target Accuracy': 1, 'Achieved Accuracy with Minimum Data': 1.0}
