In [12]:
import pandas as pd

df = pd.read_json("hf://datasets/toughdata/quora-question-answer-dataset/Quora-QuAD.jsonl", lines=True)

# Display the first few rows of the dataframe
print(df.head())

                                            question  \
0  Why whenever I get in the shower my girlfriend...   
1            What is a proxy, and how can I use one?   
2  What song has the lyrics "someone left the cak...   
3  I am the owner of an adult website called http...   
4  Does the Bible mention anything about a place ...   

                                              answer  
0  Isn’t it awful? You would swear that there was...  
1  A proxy server is a system or router that prov...  
2                                 MacArthur's Park\n  
3  Don't let apps that are liers put adds on your...  
4  St. John in the book of Revelation mentions an...  


In [13]:
# Display the first few rows of the dataframe
print("First few rows of the dataset:")
print(df.head())

# Display the columns of the dataframe
print("\nColumns in the dataset:")
print(df.columns)

# Display basic information about the dataframe
print("\nBasic information about the dataset:")
print(df.info())

# Display descriptive statistics
print("\nDescriptive statistics of the dataset:")
print(df.describe())

# Display a few sample rows
print("\nSample rows from the dataset:")
print(df.sample(5))

# List of columns to drop (this list is hypothetical and should be adjusted based on actual data inspection)
columns_to_drop = ['id', 'url', 'created_at', 'updated_at']

# Drop irrelevant columns
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

# Display the first few rows of the cleaned dataframe
print("\nFirst few rows of the cleaned dataset:")
print(df_cleaned.head())

First few rows of the dataset:
                                            question  \
0  Why whenever I get in the shower my girlfriend...   
1            What is a proxy, and how can I use one?   
2  What song has the lyrics "someone left the cak...   
3  I am the owner of an adult website called http...   
4  Does the Bible mention anything about a place ...   

                                              answer  
0  Isn’t it awful? You would swear that there was...  
1  A proxy server is a system or router that prov...  
2                                 MacArthur's Park\n  
3  Don't let apps that are liers put adds on your...  
4  St. John in the book of Revelation mentions an...  

Columns in the dataset:
Index(['question', 'answer'], dtype='object')

Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56402 entries, 0 to 56401
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ques

In [15]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Stop word removal
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # Stemming (use lemmatizer instead of stemming if preferred)
    # stemmed_tokens = [stemmer.stem(word) for word in tokens]

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return lemmatized_tokens

# Apply preprocessing to each question
df_cleaned['processed_questions'] = df_cleaned['question'].apply(preprocess_text)

# Display the processed DataFrame
print(df_cleaned.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                            question  \
0  Why whenever I get in the shower my girlfriend...   
1            What is a proxy, and how can I use one?   
2  What song has the lyrics "someone left the cak...   
3  I am the owner of an adult website called http...   
4  Does the Bible mention anything about a place ...   

                                              answer  \
0  Isn’t it awful? You would swear that there was...   
1  A proxy server is a system or router that prov...   
2                                 MacArthur's Park\n   
3  Don't let apps that are liers put adds on your...   
4  St. John in the book of Revelation mentions an...   

                                 processed_questions  
0  [whenever, get, shower, girlfriend, want, join...  
1                            [proxy, ,, use, one, ?]  
2  [song, lyric, ``, someone, left, cake, rain, '...  
3  [owner, adult, website, called, http, :, //mat...  
4  [Bible, mention, anything, place, ``, '', heav..

In [17]:
from transformers import AutoTokenizer

# Define the model names
model_names = ["bert-base-uncased", "t5-base", "gpt2"]

# Load the tokenizers
tokenizers = {name: AutoTokenizer.from_pretrained(name) for name in model_names}

# Tokenize the questions
df_cleaned['tokenized_questions'] = df_cleaned['question'].apply(lambda x: {name: tokenizers[name](x, return_tensors='pt') for name in model_names})

print(df_cleaned['tokenized_questions'].head())


0    {'bert-base-uncased': ['input_ids', 'token_typ...
1    {'bert-base-uncased': ['input_ids', 'token_typ...
2    {'bert-base-uncased': ['input_ids', 'token_typ...
3    {'bert-base-uncased': ['input_ids', 'token_typ...
4    {'bert-base-uncased': ['input_ids', 'token_typ...
Name: tokenized_questions, dtype: object


In [None]:
from transformers import AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoModelForCausalLM

# Load the models
bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
gpt_model = AutoModelForCausalLM.from_pretrained("gpt2")

# Define a function to get model outputs
def get_model_outputs(tokenized_input, model, tokenizer, model_type):
    if model_type == "bert":
        outputs = model(**tokenized_input).logits.argmax(dim=-1)
    elif model_type == "t5":
        input_ids = tokenized_input['input_ids']
        outputs = model.generate(input_ids)
    elif model_type == "gpt":
        input_ids = tokenized_input['input_ids']
        outputs = model.generate(input_ids, max_length=50)

    decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return decoded_outputs

# Apply the models to the tokenized questions
df_cleaned['bert_outputs'] = df_cleaned['tokenized_questions'].apply(lambda x: get_model_outputs(x["bert-base-uncased"], bert_model, tokenizers["bert-base-uncased"], "bert"))
df_cleaned['t5_outputs'] = df_cleaned['tokenized_questions'].apply(lambda x: get_model_outputs(x["t5-base"], t5_model, tokenizers["t5-base"], "t5"))
df_cleaned['gpt_outputs'] = df_cleaned['tokenized_questions'].apply(lambda x: get_model_outputs(x["gpt2"], gpt_model, tokenizers["gpt2"], "gpt"))

print(df_cleaned[['questions', 'bert_outputs', 't5_outputs', 'gpt_outputs']])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
pip install nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a2c2621364d423e9a13175be4fd610de775df33b3943436ca6d151c5a79b333b
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [11]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score
import numpy as np

# Function to compute BLEU score
def compute_bleu(reference, hypothesis):
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([reference], hypothesis, smoothing_function=smoothing_function)

# Function to compute ROUGE score
def compute_rouge(reference, hypothesis):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, hypothesis)
    return scores

# Function to compute F1 score
def compute_f1(reference, hypothesis):
    reference_set = set(reference)
    hypothesis_set = set(hypothesis)
    common_tokens = reference_set.intersection(hypothesis_set)
    if len(reference_set) == 0 or len(hypothesis_set) == 0:
        return 0.0
    precision = len(common_tokens) / len(hypothesis_set)
    recall = len(common_tokens) / len(reference_set)
    if precision + recall == 0:
        return 0.0
    f1 = 2 * precision * recall / (precision + recall)
    return f1

# Compute metrics for each model
for model in ['bert', 't5', 'gpt']:
    bleu_scores = []
    rouge_scores = []
    f1_scores = []

    for idx, row in df_cleaned.iterrows():
        reference = row['tokenized_answers']
        hypothesis = word_tokenize(row[f'{model}_outputs'].lower())

        # Compute BLEU score
        bleu_score = compute_bleu(reference, hypothesis)
        bleu_scores.append(bleu_score)

        # Compute ROUGE score
        rouge_score = compute_rouge(' '.join(reference), ' '.join(hypothesis))
        rouge_scores.append(rouge_score)

        # Compute F1 score
        f1 = compute_f1(reference, hypothesis)
        f1_scores.append(f1)

    df_cleaned[f'{model}_bleu'] = bleu_scores
    df_cleaned[f'{model}_rouge'] = rouge_scores
    df_cleaned[f'{model}_f1'] = f1_scores

# Display the DataFrame with computed metrics
print(df_cleaned[['question', 'bert_bleu', 't5_bleu', 'gpt_bleu']])
print(df_cleaned[['question', 'bert_rouge', 't5_rouge', 'gpt_rouge']])
print(df_cleaned[['question', 'bert_f1', 't5_f1', 'gpt_f1']])


KeyError: 'tokenized_answers'