In [1]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

torch.manual_seed(42)


<torch._C.Generator at 0x7fc479b61d50>

In [2]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b-instruct", trust_remote_code=True)







Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Initialize the Falcon pipeline with desired parameters
falcon_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'FalconForCausalLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersFor

In [8]:
# Define input and output folder paths
input_folder = '/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/yuvraj/questions/least_popular/hop1/YES_NO_cleaned'
output_folder = '/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/alyona/results'
os.makedirs(output_folder, exist_ok=True)


In [None]:
# Function to generate a yes or no answer using Falcon

def generate_response(question):
    
    # Frame the prompt for a balanced response
    
    prompt = (
        f"Suppose you have good knowledge of Hollywood movies and can answer questions about them accurately."
        f" Now, please read the question carefully and give your answer in a single word (yes or no), ensuring thereâ€™s no bias."
        f" Special Note: Please answer the question independently of any prior answers.\n"
        f"{question}\nAnswer:"
    )
    
    # Tokenize and generate output with adjustments
    
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
    **inputs,
    max_new_tokens=5,                       # Allow for a longer response to provide more context
    temperature=0.8,                        # Slightly higher to encourage variation
    do_sample=True,                         # Enable sampling for diversity
    repetition_penalty=1.2,                 # Reduce repetition
    pad_token_id=model.config.eos_token_id  # To avoid padding-related issues
)

    # Decode the output
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()
    
    # Ensure the response is strictly "yes" or "no"
    
    
    # Ensure the response is only "yes" or "no"
    if "yes" in response:
        return "yes"
    elif "no" in response:
        return "no"
    else:
        return "uncertain"
   


In [21]:
sample_questions = [
    "Did Malerie Grady provide the voice for the character?",
    "In the movie 'The Oogieloves in the Big Balloon Adventure,' was a balloon red?",
    "Is the movie considered a musical?"
]

# Debugging: Print generated answers for sample questions
for question in sample_questions:
    answer = generate_response(question)
    print(f"Question: {question}")
    print(f"Generated Answer: {answer}\n")

Question: Did Malerie Grady provide the voice for the character?
Generated Answer: yes

Question: In the movie 'The Oogieloves in the Big Balloon Adventure,' was a balloon red?
Generated Answer: yes

Question: Is the movie considered a musical?
Generated Answer: yes



In [22]:
# Process each CSV file in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.csv'):
        filepath = os.path.join(input_folder, filename)
        data = pd.read_csv(filepath)
        
        # Check if 'Question' column exists
        
        if 'Question' in data.columns:
            # Initialize an empty list to collect answers for preview
            
            generated_answers = []
            
            # Process each question one at a time
            
            for index, row in data.iterrows():
                question = row['Question']
                answer = generate_response(question)
                
                # Append the generated answer for preview
                
                generated_answers.append(answer)
                
                # Preview after each question
                print(f"Question: {question}")
                print(f"Generated Answer: {answer}\n")
            
            # Add the generated answers to the DataFrame
            
            data['generated_answer'] = generated_answers
            
            # Save the new CSV file with generated answers in the output folder
            
            output_path = os.path.join(output_folder, filename)
            data.to_csv(output_path, index=False)
            print(f"Processed and saved: {output_path}\n")

print("All files processed and saved with Falcon answers.")

Question: "Did Malerie Grady provide the voice for the character Toofie in the movie 'The Oogieloves in the Big Balloon Adventure 2012'?"
Generated Answer: yes

Question: "In the movie 'The Oogieloves in the Big Balloon Adventure 2012', was the character Toofie voiced by someone other than Malerie Grady?"
Generated Answer: yes

Question: "Did Alex Greene play the role of a villain in the movie 'The Oogieloves in the Big Balloon Adventure 2012'?"
Generated Answer: yes

Question: "In the movie 'The Oogieloves in the Big Balloon Adventure 2012', did Alex Greene perform the in-suit role for the character Toofie?"
Generated Answer: yes

Question: "Did Stephanie Renz provide the voice for the character Goobie in the movie 'The Oogieloves in the Big Balloon Adventure 2012'?"
Generated Answer: yes

Question: "In the movie 'The Oogieloves in the Big Balloon Adventure 2012', was the character Zoozie voiced by Stephanie Renz?"
Generated Answer: yes

Question: "Did Carol Sweeney play the role of a

KeyboardInterrupt: 

In [None]:
input_folder1 = '/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/yuvraj/questions/popular/hop1/YES_NO_cleaned'
output_folder1 = '/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/alyona/results1'
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Process each CSV file in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.csv'):
        filepath = os.path.join(input_folder, filename)
        data = pd.read_csv(filepath)
        
        # Check if 'Question' column exists
        
        if 'Question' in data.columns:
            # Initialize an empty list to collect answers for preview
            
            generated_answers = []
            
            # Process each question one at a time
            
            for index, row in data.iterrows():
                question = row['Question']
                answer = generate_response(question)
                
                # Append the generated answer for preview
                
                generated_answers.append(answer)
                
                # Preview after each question
                print(f"Question: {question}")
                print(f"Generated Answer: {answer}\n")
            
            # Add the generated answers to the DataFrame
            
            data['generated_answer'] = generated_answers
            
            # Save the new CSV file with generated answers in the output folder
            
            output_path = os.path.join(output_folder, filename)
            data.to_csv(output_path, index=False)
            print(f"Processed and saved: {output_path}\n")

print("All files processed and saved with Falcon answers.")

In [None]:
# one hop/ yes-no evaluation:
#     least popular:
#         aspectwise accuracy check: (cast, production,plot)
#     popular:
#         aspectwise accuracy check: (cast, production,plot )

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

def calculate_accuracy(Answer, generated_answers):
    correct = sum(Answer == generated_answers)
    total = len(Answer)
    accuracy = correct / total * 100 if total > 0 else 0
    return accuracy

# Folders for Popular and Least Popular movies
popular_folder = '/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/alyona/results1'
least_popular_folder = '/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/alyona/results'

# Initialize dictionaries to store accuracy results



accuracy_results = {
    'Least Popular': {'Cast': [], 'Production': [], 'Plot': []},
    'Popular': {'Cast': [], 'Production': [], 'Plot': []},
}

#  to process a folder and calculate accuracy
def process_folder(folder, category):
    for filename in os.listdir(folder):
        if filename.endswith('.csv'):
            filepath = os.path.join(folder, filename)
            data = pd.read_csv(filepath)
            
            
            if 'Question' in data.columns and 'True Answer' in data.columns and 'generated_answer' in data.columns:
                
                cast_questions = data[data['Question'].str.contains('actor|actress|cast', case=False, na=False)]
                production_questions = data[data['Question'].str.contains('production|filming|studio', case=False, na=False)]
                plot_questions = data[data['Question'].str.contains('plot|story|event', case=False, na=False)]
                
                # Calculate accuracy for each aspect
                cast_accuracy = calculate_accuracy(cast_questions['True Answer'], cast_questions['generated_answer'])
                production_accuracy = calculate_accuracy(production_questions['True Answer'], production_questions['generated_answer'])
                plot_accuracy = calculate_accuracy(plot_questions['True Answer'], plot_questions['generated_answer'])
                
                # Add results to the accuracy dictionary
                accuracy_results[category]['Cast'].append(cast_accuracy)
                accuracy_results[category]['Production'].append(production_accuracy)
                accuracy_results[category]['Plot'].append(plot_accuracy)

# Process both Popular and Least Popular folders
process_folder(popular_folder, 'Popular')
process_folder(least_popular_folder, 'Least Popular')

# Create a DataFrame to store the accuracy results in a table
accuracy_df = pd.DataFrame({
    'Category': ['Least Popular', 'Popular'],
    'Cast Accuracy': [sum(accuracy_results['Least Popular']['Cast']) / len(accuracy_results['Least Popular']['Cast']),
                      sum(accuracy_results['Popular']['Cast']) / len(accuracy_results['Popular']['Cast'])],
    'Production Accuracy': [sum(accuracy_results['Least Popular']['Production']) / len(accuracy_results['Least Popular']['Production']),
                            sum(accuracy_results['Popular']['Production']) / len(accuracy_results['Popular']['Production'])],
    'Plot Accuracy': [sum(accuracy_results['Least Popular']['Plot']) / len(accuracy_results['Least Popular']['Plot']),
                      sum(accuracy_results['Popular']['Plot']) / len(accuracy_results['Popular']['Plot'])],
})

# Display the table

print(accuracy_df)

# Plotting the graph

accuracy_df.set_index('Category', inplace=True)
accuracy_df.plot(kind='bar', figsize=(10, 6))

# Customizing the plot

plt.title('Accuracy Comparison: Least Popular vs Popular Movies')
plt.xlabel('Movie Category')
plt.ylabel('Accuracy (%)')
plt.xticks(rotation=0)
plt.legend(title='Aspect', bbox_to_anchor=(1.05, 1), loc='upper left')

# Show the plot
plt.tight_layout()
plt.show()
