In [1]:
import os
import pandas as pd
import numpy as np


In [2]:

seed_value = 42
np.random.seed(seed_value)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## Movie Selection

In [3]:
# display the aspect of a movie
# display the nodes and ask for node-edge relation authenticity 
# display question from the aspects and ask for the meaningfulness and complexity level of the question.
# save the authenticity score, question meaningfulness and complexity score with the ground truth value


# process for each category:
# -------------------------------
# 1. sort the movie files from the dataset folder aspectwise
# 2. preprocess the files based on the criteria we followed 
# 3. chose 30 percent of the non empty files with atleast 10 node-edge pairs and 10 questions using random sampling.
# 4. choose on file with aspect, fetch its aspect information from the wikipedia page
# 5. display its aspect content and then one by one present the node-edge pairs for authenticity test
# 6. for each pair, receive the response over five scale and save the responses in a dictionary key will be the scale value and the value will be the count of the scale.
# 7. after the nodes evaluation of the movie, display the 30 percent question sampled randomly.
# 8. display the question, get the response of complexity test and save the match-mismatch score hopwise,questionwise. save the response in a dictionary with the proper keys and values.
# 9. save the dictionary and file names already checked.


In [4]:
from bs4 import BeautifulSoup
import requests
from pathlib import Path
from tqdm import tqdm

In [5]:
    
def save_aspect(aspect, content, movie_name, yor, save_file_path = "/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/Analysis/movie.txt"):
    # print(f"from save aspect, {aspect}")
    introduction = f"Movie_name = {movie_name}, Year or release = {yor}.\n HERE IS THE DETAILS OF MOVIE'S {aspect.upper()}: \n"
    underline = "-----------------------------------------------------\n"
    para = introduction + underline + content
    # print(para)
    # input()
    # print(para)
    # Open the file in write mode and save the paragraph
    with open(save_file_path, 'w') as file:
        file.write(para)
    
    # print(f"Paragraph saved to {save_file_path} for {aspect}.")


def scrape(wiki_link):
    Aspects = [
        'Plot',
        'Cast',
        'Production',
        'Music',
        'Soundtrack',
        'Themes',
        'Accolades',
        ]

    Aspects = [aspect.lower() for aspect in Aspects]
    
    # Send a request to the Wikipedia page
    response = requests.get(wiki_link)
    
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    scraped_dict = {}
    
    #-------------------Summary scraping--------------------------
    # Find the first paragraph after the title, which is usually the summary
    summary_paragraphs = []
    # Wikipedia's summary paragraphs are inside <p> tags but before any <h2> tag
    for paragraph in soup.find_all('p'):
        
        # Ensure the paragraph has text and is not empty
        if paragraph.get_text().strip():
            summary_paragraphs.append(paragraph.get_text().strip())
        
        # Stop once we hit the first section heading (e.g. 'Plot' or 'Contents')
        if paragraph.find_next_sibling(['h2', 'h3']):
            break
    
    scraped_dict["summary"] =  ' '.join(summary_paragraphs)
    
    
    # -------------------Other Aspect Scraping---------------------
    Headings = soup.find_all('div', class_ = 'mw-heading mw-heading2')
    for heading in Headings:
        try:
            aspect, _ = (heading.text).split('[')
        except:
            aspect = heading.text
            
        if aspect.lower() in Aspects:
            next_siblings = heading.find_next_siblings()
            text = ''
            for next_sibling in next_siblings:
                next_sibling_name = next_sibling.name
                # sub_sibling = ''
                # print(f"............next_sibling_name = {next_sibling_name}")
                if (next_sibling_name =='style'):
                    continue
                
                elif (next_sibling_name == 'div'):
                    clss = next_sibling.get('class')
                    
                    if ('mw-heading2' in clss):  # break because heading ended
                        break
                text += " "+ next_sibling.text
            scraped_dict[aspect] = text 
    return scraped_dict

In [6]:
def preprocess_node_df(df):
    # Define a function to check if the edge contains both node names and at least 5 words
    def is_valid_edge(row):
        edge_words = row['edge'].split()
        return (
            len(edge_words) >= 5 and 
            row['node_1'] in row['edge'] and 
            row['node_2'] in row['edge']
        )
    
    # Filter the dataframe using the validation function
    filtered_df = df[df.apply(is_valid_edge, axis=1)].reset_index(drop=True)
    
    return filtered_df

def randomPick_question(df):
    # Create an empty dataframe
    random_row = pd.DataFrame()
    
    # Create an empty dataframe with predefined columns
    random_row = pd.DataFrame(columns=['Column1', 'Column2', 'Column3'])
    
    total_question = len(df)
    if total_question >= 10:
        sample_30_count = int(0.3*total_question)
        # Randomly select a single row
        random_row = df.sample(n=sample_30_count, random_state=seed_value)
        # random_rows = df.sample(n=n, random_state=seed_value)
    
    return random_row

def read_csv(file_path:str):
    if os.path.exists(file_path):
        try:
            df = pd.read_csv(file_path)
        except:
            df = pd.DataFrame()
    else:
        df = pd.DataFrame()
        
    return df
    

def fetch_questions(category, node_file_name, hop):
    questionFile_root = "/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/Dataset/Questions_Answers/Hollywood"
    
    hop_yes_no_path = os.path.join(questionFile_root, category, hop, "yes_no", node_file_name)
    hop_MCQ_S_path = os.path.join(questionFile_root, category, hop, "MCQ_single_correct", node_file_name)
    hop_MCQ_M_path = os.path.join(questionFile_root, category, hop, "MCQ_Multiple_correct", node_file_name)
    
    
    hop_yes_no_questions = randomPick_question(read_csv(hop_yes_no_path))
    hop_MCQ_S_questions = randomPick_question(read_csv(hop_MCQ_S_path))
    hop_MCQ_M_questions = randomPick_question(read_csv(hop_MCQ_M_path))
    
    # print("from fetch_questions")
    # print("-----------------------------")
    # print(len(hop_yes_no_questions))
    # print(len(hop_MCQ_S_questions))
    # print(len(hop_MCQ_M_questions))
    
    return hop_yes_no_questions, hop_MCQ_S_questions, hop_MCQ_M_questions
    

In [7]:
columns = ["YoR", "movie_name", "imdb_rating", "wiki_link", "popular"]
movie_links = pd.read_excel("/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/aspect_wise_knowledge_graph/workspace/Movie_list.xlsx", sheet_name = "hollywood", engine='openpyxl')
movie_links.columns = columns


In [None]:

nodeFile_root = "/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/Dataset/Nodes_Edges/Hollywood"
questionFile_root = ""

Categories = os.listdir(nodeFile_root)


aspect_list = ['Accolades',
'Cast',
'Guardians of the Galaxy Vol',
'Music',
'Plot',
'Production',
'Soundtrack',
'Themes',
'summary']

for category in Categories:
    for aspect in aspect_list:
        # 8. add aspect in both dataframe of node and question
        
        node_file_names = os.listdir(os.path.join(nodeFile_root, category))
        node_file_names = [node_file_name for node_file_name in node_file_names if node_file_name.split(".")[0].split("_")[-1] == aspect]
        for node_file_name in node_file_names:
            # 9. check if the file already covered then continue otherwise procede
            
            # get the context from wikipedia
            movie_name, yor = node_file_name.split("_")[0], node_file_name.split("_")[1]
            print(movie_name)
            wiki_link = movie_links[movie_links.movie_name == movie_name]["wiki_link"].iloc[0]

            scraped_dict = scrape(wiki_link)
            
            try:
                node_file = pd.read_csv(os.path.join(nodeFile_root, category, node_file_name))

                sample_30_count = int(0.3*len(node_file))
                node_file = node_file.sample(n=sample_30_count, random_state=seed_value)
                
            except:
                
                continue
                
            if len(node_file)>=3:
                
                h1q1, h1q2, h1q3 = fetch_questions(category, node_file_name, "Simple_hop1")
                h2q1, h2q2, h2q3 = fetch_questions(category, node_file_name, "Moderate_hop2")
                h3q1, h3q2, h3q3 = fetch_questions(category, node_file_name, "Complex_hop3")
                
                try:
                    #save the scraped aspect of the considered movie                
                    save_aspect(aspect, scraped_dict[aspect], movie_name, yor)
                    print("\n Please check the content from the movie.txt file and then response the questions here")
                    print("-----------------------------------------------------------------------------------------")                    
                    input()
                    
                    # 1. Evaluate nodes (use terminal clear in the eval method)
                    eval_node_dict = eval_nodes(node_file) #returns only the counts
                    
                    # 2. update the node evaluation dataframe of this aspect
                    
                    # 3. Evaluate Questions
                    eval_question_h1_dict = eval_question(h1q1, h1q2, h1q3)
                    eval_question_h2_dict = eval_question(h2q1, h2q2, h2q3)
                    eval_question_h3_dict = eval_question(h3q1, h3q2, h3q3)
                    
                    # 4. merge all the question evaluation
                    
                    # 5. update the question evaluation dataframe of this aspect
                    
                    # 6. save the movie file name so that recompute is omit
                    
                    # 7. check the movie count for taking decision for further process
                    
                    
                except:
                    continue
                    
                    
            
        


Gods of Egypt

 Please check the content from the movie.txt file and then response the questions here
-----------------------------------------------------------------------------------------
Fantasy Island
Baywatch
Hot Pursuit
The Oogieloves in the Big Balloon Adventure

 Please check the content from the movie.txt file and then response the questions here
-----------------------------------------------------------------------------------------
In the Name of the King

 Please check the content from the movie.txt file and then response the questions here
-----------------------------------------------------------------------------------------
All About Steve

 Please check the content from the movie.txt file and then response the questions here
-----------------------------------------------------------------------------------------
Fifty Shades Freed

 Please check the content from the movie.txt file and then response the questions here
-----------------------------------------------

In [None]:
# Send a request to the Wikipedia page
response = requests.get("https://en.wikipedia.org/wiki/Fantasy_Island_(film)")

# Parse the page content
soup = BeautifulSoup(response.content, 'html.parser')


scraped_dict = {}
    
#-------------------Summary scraping--------------------------
# Find the first paragraph after the title, which is usually the summary
summary_paragraphs = []
# Wikipedia's summary paragraphs are inside <p> tags but before any <h2> tag
for paragraph in soup.find_all('p'):
    
    # Ensure the paragraph has text and is not empty
    if paragraph.get_text().strip():
        summary_paragraphs.append(paragraph.get_text().strip())
    
    # Stop once we hit the first section heading (e.g. 'Plot' or 'Contents')
    if paragraph.find_next_sibling(['h2', 'h3']):
        break

scraped_dict["summary"] =  ' '.join(summary_paragraphs)
scraped_dict

# # -------------------Other Aspect Scraping---------------------
# Headings = soup.find_all('div', class_ = 'mw-heading mw-heading2')
# for heading in Headings:
#     try:
#         aspect, _ = (heading.text).split('[')
#     except:
#         aspect = heading.text
        
#     if aspect.lower() in Aspects:
#         next_siblings = heading.find_next_siblings()
#         text = ''
#         for next_sibling in next_siblings:
#             next_sibling_name = next_sibling.name
#             # sub_sibling = ''
#             # print(f"............next_sibling_name = {next_sibling_name}")
#             if (next_sibling_name =='style'):
#                 continue
            
#             elif (next_sibling_name == 'div'):
#                 clss = next_sibling.get('class')
                
#                 if ('mw-heading2' in clss):  # break because heading ended
#                     break
#             text += " "+ next_sibling.text
#         scraped_dict[aspect] = text 
# return scraped_dict

In [None]:
import pandas as pd
import random

# Sample data for nouns
nouns = ["tree", "car", "river", "mountain", "city", "book", "ocean", "computer", "bird", "house"]

# Function to generate a sentence for the edge
def generate_edge(node1, node2):
    # torch.manual_seed(42)
    include_nodes = random.choice([True, False])  # Randomly decide whether to include nodes
    if include_nodes:
        # Include node1 and node2 in the edge sentence
        return f"{node1} is connected to {node2} in some way."
    else:
        # Generate a random sentence without mentioning node1 and node2
        sentences = [
            "There is a strong relationship here.",
            "The connection is abstract but meaningful.",
            "This is a symbolic link.",
            "A significant association exists.",
            "A conceptual bond is present."
        ]
        return random.choice(sentences)

# Generate 10 rows for the DataFrame
data = []
for _ in range(10):
    node1, node2 = random.sample(nouns, 2)  # Randomly select two different nouns
    edge = generate_edge(node1, node2)     # Generate the edge sentence
    data.append({"node_1": node1, "node_2": node2, "edge": edge})

# Create the DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)


In [5]:
import pandas as pd
import numpy as np

def rndm(df):
    # Randomly choose rows
    n = 3  # Number of rows to select
    df = df.sample(n=n, random_state=seed_value)
    return df
    

# Example DataFrame
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': [100, 200, 300, 400, 500]
}
df = pd.DataFrame(data)
print(df)

# Set the seed value for reproducibility
seed_value = 42
np.random.seed(seed_value)


df = rndm(df)

print("Randomly selected rows:")
print(df)


   A   B    C
0  1  10  100
1  2  20  200
2  3  30  300
3  4  40  400
4  5  50  500
Randomly selected rows:
   A   B    C
1  2  20  200
4  5  50  500
2  3  30  300


In [None]:
pd_file = pd.read_csv("/mnt/Data/prabirmondal/prabir/python_program/movie_sense/SRI_KG/Movie_sense_KG/Movie_sense_KG/Dataset/Nodes_Edges/Hollywood/Least_Popular/All About Steve_2009_Plot.csv")

# print(pd_file.head())

print(len(pd_file))
pd_file.columns

pd_file = preprocess_node_df(pd_file)
pd_file.head()

# len(pd_file)
# pd_file.columns

In [None]:
pd_file = pd_file.sample(n=4)
pd_file.head()

In [None]:

for index, row in pd_file.iterrows():
    node1 = row["node_1"]
    node2 = row["node_2"]
    edge = row["edge"]
    print(row["firstname"])