In [17]:
import os
# Restrict CUDA-based libraries in this process to GPU index 0. The variable is
# set in the first cell so it is in place before later cells import anything
# that might initialise CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## Data Scraping

In [18]:
from bs4 import BeautifulSoup
import requests
from pathlib import Path
from tqdm import tqdm

In [19]:

def scrap_data(wiki_link, save_file_path = "/workspace/orkspace/data_input/cureus/movie.txt"):
    """Download a web page and save the text of all its ``<p>`` elements to a file.

    Parameters
    ----------
    wiki_link : str
        URL of the page to fetch.
    save_file_path : str
        Destination text file. It is overwritten if it already exists.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    page_to_scrape = requests.get(wiki_link, timeout=30)
    # Fail loudly instead of saving the body of an error page as movie text.
    page_to_scrape.raise_for_status()
    soup = BeautifulSoup(page_to_scrape.text, "html.parser")

    # Paragraph texts are concatenated with no separator, as before; join()
    # does it in one pass instead of repeated string concatenation.
    para = ''.join(paragraph.getText() for paragraph in soup.select('p'))

    # Explicit UTF-8: scraped articles contain non-ASCII characters, and the
    # platform-default encoding is not guaranteed to be able to encode them.
    with open(save_file_path, 'w', encoding='utf-8') as file:
        file.write(para)

In [20]:
def save_aspect(aspect, content, save_file_path = "/workspace/orkspace/data_input/cureus/movie.txt"):
    """Write one aspect of a movie to a text file under a descriptive banner.

    The file receives a banner line naming the aspect in upper case, a dashed
    underline, and then ``content``. Any existing file is overwritten.

    Parameters
    ----------
    aspect : str
        Name of the aspect (e.g. "plot"); upper-cased in the banner.
    content : str
        Text of that aspect.
    save_file_path : str
        Destination text file.
    """
    introduction = f"HERE IS THE DETAILS OF MOVIE'S {aspect.upper()}: \n"
    underline = "-----------------------------------------------------\n"
    para = introduction + underline + content

    # Explicit UTF-8: scraped text contains non-ASCII characters (names,
    # dashes, currency signs) that the platform-default encoding may be unable
    # to encode, which would raise UnicodeEncodeError here.
    with open(save_file_path, 'w', encoding='utf-8') as file:
        file.write(para)


def scrape(wiki_link):
    """Scrape a Wikipedia movie article into a dictionary of section texts.

    Parameters
    ----------
    wiki_link : str
        URL of the article.

    Returns
    -------
    dict
        ``"summary"`` maps to the collected lead paragraph text. Every
        level-2 section whose title is one of the wanted aspects (compared
        case-insensitively) adds one entry, keyed by the title as it appears
        on the page and holding the text of the elements that follow the
        heading up to the next level-2 heading.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    """
    wanted_aspects = {
        'plot',
        'cast',
        'production',
        'music',
        'soundtrack',
        'themes',
        'accolades',
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(wiki_link, timeout=30)
    # Fail loudly instead of parsing the body of an error page as an article.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    scraped_dict = {}

    #-------------------Summary scraping--------------------------
    summary_paragraphs = []
    for paragraph in soup.find_all('p'):
        paragraph_text = paragraph.get_text().strip()
        # Skip paragraphs that contain only whitespace.
        if paragraph_text:
            summary_paragraphs.append(paragraph_text)

        # NOTE(review): find_next_sibling() searches ALL later siblings, not
        # just the adjacent one, so this stops at the first paragraph that has
        # an <h2>/<h3> anywhere after it under the same parent. Confirm that
        # this yields the intended lead section on current Wikipedia markup.
        if paragraph.find_next_sibling(['h2', 'h3']):
            break

    scraped_dict["summary"] = ' '.join(summary_paragraphs)

    # -------------------Other Aspect Scraping---------------------
    for heading in soup.find_all('div', class_='mw-heading mw-heading2'):
        # Heading text looks like "Plot[edit]"; keep what precedes the first
        # '['. This also copes with headings that contain no '[' or several.
        aspect = heading.text.split('[', 1)[0].strip()
        if aspect.lower() not in wanted_aspects:
            continue

        section_parts = []
        for sibling in heading.find_next_siblings():
            if sibling.name == 'style':
                continue
            # A <div> without a class attribute returns None from get();
            # fall back to an empty list so the membership test cannot raise.
            if sibling.name == 'div' and 'mw-heading2' in (sibling.get('class') or []):
                break  # next level-2 heading: this section has ended
            section_parts.append(sibling.text)

        # Each element's text is prefixed with a single space, as before.
        scraped_dict[aspect] = ''.join(" " + part for part in section_parts)

    return scraped_dict

## Loading Document

In [21]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

from helpers.df_helpers import documents2Dataframe

In [22]:
## Dataset folder name; the input and output trees use the same one
data_dir = "cureus"
out_dir = data_dir

## Input data directory
inputdirectory = Path(f"/workspace/orkspace/data_input/{data_dir}")
## This is where the output csv files will be written
outputdirectory = Path(f"/workspace/orkspace/data_output/{out_dir}")

In [23]:
# def load_document(inputdirectory):
#     loader = DirectoryLoader(inputdirectory, show_progress=True)
#     documents = loader.load()
    
#     splitter = RecursiveCharacterTextSplitter(
#         chunk_size=1500,
#         chunk_overlap=150,
#         length_function=len,
#         is_separator_regex=False,
#     )
    
#     pages = splitter.split_documents(documents)
#     # print("Number of chunks = ", len(pages))
#     # print(pages[3].page_content)

#     # Create dataframe of chunks
#     df = documents2Dataframe(pages)
#     print("from load document")
#     print(df.head())
#     # df.head()
    
    # return df

## Node generation

In [24]:
# sudo apt-get install libmagic1
# curl -fsSL https://ollama.com/install.sh | sh
#ollama serve
#ollama run zephyr

In [25]:
# !pip install openpyxl
# !pip install pandas
# !pip install langchain
# !pip install -U langchain-community
# !pip install unstructured
# !pip install yachalk

In [26]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

In [27]:
# def node_generation(df):
#     ## To regenerate the graph with LLM, set this to True
#     regenerate = True
    
#     if regenerate:
#         concepts_list = df2Graph(df, model='zephyr:latest')
#         dfg1 = graph2Df(concepts_list)
#         if not os.path.exists(outputdirectory):
#             os.makedirs(outputdirectory)
        
#         dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
#         df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
#     else:
#         dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")
    
#     dfg1.replace("", np.nan, inplace=True)
#     dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
#     dfg1['count'] = 4 
#     ## Increasing the weight of the relation to 4. 
#     ## We will assign the weight of 1 when later the contextual proximity will be calculated. 

#     print("from node generation")
#     print(dfg1.shape)
#     print(dfg1.head())

#     return dfg1

## Start from here

In [28]:
# !pip install openpyxl

In [29]:
import pandas as pd
import os

In [30]:
# Short column labels, assigned positionally to the sheet's columns.
columns = ["YoR", "movie_name", "imdb_rating", "wiki_link", "popular"]

movie_links = pd.read_excel(
    "/workspace/orkspace/Movie_list.xlsx",
    sheet_name="bollywood",
    engine="openpyxl",
)
# Replace whatever labels the sheet produced with the short names above.
movie_links.columns = columns
# movie_links.head()

In [31]:
# Split the movie list by the exact (case-sensitive) label in the "popular" column.
is_popular = movie_links["popular"] == "popular"
is_least_popular = movie_links["popular"] == "Least popular"

popular_movie_links = movie_links[is_popular]
least_popular_movie_links = movie_links[is_least_popular]
# least_popular_movie_links.head()

In [32]:
# For every movie in both categories: scrape the article per aspect, turn each
# aspect into a node/edge dataframe with the LLM helpers, and save one CSV per
# (movie, aspect) under root_output_folder/<popularity label>/.
root_output_folder = "/workspace/orkspace/bollywood"
movie_categories = [least_popular_movie_links, popular_movie_links]

## To regenerate the graph with LLM, set this to True
regenerate = True

# The splitter settings never change, so it is built once instead of once per
# aspect.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

for movie_category in tqdm(movie_categories):
    for index, row in movie_category.iterrows():

        movie_name = row["movie_name"]
        YoR = row["YoR"]
        wiki_link = row["wiki_link"]
        popular = row["popular"]

        # scrape data from the given link aspectwise and get it in a dictionary.
        # A bad link or a network error skips this movie (with a message)
        # instead of aborting the whole run.
        try:
            aspect_dict = scrape(wiki_link)
        except Exception as exc:
            tqdm.write(
                f"[skipped movie] {movie_name} ({YoR}): scraping failed: "
                f"{type(exc).__name__}: {exc}"
            )
            continue

        # Now, generate nodes from each aspect individually after saving their content in the data_input directory one by one.
        for aspect in aspect_dict:
            try:
                # Overwrites the single text file at save_aspect's default
                # path, which lies inside inputdirectory, so the loader below
                # sees only the current aspect.
                save_aspect(aspect, aspect_dict[aspect])

                # load document in dataframe chunk
                loader = DirectoryLoader(inputdirectory, show_progress=True)
                documents = loader.load()
                pages = splitter.split_documents(documents)

                # Create dataframe of chunks
                df = documents2Dataframe(pages)

                # node generation task
                if regenerate:
                    concepts_list = df2Graph(df, model='zephyr:latest')
                    dfg1 = graph2Df(concepts_list)
                    # exist_ok avoids the check-then-create race.
                    os.makedirs(outputdirectory, exist_ok=True)

                    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
                    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
                else:
                    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

                dfg1.replace("", np.nan, inplace=True)
                dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
                # Every extracted relation gets a fixed weight of 4.
                dfg1['count'] = 4

                # save the nodes dataframe in csv_file
                save_folder_name = popular
                # str() keeps numeric titles/years from breaking the join, and
                # '/' is replaced because it would be read as a path separator.
                save_file_name = (
                    "_".join([str(movie_name), str(YoR), aspect]) + ".csv"
                ).replace("/", "_")
                save_folder = os.path.join(root_output_folder, save_folder_name)
                # to_csv does not create missing folders; without this every
                # save fails when the category folder does not exist yet.
                os.makedirs(save_folder, exist_ok=True)
                save_path = os.path.join(save_folder, save_file_name)
                dfg1.to_csv(save_path, index=False)

            except Exception as exc:
                # Still best-effort per aspect, but the failure is reported,
                # and KeyboardInterrupt is no longer swallowed.
                tqdm.write(
                    f"[skipped aspect] {movie_name} ({YoR}) / {aspect}: "
                    f"{type(exc).__name__}: {exc}"
                )
                continue

  0%|          | 0/2 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 32.51it/s]

100%|██████████| 1/1 [00:00<00:00, 16.43it/s]

100%|██████████| 1/1 [00:00<00:00, 30.19it/s]

100%|██████████| 1/1 [00:00<00:00, 84.66it/s]

100%|██████████| 1/1 [00:00<00:00, 29.06it/s]

100%|██████████| 1/1 [00:00<00:00, 35.01it/s]

100%|██████████| 1/1 [00:00<00:00, 20.58it/s]

100%|██████████| 1/1 [00:00<00:00, 48.85it/s]

100%|██████████| 1/1 [00:00<00:00, 22.61it/s]

100%|██████████| 1/1 [00:00<00:00, 41.73it/s]

100%|██████████| 1/1 [00:00<00:00, 57.42it/s]

100%|██████████| 1/1 [00:00<00:00, 64.55it/s]

100%|██████████| 1/1 [00:00<00:00, 25.98it/s]

100%|██████████| 1/1 [00:00<00:00, 28.64it/s]

100%|██████████| 1/1 [00:00<00:00, 24.19it/s]

100%|██████████| 1/1 [00:00<00:00, 61.87it/s]

100%|██████████| 1/1 [00:00<00:00, 161.11it/s]

100%|██████████| 1/1 [00:00<00:00, 27.26it/s]

100%|██████████| 1/1 [00:00<00:00, 41.09it/s]

100%|██████████| 1/1 [00:00<00:00, 41.23it/s]

100%|██████████| 1/1 