In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from IPython.display import display

# Define the base URL of the website to scrape
base_url = 'https://malayalamsubtitles.org/page/'

# Define the number of pages to scrape
num_pages = 2

# Initialize a list to store all scraped data
all_data = []

# Function to extract parts of the text using regular expressions
def extract_parts(text):
    match = re.match(r'(.*?)\sഭാഷ\s(.*?)\sസംവിധാനം\s(.*?)\sപരിഭാഷ\s(.*?)\sജോണർ\s(.*)', text)
    if match:
        return match.groups()
    else:
        return (None, None, None, None, None)

# Function to extract rating
def extract_rating(text):
    match = re.search(r'(\d\.\d)/10', text)
    if match:
        return match.group(1) + "/10"
    else:
        return None

# Function to extract director name
def extract_director(text):
    match = re.search(r'ജോണർ\s(.*)\s\d\.\d/10', text)
    if match:
        return match.group(1)
    else:
        return None

# Retry logic
retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    method_whitelist=["HEAD", "GET", "OPTIONS"]
)
adapter = HTTPAdapter(max_retries=retry_strategy)

# Loop through each page
for page_number in range(1, num_pages + 1):
    # Construct the URL for the current page
    url = f'{base_url}{page_number}/'
    
    # Send a GET request to the URL with retry logic
    with requests.Session() as session:
        session.mount("https://", adapter)
        try:
            response = session.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Failed to retrieve the webpage: {url}")
            continue
    
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract specific information from the page
    # For example, let's extract all the links on the page
    links = soup.find_all('a')
    
    # Set to store unique links
    unique_links = set()
    
    # Filter and add unique links that start with 'https://malayalamsubtitles.org/languages'
    for link in links:
        href = link.get('href', '')
        if href.startswith('https://malayalamsubtitles.org/languages'):
            unique_links.add(href)
    
    # Iterate through unique links and scrape desired information
    for link in unique_links:
        # Send a GET request to the link with retry logic
        with requests.Session() as session:
            session.mount("https://", adapter)
            try:
                sub_response = session.get(link)
                sub_response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Failed to retrieve content from {link}")
                continue
        
        # Parse the HTML content of the subpage
        sub_soup = BeautifulSoup(sub_response.content, 'html.parser')
        
        # Find the tag containing the specified class attribute
        download_links = sub_soup.find_all('a', class_='wpdm-download-link download-on-click btn btn-primary', rel='nofollow', href='#')
        
        # Extract data-downloadurl attribute from each tag
        for download_link in download_links:
            data_downloadurl = download_link.get('data-downloadurl')
            
            # Find the meta tags with property="og:description", "og:image", and "og:title"
            og_description = sub_soup.find('meta', property='og:description').get('content', '') if sub_soup.find('meta', property='og:description') else None
            og_image = sub_soup.find('meta', property='og:image').get('content', '') if sub_soup.find('meta', property='og:image') else None
            og_title = sub_soup.find('meta', property='og:title').get('content', '') if sub_soup.find('meta', property='og:title') else None

             # Find the figure tag with class="alignleft size-large"
            figure_tag = sub_soup.find('figure', class_='alignleft size-large')
            figure_href = figure_tag.find('a')['href'] if figure_tag else None
          
        # Find the tags with class="entry-categories" and "entry-tags"
            entry_categories_tag = sub_soup.find('span', class_='entry-categories')
            entry_categories_content = entry_categories_tag.text.strip() if entry_categories_tag else None
            
            entry_tags_tag = sub_soup.find('span', class_='entry-tags')
            entry_tags_content = entry_tags_tag.text.strip() if entry_tags_tag else None

            # Append the extracted data to the all_data list
            all_data.append((link, data_downloadurl, og_description, og_image, og_title, figure_href, entry_categories_content, entry_tags_content))

# Create a DataFrame with URL, data-downloadurl, og:description, og:image, og:title, author, figure_href, entry-categories, and entry-tags attributes
df = pd.DataFrame(all_data, columns=['URL', 'data-downloadurl', 'og:description', 'og:image', 'og:title', 'figure_href', 'entry-categories', 'entry-tags'])


# Add the 'director_to_rating' column
df['director_to_rating'] = df['og:description'].apply(extract_director)

# Add the 'rating' column
df['rating'] = df['og:description'].apply(extract_rating)

# Apply the function to the 'og:description' column and expand it into separate columns
df[['Column1', 'Column2', 'Column3', 'Column4', 'Column5']] = df['og:description'].apply(lambda x: pd.Series(extract_parts(x)))

# Assuming df is your DataFrame and you want to remove the 'Column5' column
df.drop(columns=['Column5'], inplace=True)

# display(df)
# print(df.columns)
# # Save the DataFrame as an Excel file
# excel_file_path = 'malayalamsubtitles_data.xlsx'
# df.to_excel(excel_file_path, index=False)
# print("Data has been saved to 'scraped_data.xlsx'")

#print(df.columns)
# Rename duplicate columns to make them unique
df = df.rename(columns={'Column1': 'Column1_new', 'Column2': 'Column2_new', 'Column3': 'Column3_new', 'Column4': 'Column4_new'})

# Define the desired column order
desired_order = ['Column1_new', 'og:title', 'Column2_new', 'Column3_new', 'entry-tags', 'Column4_new', 'entry-categories', 'URL', 'og:image', 'figure_href', 'rating', 'director_to_rating', 'data-downloadurl', 'og:description']

# Reindex columns
df = df.reindex(columns=desired_order)

# Rename columns
df = df.rename(columns={'Column1_new': 'release_no', 'og:title': 'title', 'Column2_new': 'language', 'Column3_new': 'director', 'entry-tags': 'author', 'Column4_new': 'translated_by', 'entry-categories': 'tags', 'URL': 'website_link', 'og:image': 'poster', 'figure_href': 'imdb', 'rating': 'rating', 'director_to_rating': 'genre', 'data-downloadurl': 'download_link', 'og:description': 'description'})

# Display the DataFrame
# display(df)



# Split the 'title' column into two columns
df[['title1', 'title2']] = df['title'].str.split(' / ', expand=True)

# Remove the data after the term '(' in the 'title' column
df['title2'] = df['title2'].str.split('(', n=1).str[0]

# Remove the end part '- എംസോൺ' from every row in the 'title' column
df['title'] = df['title'].str.replace(' - എംസോൺ$', '')

# Remove the term 'എംസോൺ റിലീസ് – ' from every row in the 'release_num' column
df['release_no'] = df['release_no'].str.replace('എംസോൺ റിലീസ് – ', '')

# Extract the data within brackets and put it into a new column
df['year'] = df['title'].str.extract(r'\((\d{4})\)')


# Extract the term after 'languages/' up to the next '/'
df['lang'] = df['website_link'].str.extract(r'/languages/([^/]+)/')

# Remove the word "Tagged: " from the 'entry-tags' column
df['author'] = df['author'].str.replace('Tagged: ', '')

# Remove the word "Tagged: " from the 'entry-tags' column
df['tags'] = df['tags'].str.replace('Filed Under: ', '')



# Rearrange the last 4 columns in front of the second column
df.insert(1, 'title1', df.pop('title1'))
df.insert(2, 'title2', df.pop('title2'))
df.insert(3, 'year', df.pop('year'))
df.insert(4, 'lang', df.pop('lang'))


# Display the DataFrame
display(df)
# Save the DataFrame as an Excel file
excel_file_path = 'malayalamsubles_data.xlsx'
df.to_excel(excel_file_path, index=False)
print("Data has been saved to 'scraped_data.xlsx'")


  from pandas.core import (
  retry_strategy = Retry(


Unnamed: 0,release_no,title1,title2,year,lang,title,language,director,author,translated_by,tags,website_link,poster,imdb,rating,genre,download_link,description
0,3330,Damsel,ഡാംസെൽ,2024,english,Damsel / ഡാംസെൽ (2024) - എംസോൺ,ഇംഗ്ലീഷ്,Juan Carlos Fresnadillo,Giri PS,ഗിരി പി. എസ്.,"Action, Adventure, English, Fantasy",https://malayalamsubtitles.org/languages/engli...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt13452446/,6.1/10,"ആക്ഷൻ, അഡ്വഞ്ചർ, ഫാന്റസി",https://malayalamsubtitles.org/download/damsel...,എംസോൺ റിലീസ് – 3330 ഭാഷ ഇംഗ്ലീഷ് സംവിധാനം Juan...
1,887,Always: Sunset on Third Street,ഓൾവേസ്: സൺസെറ്റ് ഓൺ തേഡ് സ്ട്രീറ്റ്,2005,japanese,Always: Sunset on Third Street / ഓൾവേസ്: സൺസെറ...,ജാപ്പനീസ്,Takashi Yamazaki,Subeesh Chittariparamb,സുബീഷ് ചിറ്റാരിപ്പറമ്പ്.,"Comedy, Drama, Family, Japanese",https://malayalamsubtitles.org/languages/japan...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt0488870/,7.7/10,"കോമഡി, ഡ്രാമ, ഫാമിലി",https://malayalamsubtitles.org/download/always...,എംസോൺ റിലീസ് – 887 ഭാഷ ജാപ്പനീസ് സംവിധാനം Taka...
2,3329 ഓസ്കാർ ഫെസ്റ്റ് 2024 – 09,Perfect Days,പെർഫക്റ്റ് ഡേയ്സ്,2023,japanese,Perfect Days / പെർഫക്റ്റ് ഡേയ്സ് (2023) - എംസോൺ,ജാപ്പനീസ്,Wim Wenders,Elvin John Paul,എല്‍വിന്‍ ജോണ്‍ പോള്‍,"Drama, Japanese, Oscar Fest 2024",https://malayalamsubtitles.org/languages/japan...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt27503384/,7.9/10,ഡ്രാമ,https://malayalamsubtitles.org/download/perfec...,എംസോൺ റിലീസ് – 3329 ഓസ്കാർ ഫെസ്റ്റ് 2024 – 09 ...
3,217,Indiana Jones and the Kingdom of the Crystal S...,ഇൻഡിയാന ജോൺസ് ആൻഡ് ദ കിങ്ഡം ഓഫ് ദ ക്രിസ്റ്റൽ സ...,2008,english,Indiana Jones and the Kingdom of the Crystal S...,ഇംഗ്ലീഷ്,Steven Spielberg,Vishnu Prasad,വിഷ്ണു പ്രസാദ്,"Action, Adventure, English",https://malayalamsubtitles.org/languages/engli...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt0367882/,6.2/10,"ആക്ഷൻ, അഡ്വഞ്ചർ",https://malayalamsubtitles.org/download/indian...,എംസോൺ റിലീസ് – 217 ഭാഷ ഇംഗ്ലീഷ് സംവിധാനം Steve...
4,216 MSONE GOLD RELEASE,Indiana Jones and the Last Crusade,ഇൻഡിയാന ജോൺസ് ആൻഡ് ദ ലാസ്റ്റ് ക്രൂസേഡ്,1989,english,Indiana Jones and the Last Crusade / ഇൻഡിയാന ജ...,ഇംഗ്ലീഷ്,Steven Spielberg,Vishnu Prasad,വിഷ്ണു പ്രസാദ്,"Action, Adventure, English, Msone Gold",https://malayalamsubtitles.org/languages/engli...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt0097576/,8.2/10,"ആക്ഷൻ, അഡ്വഞ്ചർ",https://malayalamsubtitles.org/download/indian...,എംസോൺ റിലീസ് – 216 MSONE GOLD RELEASE ഭാഷ ഇംഗ്...
5,3331,Postman to Heaven,പോസ്റ്റ്മാൻ ടു ഹെവൻ,2009,korean,Postman to Heaven / പോസ്റ്റ്മാൻ ടു ഹെവൻ (2009)...,കൊറിയൻ,Lee Hyeong-min,Aravind Kumar,അരവിന്ദ് കുമാർ,"Drama, Fantasy, Korean, Romance",https://malayalamsubtitles.org/languages/korea...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt2012626/,6.6/10,"ഡ്രാമ, ഫാന്റസി, റൊമാൻസ്",https://malayalamsubtitles.org/download/postma...,എംസോൺ റിലീസ് – 3331 ഭാഷ കൊറിയൻ സംവിധാനം Lee Hy...
6,215 MSONE GOLD RELEASE,Indiana Jones and the Temple of Doom,ഇൻഡിയാന ജോൺസ് ആൻഡ് ദ ടെമ്പിൾ ഓഫ് ഡൂം,1984,english,Indiana Jones and the Temple of Doom / ഇൻഡിയാന...,ഇംഗ്ലീഷ്,Steven Spielberg,Vishnu Prasad,വിഷ്ണു പ്രസാദ്,"Action, Adventure, English, Msone Gold",https://malayalamsubtitles.org/languages/engli...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt0087469/,7.5/10,"ആക്ഷൻ, അഡ്വഞ്ചർ",https://malayalamsubtitles.org/download/indian...,എംസോൺ റിലീസ് – 215 MSONE GOLD RELEASE ഭാഷ ഇംഗ്...
7,213 MSONE GOLD RELEASE,Indiana Jones and the Raiders of the Lost Ark,ഇൻഡിയാന ജോൺസ് ആൻഡ് ദ റെയ്ഡേഴ്സ് ഓഫ് ദ ലോസ്റ്റ്...,1981,english,Indiana Jones and the Raiders of the Lost Ark ...,ഇംഗ്ലീഷ്,Steven Spielberg,Vishnu Prasad,വിഷ്ണു പ്രസാദ്,"Action, Adventure, English, Msone Gold",https://malayalamsubtitles.org/languages/engli...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt0082971/,8.4/10,"ആക്ഷൻ, അഡ്വഞ്ചർ",https://malayalamsubtitles.org/download/indian...,എംസോൺ റിലീസ് – 213 MSONE GOLD RELEASE ഭാഷ ഇംഗ്...
8,3325 MSONE GOLD RELEASE,Lupin III: The Castle of Cagliostro,ലൂപാന്‍ III: ദ കാസില്‍ ഓഫ് കാഗ്ലിയോസ്ട്രോ,1979,japanese,Lupin III: The Castle of Cagliostro / ലൂപാന്‍ ...,ജാപ്പനീസ്,Hayao Miyazaki,Elvin John Paul,എല്‍വിന്‍ ജോണ്‍ പോള്‍,"Action, Adventure, Animation, Japanese, Msone ...",https://malayalamsubtitles.org/languages/japan...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt0079833/,7.6/10,"ആക്ഷൻ, അഡ്വഞ്ചർ, അനിമേഷന്‍",https://malayalamsubtitles.org/download/lupin-...,എംസോൺ റിലീസ് – 3325 MSONE GOLD RELEASE ഭാഷ ജാപ...
9,3324,Death's Game [K-Drama],ഡെത്ത്സ് ഗെയിം [കെ-ഡ്രാമ],2023,korean,Death's Game [K-Drama] / ഡെത്ത്സ് ഗെയിം [കെ-ഡ്...,കൊറിയൻ,Byung-Hoon Ha,"Aravind Kumar, Sajith TS",സജിത്ത് ടി. എസ് & അരവിന്ദ് കുമാർ,"Drama, Fantasy, Korean, Web Series",https://malayalamsubtitles.org/languages/korea...,https://malayalamsubtitles.org/wp-content/uplo...,https://www.imdb.com/title/tt26225038/,8.6/10,"ഡ്രാമ, ഫാന്റസി",https://malayalamsubtitles.org/download/deaths...,എംസോൺ റിലീസ് – 3324 ഭാഷ കൊറിയൻ സംവിധാനം Byung-...


Data has been saved to 'scraped_data.xlsx'
