In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/generated/african_dataset.csv
/kaggle/input/tmdb-data/TMDB_movie_dataset_v11.csv


In [3]:
!pip install wikipedia-api tqdm unidecode wikipedia

Collecting wikipedia-api
  Downloading wikipedia_api-0.7.1.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: wikipedia-api, wikipedia
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.7.1-py3-none-any.whl size=14347 sha256=824e9ab98b36048506774a31f8b88d2e18adc6ef1298cb279165f84328ea9ef0
  Stored in directory: /root/.cache/pip/wheels/4c/96/18/b9201cc3e8b47b02b510460210cfd832ccf10c0c4dd0522962
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wik

In [4]:
import requests
from bs4 import BeautifulSoup
import wikipediaapi
import time
import re
from urllib.parse import unquote
from tqdm import tqdm
import unidecode
from datetime import datetime
import concurrent.futures
import aiohttp
import asyncio
import wikipedia
import wikipedia.wikipedia
wikipedia.wikipedia.BeautifulSoup = lambda html: BeautifulSoup(html, 'lxml')

In [110]:
def setup_wiki():
    """Setup Wikipedia API with a custom user agent"""
    user_agent = "AfricanMoviesDataset/1.0 (nathan.mbugua@strathmore.edu)"
    wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        user_agent=user_agent
    )
    return wiki

In [111]:
def fetch_page_content(url):
    """Fetch page content with error handling"""
    headers = {"User-Agent": "AfricanMoviesDataset/1.0 (nathan.mbugua@strathmore.edu)"}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f"Failed to fetch URL: {url} (Error: {e})")
        return None

In [112]:
def get_movie_plot(wiki, movie_url):
    """Extract plot from movie page"""
    if not movie_url:
        return None

    try:
        page_title = movie_url.split("/wiki/")[-1]
        page_title = unquote(page_title)
        page = wiki.page(page_title)
        if page.exists():
            plot_sections = ['Plot', 'Subject', 'Synopsis', 'Description', 'Story']
            for section in page.sections:
                if section.title in plot_sections:
                    return section.text
            
            # Fallback: If ano plot section found and page is short, use content available
            if len(page.text) < 5000:
                # Get the raw HTML content of the movie page using requests
                url = f"https://en.wikipedia.org/wiki/{page_title}"
                response = requests.get(url)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
            
                    content_div = soup.find('div', {'id': 'mw-content-text'})
            
                    if content_div:
                        # Find all <p> tags within the div
                        p_tags = content_div.find_all('p')
                
                        # If there are at least two <p> tags, use the text from them
                        if len(p_tags) >= 2:
                            plot_text = p_tags[0].get_text(strip=True) + " " + p_tags[1].get_text(strip=True)
                            return plot_text
                        # If only one <p> tag is found, return its text
                        elif len(p_tags) > 0:
                            return p_tags[0].get_text(strip=True)
                    
    except Exception as e:
        print(f"Error fetching plot for {movie_url}: {e}")
    
    return None

In [113]:
def extract_release_date(text, current_date=None):
    """Extract release date from text with various formats"""
    # Try year in parentheses
    year_match = re.search(r'\((\d{4})\)', text)
    if year_match:
        return year_match.group(1)
    
    # Try isolated 4-digit year
    year_match = re.search(r'\b(\d{4})\b', text)
    if year_match:
        return year_match.group(1)
    
    return current_date

In [114]:
def clean_page_content(soup):
    """Remove unwanted sections and tables from the page"""
    if not soup:
        return soup
        
    # Remove external links sections
    for table_class in ['nowraplinks', 'sidebar', 'navigation']:
        external_sections = soup.find_all('table', {'class': table_class})
        for section in external_sections:
            section.extract()
            
    # Remove navigation boxes
    nav_boxes = soup.find_all('div', {'class': ['navbox', 'vertical-navbox']})
    for nav_box in nav_boxes:
        nav_box.extract()
        
    # Remove reference sections
    ref_sections = soup.find_all('div', {'class': 'reflist'})
    for ref_section in ref_sections:
        ref_section.extract()
        
    return soup

In [115]:
def process_table_movies(table, country_url, use_url_for_year=False):
    """Process movies from a table structure"""
    movies = []
    if not table:
        return movies

    # Clean the table content first
    table = clean_page_content(BeautifulSoup(str(table), 'html.parser'))
    if not table:
        return movies

    rows = table.find_all('tr')
    if not rows:
        return movies

    # Find title and release date columns
    headers = rows[0].find_all('th')
    title_idx = None
    date_idx = None
    
    for idx, header in enumerate(headers):
        header_text = header.get_text().lower()
        if ("title" in header_text or "film" in header_text) and "film genre" not in header_text:
            title_idx = idx
        if any(date_term in header_text for date_term in ["year", "date", "release"]):
            date_idx = idx

    if title_idx is None:
        print(f"No title column found in {country_url}")
        return movies

    # Extract year from the country_url if needed
    year_from_url = None
    if use_url_for_year:
        match = re.search(r'(\d{4})$', country_url)
        if match:
            year_from_url = match.group(1)

    current_date = None
    for row in rows[1:]:
        cols = row.find_all(['td', 'th'])

        # Check for year row
        if len(cols) == 1 and 'colspan' in cols[0].attrs:
            current_date = cols[0].get_text(strip=True)
            continue

        if len(cols) > title_idx:
            title_col = cols[title_idx]
            link = title_col.find('a')
            
            if link and link.get('href') and "redlink=1" not in link['href']:
                movie_url = "https://en.wikipedia.org" + link['href']
                release_date = None
                
                if date_idx is not None and len(cols) > date_idx:
                    release_date = cols[date_idx].get_text(strip=True)
                if not release_date:
                    release_date = current_date
                if not release_date and year_from_url:
                    release_date = year_from_url

                movies.append({
                    'Title': re.sub(r'\[.*?\]', '', title_col.get_text(strip=True)).strip(),
                    'Release Date': release_date,
                    'URL': movie_url
                })

    return movies

In [128]:
def process_list_movies(content_section, in_main_page=False):
    """Process movies from a list structure"""
    movies = []
    if not content_section:  # Add this check
        return movies

    # Clean the content section first
    content_section = clean_page_content(BeautifulSoup(str(content_section), 'html.parser'))
    if not content_section:
        return movies
        
    movie_items = []

    for ul in content_section.find_all('ul'):
        # Check if the previous h2 is not one of the excluded sections
        prev_h2 = ul.find_previous('h2')
        if not prev_h2 or prev_h2.get('id') not in ['See_also', 'References', 'External_links']:
            movie_items.extend(ul.find_all('li'))
    
    for item in movie_items:
        link = item.find('a')
        if link and link.get('href'):
            href = link.get('href')
            if "redlink=1" not in href and href.startswith('/wiki/'):
                movie_url = "https://en.wikipedia.org" + href
                title = link.get_text(strip=True)
                text_after_link = item.get_text(strip=True).replace(title, "", 1).strip()

                release_date = extract_release_date(text_after_link)
                director = None

                # Extract director if present
                if "by" in text_after_link:
                    director_match = re.search(r'by\s+([^(]+)', text_after_link)
                    if director_match:
                        director = director_match.group(1).strip()

                movie_data = {
                    'Title': title,
                    'Release Date': release_date,
                    'URL': movie_url,
                    'Director': director
                }
                if in_main_page:
                    wiki = setup_wiki()
                    movie_data['Plot'] = get_movie_plot(wiki, movie_url)
                
                movies.append(movie_data)
    
    return movies

In [142]:
def process_nigerian_style_page(soup):
    """Process Nigerian-style pages with year links"""
    movies = []
    if not soup:
        return movies
        
    # Clean the page content first
    soup = clean_page_content(soup)

    # Find year links in the sidebar or content
    year_patterns = [
        r'List_of_Nigerian_films_of_\d{4}',
        r'List_of_Nigerian_films_of_the_\d{4}s'
    ]
    
    for pattern in year_patterns:
        year_links = soup.find_all('a', href=re.compile(pattern))
        for link in year_links:
            if 'redlink=1' not in link.get('href', ''):
                year_url = f"https://en.wikipedia.org{link['href']}"
                year_soup = fetch_page_content(year_url)
                if year_soup:
                    # Clean the year page content
                    year_soup = clean_page_content(year_soup)
                    # Process movies from the year page
                    year_movies = process_table_movies(
                        year_soup.find('table', {'class': 'wikitable'}),
                        year_url,
                        use_url_for_year=True
                    )
                    movies.extend(year_movies)
                time.sleep(0.2)

    return movies

In [143]:
def process_egyptian_style_page(soup):
    """Process Egyptian-style pages with multiple 'hlist' divs containing year links."""
    movies = []
    if not soup:
        return movies

    # Clean the page content first
    soup = clean_page_content(soup)

    # Find all "hlist" divs containing the year links
    hlist_divs = soup.find_all('div', {'class': 'hlist'})
    if not hlist_divs:
        return movies

    # Loop through each hlist div and extract year links
    for hlist_div in hlist_divs:
        year_links = hlist_div.find_all('a', href=True)
        for link in year_links:
            # Skip redlinks by checking if 'redlink=1' is in the href
            if 'redlink=1' in link['href']:
                continue  # Skip processing for redlink
            
            year_url = f"https://en.wikipedia.org{link['href']}"
            year_soup = fetch_page_content(year_url)
            if year_soup:
                # Clean the year page content
                year_soup = clean_page_content(year_soup)
                # Process movies from the year page
                year_movies = process_table_movies(
                    year_soup.find('table', {'class': 'wikitable'}),
                    year_url,
                    use_url_for_year=True
                )
                movies.extend(year_movies)
            time.sleep(0.2)  # Add delay to prevent being blocked by Wikipedia

    return movies


In [139]:
def process_main_page_movies(soup):
    """Process movies listed directly on the main page under country sections."""
    all_movies = []
    wiki = setup_wiki()
    
    # Find all sections containing country headers
    sections = soup.find_all('div', {'class': 'mw-heading'})
    
    # Filter out unwanted sections
    filtered_sections = [
        section for section in sections
        if section.find('h2') and section.find('h2').get('id') not in ['See_also', 'External_links', 'References']
    ]
    
    for section in filtered_sections:
        # Get country header
        header = section.find('h2')
        if not header:
            continue
            
        country_link = header.find('a')
        if not country_link:
            continue
            
        country = country_link.text.strip()
        if not country:
            continue
                    
        # Find the next sibling for either a table or a list
        next_section = section.find_next_sibling()
        if next_section:
            # Check for tables
            if next_section.name == 'table' and 'wikitable' in next_section.get('class', []):
                table = next_section
                movies = []
                rows = table.find_all('tr')[1:]  # Skip header row
                
                for row in rows:
                    cols = row.find_all(['td', 'th'])
                    if len(cols) >= 2:  # Need at least year and title
                        year = cols[0].text.strip()
                        title_col = cols[1]
                        director = cols[2].text.strip() if len(cols) > 2 else None
                        
                        link = title_col.find('a')
                        if link and link.get('href') and "redlink=1" not in link['href']:
                            movie_url = "https://en.wikipedia.org" + link['href']
                            title = title_col.text.strip()
                            print(f"Processing: {title} ({year})")
                            
                            movies.append({
                                'Title': title,
                                'Release Date': year,
                                'URL': movie_url,
                                'Plot': get_movie_plot(wiki, movie_url),
                                'Director': director,
                                'Country': country
                            })
                all_movies.extend(movies)
            
            # Check for lists
            elif next_section.name == 'ul':
                list_movies = process_list_movies(next_section, in_main_page=True)
                for movie in list_movies:
                    movie['Country'] = country
                all_movies.extend(list_movies)
    
    return all_movies

In [144]:
def create_african_movies_dataset():
    """Main function to create the dataset"""
    wiki = setup_wiki()
    base_url = "https://en.wikipedia.org/wiki/List_of_African_films"
    all_movies = []
    
    # Get main page
    soup = fetch_page_content(base_url)
    if not soup:
        return pd.DataFrame()

    soup = clean_page_content(soup)

    # Process all country links
    content_section = soup.find('div', {'id': 'mw-content-text'})
    if not content_section:
        return pd.DataFrame()

    # Process movies listed directly on the main page
    main_page_movies = process_main_page_movies(soup)
    all_movies.extend(main_page_movies)

    # Modified this part to better filter country links
    country_links = []
    for link in content_section.find_all('a', href=re.compile(r'List_of_.*_films')):
        href = link.get('href', '')
        # Skip edit links, redlinks, and specific unwanted links
        if ('redlink=1' not in href and 
            'action=edit' not in href and 
            link.text.strip() != 'edit' and 
            not href.startswith('#See_also') and 
            not href.startswith('#References') and 
            not href.startswith('#External_links')):
            country_links.append(link)
    
    for link in country_links:
        href = link.get('href', '')
        country = link.text.strip()
        
        # Skip if country is empty or just "edit"
        if not country or country.lower() == 'edit':
            continue
            
        print(f"Processing country: {country}")
        
        # Movies are in separate page
        country_url = f"https://en.wikipedia.org{href}"
        print(country_url)
        country_soup = fetch_page_content(country_url)
            
        if country_soup:
            if 'Nigerian' in country_url:
                movies = process_nigerian_style_page(country_soup)
            elif 'Egyptian' in country_url:
                movies = process_egyptian_style_page(country_soup)
            elif 'Burkinab' in country_url:
                table_movies = process_table_movies(
                    country_soup.find_all('table', {'class': 'wikitable'}),
                    country_url
                )
                movies = table_movies
            else:
                table_movies = process_table_movies(
                    country_soup.find_all('table', {'class': 'wikitable'}),
                    country_url
                )
                list_movies = process_list_movies(
                    country_soup.find('div', {'class': 'mw-parser-output'})
                )
                movies = table_movies + list_movies

            # Add country to movies from separate pages
            for movie in movies:
                movie['Country'] = country

            # Get plots for movies
            for movie in movies:
                print(f"Processing: {movie['Title']} ({movie['Release Date'] or 'Unknown date'})")
                plot = get_movie_plot(wiki, movie['URL'])
                movie['Plot'] = plot
                all_movies.append(movie)
                time.sleep(0.2)

    df = pd.DataFrame(all_movies)
    print("Dataset created successfully")
    return df

In [None]:
all_dfs = create_african_movies_dataset()

In [151]:
all_dfs.shape
all_dfs.head(10)

Unnamed: 0,Title,Release Date,URL,Plot,Director,Country
0,The Gods Must Be Crazy,1980.0,https://en.wikipedia.org/wiki/The_Gods_Must_Be...,Xi and his San tribe live happily in the Kalah...,Jamie Uys,Botswana
1,The Gods Must Be Crazy II,1989.0,https://en.wikipedia.org/wiki/The_Gods_Must_Be...,"The film has four storylines, which run in par...",Jamie Uys,Botswana
2,Gito l'ingrat,1992.0,https://en.wikipedia.org/wiki/Gito_l%27ingrat,Gito is a Burundian student who lives in Paris...,Léonce Ngabo,Burundi
3,Amílcar Cabral,2000.0,https://en.wikipedia.org/wiki/Am%C3%ADlcar_Cab...,Amílcar Cabralis adocumentary filmdirected byA...,Ana Ramos Lisoba,Cape Verde
4,Hanami,2024.0,https://en.wikipedia.org/wiki/Hanami_(film),"Hanamiis a drama film, directed by Denise Fern...",Denise Fernandes,Cape Verde
5,White Hotel,2001.0,https://en.wikipedia.org/wiki/White_Hotel_(film),When two women with a video camera follow an A...,"Dianne Griffin, Tobi Solvang",Eritrea
6,Welcome to the Smiling Coast: Living in the Ga...,,https://en.wikipedia.org/wiki/Welcome_to_the_S...,Welcome to the Smiling Coast: Living in the Ga...,,Gambia
7,Jaha's Promise,,https://en.wikipedia.org/wiki/Jaha%27s_Promise,"Jaha's Promise, is a 2017American-Gambiandocum...",,Gambia
8,The Boy Kumasenu,1952.0,https://en.wikipedia.org/wiki/The_Boy_Kumasenu,The film tells the story of a boy called Kumas...,Sean Graham,Ghana
9,Love Brewed in the African Pot,1980.0,https://en.wikipedia.org/wiki/Love_Brewed_in_t...,The film takes place in Ghana during the colon...,Kwaw Ansah,Ghana


In [204]:
all_dfs.to_csv('../datasets/generated/african_dataset.csv', index=False)

In [10]:
tmdb_df = pd.read_csv('../datasets/TMDB_movie_dataset_v11.csv')
tmdb_df.shape

(1153446, 24)

In [8]:
african_df = pd.read_csv('../datasets/generated/african_dataset.csv')
african_df.shape

(956, 6)

In [72]:
def extract_title_variations(title):
    """
    Extract different variations of a title, including those in parentheses.
    """
    if pd.isna(title):
        return []
    
    title = str(title).lower().strip()
    variations = [title]
    
    # Extract main title (before first parenthesis)
    if '(' in title:
        main_title = title.split('(')[0].strip()
        variations.append(main_title)
        
        # Extract all text within parentheses
        parentheses_matches = re.findall(r'\((.*?)\)', title)
        variations.extend([match.strip() for match in parentheses_matches])
    
    return variations

In [98]:
def clean_title(title):
    """
    Clean and standardize movie titles.
    """
    if pd.isna(title):
        return ''
    
    # Convert to lowercase
    title = str(title).lower().strip()
    
    # Remove special characters and extra spaces
    title = re.sub(r'[^\w\s()-]', '', title)
    title = re.sub(r'\s+', ' ', title)
    
    return title.strip()

In [99]:
def preprocess_dataframes(original_df, tmdb_df):
    """
    Preprocess both dataframes by adding cleaned titles and extracting years.
    """
    # Create copies to avoid modifying original data
    original_df = original_df.copy()
    tmdb_df = tmdb_df.copy()
    
    # Clean titles
    original_df['clean_title'] = original_df['Title'].apply(clean_title)
    tmdb_df['clean_title'] = tmdb_df['title'].apply(clean_title)
    tmdb_df['clean_original_title'] = tmdb_df['original_title'].apply(clean_title)
    
    # Extract year from release_date in tmdb_df
    tmdb_df['release_year'] = tmdb_df['release_date'].apply(
        lambda x: int(x[:4]) if pd.notna(x) and len(x) >= 4 else None
    )
    
    return original_df, tmdb_df

In [91]:
def years_match(year1, year2, tolerance=2):
    """
    Check if two years are within a specified tolerance of each other
    """
    if pd.isna(year1) or pd.isna(year2):
        return True  # Accept matches without years
    try:
        return abs(int(year1) - int(year2)) <= tolerance
    except (ValueError, TypeError):
        return True

In [102]:
def find_exact_match(title, year, tmdb_df):
    """
    Find exact matches in either title or original_title.
    """
    # Check clean_title
    matches = tmdb_df[tmdb_df['clean_title'] == title]
    if not matches.empty:
        year_filtered = matches[matches['release_year'].apply(lambda x: years_match(x, year))]
        if not year_filtered.empty:
            return year_filtered.iloc[0]
    
    # Check clean_original_title
    matches = tmdb_df[tmdb_df['clean_original_title'] == title]
    if not matches.empty:
        year_filtered = matches[matches['release_year'].apply(lambda x: years_match(x, year))]
        if not year_filtered.empty:
            return year_filtered.iloc[0]
    
    return None

In [101]:
def find_first_word_match(title, year, tmdb_df):
    """
    Find matches based on the first word of the title.
    """
    first_word = title.split()[0] if title else ''
    if first_word:
        matches = tmdb_df[
            (tmdb_df['clean_title'].str.startswith(first_word + ' ', na=False)) |
            (tmdb_df['clean_original_title'].str.startswith(first_word + ' ', na=False))
        ]
        if not matches.empty:
            year_filtered = matches[matches['release_year'].apply(lambda x: years_match(x, year))]
            if not year_filtered.empty:
                return year_filtered.iloc[0]
    return None

In [103]:
def find_bracket_match(title, year, tmdb_df):
    """
    Find matches based on text before or inside brackets.
    """
    if '(' in title:
        # Try matching the part before brackets
        before_brackets = title.split('(')[0].strip()
        matches = tmdb_df[
            (tmdb_df['clean_title'] == before_brackets) |
            (tmdb_df['clean_original_title'] == before_brackets)
        ]
        if not matches.empty:
            year_filtered = matches[matches['release_year'].apply(lambda x: years_match(x, year))]
            if not year_filtered.empty:
                return year_filtered.iloc[0]
        
        # Try matching the part inside brackets
        inside_brackets = title[title.find("(")+1:title.find(")")].strip()
        matches = tmdb_df[
            (tmdb_df['clean_title'] == inside_brackets) |
            (tmdb_df['clean_original_title'] == inside_brackets)
        ]
        if not matches.empty:
            year_filtered = matches[matches['release_year'].apply(lambda x: years_match(x, year))]
            if not year_filtered.empty:
                return year_filtered.iloc[0]
    return None

In [105]:
def find_matching_movie(row, tmdb_df):
    """
    Find a matching movie using various matching strategies.
    """
    if pd.isna(row['clean_title']):
        return None
        
    year = row.get('year', None)
    title = row['clean_title']
    
    # Try different matching strategies in order
    match = find_exact_match(title, year, tmdb_df)
    if match is not None:
        return match
    
    match = find_first_word_match(title, year, tmdb_df)
    if match is not None:
        return match
    
    match = find_bracket_match(title, year, tmdb_df)
    if match is not None:
        return match
    
    return None

In [135]:
def process_movie_data(original_df, tmdb_df):
    """
    Main function to process and merge movie data.
    """
    # Preprocess dataframes
    original_df, tmdb_df = preprocess_dataframes(original_df, tmdb_df)
    
    # Initialize output dataframes
    plots_df = pd.DataFrame(columns=['id', 'plot'])
    metadata_df = pd.DataFrame(columns=[
        'id', 'title', 'vote_count', 'vote_average', 'runtime',
        'adult', 'original_language', 'popularity', 'poster_path',
        'backdrop_path', 'genres', 'production_countries',
        'release_date', 'spoken_languages', 'keywords'
    ])
    
    metadata_rows = []
    plot_rows = []
    removed_titles = []
    
    with tqdm(total=len(original_df), desc="Processing Movies", unit="movie") as progress:
        for i in range(len(original_df)):
            row = original_df.iloc[i]
            try:
                progress.set_postfix_str(f"Currently processing: {row['Title']}")
                
                if pd.isna(row['Plot']):
                    removed_titles.append(row['Title'])
                    progress.update(1)
                    continue
                
                match = find_matching_movie(row, tmdb_df)
                
                if match is not None:
                    clean_plot = re.sub(r'\[.*?\]', '', row['Plot'])
                    plot_rows.append({
                        'id': match['id'],
                        'plot': clean_plot.strip()
                    })
                
                    metadata_row = match[metadata_df.columns].copy()
                    if pd.isna(metadata_row['production_countries']):
                        metadata_row['production_countries'] = row['Country']
                    metadata_rows.append(metadata_row)
                else:
                    removed_titles.append(row['Title'])
            except Exception as e:
                print(f"Error processing row: {row['Title'] if 'Title' in row else 'Unknown'}")
                print(f"Error: {e}")
                removed_titles.append(row['Title'])
            finally:
                progress.update(1)
    
    if plot_rows:
        plots_df = pd.DataFrame(plot_rows)
    if metadata_rows:
        metadata_df = pd.DataFrame(metadata_rows)
    
    print("Done merging datasets")
    print(f"Number of rows processed: {len(original_df)}")
    print(f"Number of removed titles: {len(removed_titles)}")
    print("Removed Titles:")
    for title in removed_titles:
        print(f"- {title}")
    
    return plots_df, metadata_df

In [107]:
plots_df, metadata_df = process_movie_data(african_df, tmdb_df)

Processing Movies: 994movie [06:43,  2.46movie/s, Currently processing: A World Apart]                                                                                           


Done merging datasets
Number of rows processed: 956
Number of removed titles: 101
Removed Titles:
- Zinabu
- Goldwidows: Women in Lesotho 1991
- Bawina
- Yogera
- Sahara Occidental indépendance ou génocide?
- The Epic of Cheikh Bouamama(الشيخ بوعمامة)
- Outside the Law(خارجون عن القانون)
- L'Oranais(الوهراني)
- Delice Paloma
- Si-Gueriki, la reine-mère
- Wend Kuuni
- Ribo ou le soleil sauvage
- Enah Johnscott
- Léonie Yangba Zowe
- Camille Lepage
- Andre kolingbe
- Rançon d'une alliance, La
- Weddad
- Rossassa Fel Qalb(Bullet in the Heart)
- Berlanti(Berlanti)
- Al-Millionairah al-Saghirah(The Small Millionaire)
- Akhlaq Lil Bai(Virtue for Sale)
- Al-Ustazah Fatimah(Miss Fatimah)
- Al-Malak al-Zalem(The Unjust Angel)
- Ayyamna al-Holwa(Our Beautiful Days)
- Sayyidat al-Qasr(Lady of the Castle)
- Bidaya wa Nihaya(Beginning and End)
- Lokmet El-Aish(A Scrap of Bread)
- Salladin the Victorious(Al Nasser Salah Al Din)
- Mirati Modeer Aam(My Wife, the Director General)
- Khally Balak Min Zo

In [108]:
plots_df.shape

(855, 2)

In [109]:
plots_df.to_csv('african_movie_plots.csv', index=False)
metadata_df.to_csv('african_movie_metadata.csv', index=False)