In [7]:
import json
import requests
import pandas as pd
import os 
from bs4 import BeautifulSoup
# Forwarded as the `verify=` argument of the `requests` calls made by the
# functions below (TLS certificate verification switch).
ssl_verify = True

In [8]:
def get_titles_info(title_id: str):
    """Scrape descriptive metadata from a title's public Netflix page.

    Downloads ``https://www.netflix.com/title/<title_id>`` and reads a
    handful of elements out of the returned HTML.

    Args:
        title_id: Netflix title identifier, interpolated into the page URL.

    Returns:
        dict with the keys ``title``, ``overview`` (strings) and ``genres``,
        ``cast``, ``content_is``, ``subtitles``, ``audio`` (lists of strings).
        Any element that is not present in the page keeps its empty default
        (``""`` or ``[]``). The HTTP status code is not inspected.
    """
    title_info_dict = {
        "title": "",
        "overview": "",
        "genres": [],
        "cast": [],
        "content_is": [],
        "subtitles": [],
        "audio": [],
    }

    request_url = f"https://www.netflix.com/title/{title_id}"
    response = requests.get(
        request_url,
        verify=ssl_verify
    )

    soup = BeautifulSoup(response.text, 'html.parser')

    title_h1 = soup.find("h1", class_="title-title")
    if title_h1:
        title_info_dict["title"] = title_h1.text

    overview_div = soup.find("div", class_="title-info-synopsis")
    if overview_div:
        title_info_dict["overview"] = overview_div.text

    # Result key -> class string of the "more details" cell that holds it.
    # Every cell is parsed the same way: one <span> per entry, with the
    # separating commas and surrounding whitespace stripped.
    detail_cell_classes = {
        "genres": "more-details-cell cell-genres",
        "cast": "more-details-cell cell-cast",
        "content_is": "more-details-cell cell-mood-tag",
        "subtitles": "more-details-cell cell-subtitle",
        "audio": "more-details-cell cell-audio",
    }
    for key, css_class in detail_cell_classes.items():
        cell_div = soup.find("div", css_class)
        # Each cell is checked on its own. The audio cell used to be guarded
        # by the subtitles lookup, which crashed when only subtitles existed
        # and dropped audio when only audio existed.
        if cell_div:
            title_info_dict[key] = [
                spn.text.replace(',', '').strip()
                for spn in cell_div.find_all('span')
            ]

    return title_info_dict

In [9]:
def request_netflix_titles(
        params,
        cookies,
        headers,
        genre_id = "83",
        from_ix = 0,
        to_ix = 100,
    ):
    """POST a pathEvaluator query for one slice of a genre's A-Z listing.

    Two paths are requested for the ``from_ix``..``to_ix`` index range of
    ``genre_id``: the ``itemSummary`` of each entry, and a fixed set of
    fields on each entry's ``reference``.

    The HTTP status code is printed; the ``requests`` response object is
    returned as-is, without any status check.
    """
    endpoint = 'https://www.netflix.com/nq/website/memberapi/v8c3bbffe/pathEvaluator'

    # Both path expressions are sent as repeated values of the "path" form field.
    requested_paths = [
        f'["genres",{genre_id},"az",{{"from":{from_ix},"to":{to_ix}}},"itemSummary"]',
        f'["genres",{genre_id},"az",{{"from":{from_ix},"to":{to_ix}}},"reference",["availability","episodeCount","inRemindMeList","queue","summary"]]',
    ]

    response = requests.post(
        endpoint,
        params=params,
        cookies=cookies,
        headers=headers,
        data={'path': requested_paths},
        verify=ssl_verify,
    )

    print(f"fetch status code --> {response.status_code}")
    return response

In [10]:
def request_netflix_episodes_info(title_id, cookies, headers):
    """GET the member-API metadata document for ``title_id``.

    Returns the decoded JSON body when the server answers with HTTP 200.
    For any other status a notice is printed and an empty dict is returned.
    """
    metadata_response = requests.get(
        'https://www.netflix.com/nq/website/memberapi/v8b5020b9/metadata',
        params={
            'movieid': title_id,
            'imageFormat': 'webp',
            'withSize': 'true',
            'materialize': 'true'
        },
        cookies=cookies,
        headers=headers,
        verify=ssl_verify
    )

    if metadata_response.status_code == 200:
        return metadata_response.json()

    print("No episodes or seasons metadata were found")
    return {}


def process_title_episodes_data(title_id, cookies, headers):
    """Flatten a title's season/episode metadata into one dict per episode.

    Calls ``request_netflix_episodes_info`` for ``title_id`` and walks
    ``video -> seasons -> episodes`` in the returned document.

    Args:
        title_id: Netflix title identifier.
        cookies: Cookies forwarded to the metadata request.
        headers: Headers forwarded to the metadata request.

    Returns:
        A list of dicts with the keys ``season_id``, ``season_name``,
        ``season_release_year``, ``season_sequence``, ``episode_id``,
        ``episode_runtime``, ``episode_sequence``, ``episode_name``,
        ``episode_description`` and ``episode_poster_url``. The list is empty
        when no metadata was returned or the document has no seasons.

    Raises:
        KeyError: if a season or episode entry lacks one of the fields read
            with ``[...]`` below.
    """
    result_data = request_netflix_episodes_info(
        title_id=title_id,
        cookies=cookies,
        headers=headers
    )

    season_episodes_info_list = []
    if not result_data:
        return season_episodes_info_list

    # A document without "video"/"seasons"/"episodes" yields no rows instead
    # of raising KeyError.
    # NOTE(review): presumably movie-type titles come back without seasons —
    # confirm against a real payload.
    video_info_dict = result_data.get("video") or {}

    for season_info in video_info_dict.get("seasons") or []:
        # Season-level fields are repeated on every episode row of the season.
        season_fields = {
            "season_id": season_info["id"],
            "season_name": season_info["title"],
            "season_release_year": season_info["year"],
            "season_sequence": season_info["seq"],
        }

        # episode_sequence is the 1-based position inside the season's list.
        for episode_number, episode in enumerate(season_info.get("episodes") or [], start=1):
            stills = episode.get("stills")
            season_episodes_info_list.append({
                **season_fields,
                "episode_id": episode["id"],
                "episode_runtime": episode["runtime"],
                "episode_sequence": episode_number,
                "episode_name": episode["title"],
                "episode_description": episode["synopsis"],
                # First still is used as the poster; None when there are no stills.
                "episode_poster_url": stills[0]["url"] if stills else None,
            })

    return season_episodes_info_list

In [11]:
def process_titles_data(genre_id, summary_data, cookies, headers):
    """Build episode-level records for every title in a genre listing.

    Iterates ``summary_data["jsonGraph"]["genres"][genre_id]["az"]`` and, for
    each entry, combines three sources: the entry's ``itemSummary`` value,
    the page fields from ``get_titles_info`` and the per-episode rows from
    ``process_title_episodes_data``.

    Args:
        genre_id: Key of the genre inside ``summary_data``.
        summary_data: Decoded pathEvaluator response.
        cookies: Cookies forwarded to the episode metadata request.
        headers: Headers forwarded to the episode metadata request.

    Returns:
        A flat list of dicts, one per episode, each carrying the episode
        fields plus the title-level fields. A title for which no episode rows
        were produced contributes nothing to the list.
    """
    title_info_list = []

    for ix, item_summ in summary_data["jsonGraph"]["genres"][genre_id]['az'].items():
        item_summ_val = item_summ["itemSummary"].get("value")
        if not item_summ_val:
            # First entry without a value ends the walk.
            print("No data is present, stopping here")
            break

        title_id = str(item_summ_val["id"])
        print(f"processing title {int(ix)+1}: id -> {item_summ_val['id']}")

        misc_title_info_dict = get_titles_info(title_id)

        title_episodes_info_list = process_title_episodes_data(
            title_id,
            cookies=cookies,
            headers=headers
        )

        rating_info = item_summ_val["maturity"]["rating"]
        # A missing, None or empty reason string becomes an empty list
        # instead of raising (or yielding [""]).
        rating_reason = rating_info.get("specificRatingReason")

        title_info_dict = {
            "id": title_id,
            "title": item_summ_val["title"],
            "type": item_summ_val["type"],
            "release_year": item_summ_val["releaseYear"],
            "runtime": item_summ_val.get("infoDensityRuntime"),
            "backdrop_path": item_summ_val["boxArt"]["url"],
            "made_by_netflix": item_summ_val["isOriginal"],
            "maturity_rating": rating_info["value"],
            "maturity_warning": rating_info["maturityDescription"],
            "maturity_rating_reason": rating_reason.split(', ') if rating_reason else [],
            "maturity_rating_board": rating_info["board"],
        }

        # Merge the scraped fields. A scraped value replaces an API value only
        # when it is non-empty, so the scraper's "" default for "title" can no
        # longer wipe out the title delivered by the API. Keys that exist only
        # in the scraped dict are always added, keeping the column set stable.
        for key, value in misc_title_info_dict.items():
            if value or key not in title_info_dict:
                title_info_dict[key] = value

        # Stamp the title-level fields onto every episode row.
        for episode_info in title_episodes_info_list:
            episode_info.update(title_info_dict)

        title_info_list.extend(title_episodes_info_list)

    return title_info_list

In [12]:
def main():
    """Fetch one slice of a genre listing and save it as CSV and JSON.

    Requests the titles of ``genre_id`` in the ``from_ix``..``to_ix`` range,
    expands them to episode-level records via ``process_titles_data`` and
    writes the result to ``csv_files/`` as a ``.csv`` file and a sibling
    ``.json`` file.

    The two Netflix cookies are read from the ``NETFLIX_SECURE_ID`` and
    ``NETFLIX_ID`` environment variables when set; otherwise the
    ``<your_netflix_id>`` placeholders are used and must be edited by hand.

    Returns:
        The resulting ``pandas.DataFrame``, or ``None`` when the listing
        request did not return HTTP 200 or no records were produced.
    """
    from_ix = 0
    to_ix = 10
    genre_id = "83"
    file_name = "netflix_tv_shows_from_{from_ix}_to_{to_ix}.csv"

    params = {
        'webp': 'true',
        'drmSystem': 'widevine',
        'isVolatileBillboardsEnabled': 'true',
        'routeAPIRequestsThroughFTL': 'false',
        'hasVideoMerchInBob': 'true',
        'hasVideoMerchInJaw': 'true',
        'falcor_server': '0.1.0',
        'withSize': 'true',
        'materialize': 'true',
        'original_path': '/shakti/mre/pathEvaluator',
    }

    cookies = {
        'SecureNetflixId': os.environ.get('NETFLIX_SECURE_ID', '<your_netflix_id>'),
        'NetflixId': os.environ.get('NETFLIX_ID', '<your_netflix_id>'),
        'profilesNewSession': '2'
    }

    headers = {
        'authority': 'www.netflix.com',
        # Was '/', which is not a valid media range; '*/*' (accept anything)
        # is presumably what was intended.
        'accept': '*/*',
        'accept-language': 'en-GB,en;q=0.7',
        'content-type': 'application/x-www-form-urlencoded',
    }

    # genre_id is forwarded explicitly so that the listing request and the
    # lookup inside process_titles_data always refer to the same genre.
    response = request_netflix_titles(
        params=params,
        cookies=cookies,
        headers=headers,
        genre_id=genre_id,
        from_ix=from_ix,
        to_ix=to_ix,
    )

    if response.status_code != 200:
        print("failed to get data. Input correct Netflix Id or check on with correct genre_id and index")
        return None

    summary_data = response.json()
    title_info_list = process_titles_data(
        genre_id=genre_id,
        summary_data=summary_data,
        cookies=cookies,
        headers=headers,
    )

    df = pd.DataFrame.from_records(title_info_list)

    if df.empty:
        print("nothing to save...")
        return None

    # Dump the records both as JSON (raw list of dicts) and as CSV.
    csv_foldername = "csv_files"
    os.makedirs(csv_foldername, exist_ok=True)

    csv_filename = os.path.join(
        csv_foldername,
        file_name.format(from_ix=from_ix, to_ix=to_ix)
    )
    json_filename = os.path.splitext(csv_filename)[0] + '.json'

    with open(json_filename, 'w', encoding='utf-8') as fi:
        json.dump(title_info_list, fi)

    df.to_csv(csv_filename, index=False)
    print(f"CSV file: {csv_filename} saved successfully!")
    return df
# Run the scrape. `df` is the DataFrame returned by main(), or None when the
# listing request failed or there was nothing to save.
df = main()



fetch status code --> 200
processing title 1: id -> 81166747




processing title 2: id -> 81044208




processing title 3: id -> 81684733




processing title 4: id -> 81056700




processing title 5: id -> 81382019




processing title 6: id -> 80215997




CSV file: csv_files/netflix_tv_shows_from_0_to_5.csv saved successfully!
