In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import yaml

In [2]:
# Read the API settings from the YAML file that lives one directory above the notebook.
# NOTE(review): yaml.load with FullLoader is kept as-is; if this file only holds plain
# scalars/mappings, yaml.safe_load would be the more conservative choice — confirm.
with open("../api_constants.yaml") as config_file:
    api_constants = yaml.load(config_file, Loader=yaml.FullLoader)

# Only the "RapidAPI" section is used below (base URL, host and key for the requests).
rapidapi = api_constants["RapidAPI"]

In [3]:
# Load the pickled title DataFrame; its `clist` column is what this notebook rewrites.
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source.
df = pd.read_pickle("../Data/netflix-pull-wo-clist.pickle")

In [4]:
# List the available columns; `nfid` and `clist` are the ones used in the cells below.
df.columns

Index(['id', 'title', 'img', 'vtype', 'nfid', 'synopsis', 'year', 'imdbid',
       'clist', 'titledate', 'imdb_rating'],
      dtype='object')

In [5]:
# Flag every title whose clist mentions "more" in any letter case
# (presumably the marker for a truncated country list — confirm against the data).
needs_country_pull = df["clist"].str.contains("more", case=False)

# Keep only the flagged titles; these are the ones looked up through the API later on.
more_titles = df.loc[needs_country_pull]

In [6]:
# Row count = number of titles that need a country lookup (one API call is made per row).
more_titles.shape

(7860, 11)

### 7,860 API calls would take about 79 days at roughly 100 calls per day, so a paid API plan is needed

## Getting Country Code Mappings

In [7]:
# Load the per-country reference table (id, country, countrycode, ... — see the preview below).
# NOTE(review): country_df is not referenced again in the cells visible here.
country_df = pd.read_pickle("../Data/country_data.pickle")

In [8]:
# Preview the first rows of the country table.
country_df.head()

Unnamed: 0,id,country,countrycode,tvids,tseries,tmovs
0,21,Argentina,AR,6188,2248,3940
1,23,Australia,AU,6266,2370,3896
2,26,Belgium,BE,6544,2317,4227
3,29,Brazil,BR,6323,2246,4077
4,33,Canada,CA,6165,2281,3884


In [9]:
def pull(nfid, timeout=30):
    """Query the ``titlecountries`` endpoint for a single Netflix title.

    Parameters
    ----------
    nfid : int or str
        Netflix id of the title; sent as the ``netflixid`` query parameter.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests``.
        Defaults to 30 so that one stalled connection cannot hang a long
        batch of calls indefinitely (``requests`` waits forever when no
        timeout is given).

    Returns
    -------
    requests.Response
        The raw response. The HTTP status is not checked here; callers
        decide how to treat non-2xx answers.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    url = rapidapi["URL"] + "titlecountries"
    headers = {
        "X-RapidAPI-Key": rapidapi["RapidAPI-JEFF-KEY"],
        "X-RapidAPI-Host": rapidapi["RapidAPI-Host"],
    }
    params = {"netflixid": str(nfid)}
    return requests.get(url, headers=headers, params=params, timeout=timeout)

In [13]:
# Column layout of the per-title country table. `nfid` is added by the pull loop;
# the remaining names presumably mirror the fields of each API result — confirm.
cols = [
    'nfid',
    'cc', 'country',
    'seasdet', 'expiredate', 'newdate',
    'audio', 'subtitle',
    'hd', 'uhd', '3d',
]

# Empty frame with that layout, used as the starting point for the collected results.
res_df = pd.DataFrame(columns=cols)

In [14]:
from tqdm import tqdm

nfids = more_titles.nfid

# Collect one frame per title and concatenate once at the end: calling pd.concat
# inside the loop re-copies every accumulated row on each iteration (quadratic).
frames = []
for nfid in tqdm(nfids):
    # One API call per title; "results" holds the per-country records.
    data = pull(nfid).json()["results"]
    temp_df = pd.DataFrame(data)
    # Tag every country row with the title it belongs to.
    temp_df.insert(0, "nfid", nfid)
    frames.append(temp_df)

# Build from a fresh empty frame (instead of extending and re-indexing the existing
# res_df in place) so that re-running this cell produces the same table.
res_df = pd.concat([pd.DataFrame(columns=cols), *frames]).set_index("nfid")

100%|██████████| 7860/7860 [59:35<00:00,  2.20it/s]  


In [15]:
# Persist the collected country rows so the roughly hour-long API pull need not be repeated.
# The relative path writes into the notebook's working directory.
res_df.to_pickle("nfid_country_partial.pickle")

## Add Country Info to Main DF

In [10]:
# Reload the cached API results; res_df is read as a global by add_country_data below.
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source.
res_df = pd.read_pickle("nfid_country_partial.pickle")

In [11]:
def add_country_data(row):
    """Build the full country list string for one title from the API results.

    Parameters
    ----------
    row : pandas.Series
        A row of the title table; only ``row["nfid"]`` is read.

    Returns
    -------
    str
        Comma-separated ``cc:country`` pairs, one per row that the global
        ``res_df`` holds for this nfid.

    Raises
    ------
    KeyError
        If the nfid is not present in the index of ``res_df``.
    """
    nfid = row["nfid"]
    # Select with a list so the result is always a DataFrame. With a scalar key,
    # a title that has exactly one country row comes back as a Series, and zip()
    # would then pair up the *characters* of the two strings.
    cinfo_nfid = res_df.loc[[nfid]]
    # Some country names arrive with trailing whitespace (e.g. "Italy "); trim both
    # parts so the entries are uniform.
    return ",".join(
        f"{str(cc).strip()}:{str(country).strip()}"
        for cc, country in zip(cinfo_nfid["cc"], cinfo_nfid["country"])
    )
    
    

In [12]:
from tqdm import tqdm
tqdm.pandas()  # registers DataFrame.progress_apply


def _resolve_clist(row):
    """Return the complete country list for a row.

    Rows whose clist mentions "more" get the list rebuilt by add_country_data;
    every other row keeps its current value.
    """
    current = row["clist"]
    # Case-insensitive, to agree with the str.contains(..., case=False) filter that
    # selected the titles for the API pull; a case-sensitive test could leave
    # some of those titles unresolved.
    if "more" in current.lower():
        return add_country_data(row)
    return current


# Row-wise apply returns a Series on df's own index, so the assignment stays aligned.
df["clist"] = df.progress_apply(_resolve_clist, axis=1)

100%|██████████| 17850/17850 [00:01<00:00, 15747.38it/s]


In [13]:
# Inspect the country-list column after the fill step.
df["clist"]

0                                            "JP":"Japan"
1                                            "JP":"Japan"
2       "US":"United States","AU":"Australia","BR":"Br...
3       "US":"United States","AU":"Australia","BR":"Br...
4                                      "NL":"Netherlands"
                              ...                        
1846    KR:South Korea,IT:Italy ,MY:Malaysia ,MX:Mexic...
1847                                "GB":"United Kingdom"
1848                                 "US":"United States"
1849    CH:Switzerland ,IN:India ,BR:Brazil ,MX:Mexico...
1850    "CA":"Canada","FR":"France","GB":"United Kingdom"
Name: clist, Length: 17850, dtype: object

In [14]:
# Sanity check: count the rows whose clist still mentions "more" (any letter case).
still_flagged = df["clist"].str.contains("more", case=False)
df.loc[still_flagged].shape

(0, 11)

In [15]:
# Drop the double quotes from every clist entry so that rows which kept their
# original quoted value use the same unquoted cc:country form as the rebuilt ones.
df["clist"] = df["clist"].map(lambda entry: entry.replace('"', ''))

In [18]:
# Save the title table with its completed, quote-free clist column.
# The relative path writes into the notebook's working directory.
df.to_pickle("netflix-data-complete.pickle")