#__Scraping the data:__

In [None]:
# importing libraries
import requests
import json
import pandas as pd
import os
import sys
import time
import pandas as pd
import numpy as np
import ast

In [None]:
# example api request according to tmdb's api documentation: https://api.themoviedb.org/3/movie/550?api_key="API_KEY"
# obtain the api key from here: "https://www.themoviedb.org/settings/api"
# The key is read from the TMDB_API_KEY environment variable when it is set, so
# the real credential does not have to be stored in this file; the masked
# placeholder below is used otherwise.
API_KEY = os.environ.get("TMDB_API_KEY", "##############################")

In [None]:
# result containers: one independent, initially empty list per scraped field
(tvid_list, image_list, created_by_list, episode_run_time_list, first_air_date_list,
 genres_list, networks_list, number_of_episodes_list, number_of_seasons_list) = ([] for _ in range(9))
(origin_country_list, original_name_list, overview_list,
 popularity_list, production_companies_list, status_list) = ([] for _ in range(6))
tagline_list, rating_list, vote_count_list = [], [], []
cast_list, crew_list = [], []

In [None]:
# load the TV ids from TMDB's daily export file (as of 29th December 2021)
# daily file exports run around 7:00 AM UTC, and all files are available by 8:00 AM UTC - https://developers.themoviedb.org/
# The export is read as one JSON object per line. The context manager closes
# the file handle as soon as it has been read, and blank lines are skipped so
# a stray empty line cannot crash json.loads.
with open(os.path.join(sys.path[0], "tv_series_ids_12_29_2021.json"), "r", encoding="utf8") as ids_file:
    data = [json.loads(line) for line in ids_file if line.strip()]
TV_IDS = [my_dict['id'] for my_dict in data]

In [None]:
# iterating over each tv id (length of TV_IDS = 121906): for every English-language
# show, fetch its details, first poster and cast, then append the batch to a CSV file
st = time.time()
for TV_ID in TV_IDS:
    response = requests.get("https://api.themoviedb.org/3/tv/{}?api_key={}".format(TV_ID, API_KEY))
    if response.status_code != 200:
        continue
    tv_dict = response.json()  # parse the body once and reuse it below
    if tv_dict.get('original_language') != 'en':  # scraping for english tv shows
        continue

    tvid_list.append(TV_ID)

    # getting image data: only the first poster is kept; every field stays None
    # when the request fails or the show has no posters
    response_img_obj = requests.get("https://api.themoviedb.org/3/tv/{}/images?api_key={}".format(TV_ID, API_KEY))
    image_dict = response_img_obj.json() if response_img_obj.status_code == 200 else {}
    posters = image_dict.get('posters') or []
    posters_aspect_ratio, posters_file_path, posters_height, posters_width = None, None, None, None
    if posters:
        first_poster = posters[0]
        posters_aspect_ratio = first_poster['aspect_ratio']
        posters_file_path = first_poster['file_path']
        posters_height = first_poster['height']
        posters_width = first_poster['width']
    image_info = {'par': posters_aspect_ratio, 'pfp': posters_file_path, 'ph': posters_height, 'pw': posters_width}
    image_list.append(image_info)

    # getting cast data: something is appended on every path so that cast_list
    # stays the same length as the other columns (pd.DataFrame below requires
    # all columns to have equal length)
    response_credits_obj = requests.get("https://api.themoviedb.org/3/tv/{}/credits?api_key={}".format(TV_ID, API_KEY))
    if response_credits_obj.status_code == 200:
        cast_list.append(response_credits_obj.json()['cast'])
    else:
        cast_list.append([])

    genres_list.append(tv_dict['genres'])
    networks_list.append(tv_dict['networks'])
    original_name_list.append(tv_dict['original_name'])
    overview_list.append(tv_dict['overview'])
    popularity_list.append(tv_dict['popularity'])
    tagline_list.append(tv_dict['tagline'])
    rating_list.append(tv_dict['vote_average'])

temp_dict = {'TV ID': tvid_list, 'IMAGE INFO': image_list,
             'GENRES': genres_list, 'NETWORKS': networks_list,
             'TITLE': original_name_list, 'OVERVIEW': overview_list, 'POPULARITY': popularity_list,
             'TAGLINE': tagline_list, 'RATING': rating_list, 'CAST': cast_list}

tv_df = pd.DataFrame(temp_dict)
tv_df.set_index('TV ID', inplace=True)
# storing scraped data: rows are appended to any existing file; the header row is
# written only when the file is new or empty, so the column names are present
# exactly once for the later read_csv step
csv_path = 'tv_series_db.csv'
write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
tv_df.to_csv(csv_path, mode='a', header=write_header)
print("time taken:", time.time() - st)

#__Preparing the Dataset__

In [None]:
# load the scraped table; the cells below address columns by name (e.g. 'TV ID',
# 'GENRES'), so the file is expected to start with a header row
tvseries = pd.read_csv('tv_series_db.csv')

In [None]:
# preview the first rows of the raw scraped table
tvseries.head()

In [None]:
# replace NaN with meaningful values: an empty string for the text-like
# columns and 0 for the numeric ones
text_columns = ['IMAGE INFO', 'GENRES', 'NETWORKS', 'OVERVIEW', 'TAGLINE', 'CAST']
numeric_columns = ['POPULARITY', 'RATING']
tvseries[text_columns] = tvseries[text_columns].fillna('')
tvseries[numeric_columns] = tvseries[numeric_columns].fillna(0)

In [None]:
# index labels of the rows whose 'TV ID' is missing
missing_tv_id = tvseries['TV ID'].isnull()
tvseries[missing_tv_id].index.tolist()

In [None]:
# dropping the null records: remove every row without a 'TV ID' rather than a
# hard-coded index label, which is only valid for one particular scrape
tvseries = tvseries.dropna(subset=['TV ID'])

##Cleaning image info

In [None]:
def tocleanimageinfo(string, i):
    """Parse one stringified 'IMAGE INFO' cell into a one-element list.

    Args:
        string: Text of a Python literal, as stored in the 'IMAGE INFO' column.
        i: Identifier of the row; only used in the diagnostic messages.

    Returns:
        ``[parsed_value]`` when the text parses, otherwise ``[]`` (blank cell
        or text that ``ast.literal_eval`` rejects).
    """
    # Blank cells (missing values are filled with '') have nothing to parse;
    # return the empty result directly instead of reporting a syntax error.
    if isinstance(string, str) and not string.strip():
        return []
    lst = []
    try:
        # local name chosen so the builtin `dict` is not shadowed
        image_info = ast.literal_eval(string)
        lst.append(image_info)
    except SyntaxError:
        print('Syntax Error at', i)
    except ValueError:
        print('value error at', i)
    return lst
# otypes=[object] stores each returned list as one cell. Without it, np.vectorize
# infers the output type from a trial call on the first row, which fails on
# empty input and can mis-infer the type when that first result is an empty list.
tvseries['IMAGE INFO'] = np.vectorize(tocleanimageinfo, otypes=[object])(tvseries['IMAGE INFO'], tvseries['TV ID'])

In [None]:
# show the type and the content of the first row's cleaned image info
first_image_info = tvseries.iloc[0]['IMAGE INFO']
print(type(first_image_info))
print(first_image_info)

##Cleaning Genres

In [None]:
# inspect the 'GENRES' cell of the first row before cleaning
tvseries.iloc[0]['GENRES']

In [None]:
def tocleangenres(tobecleaned):
    """Return the genre names held in one stringified 'GENRES' cell.

    Args:
        tobecleaned: Text of a Python list literal whose items are dicts
            with a 'name' key.

    Returns:
        The 'name' value of every item, in order; ``[]`` for a blank cell.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when the text is
            not a valid Python literal.
    """
    # Blank cells (missing values are filled with '') would make
    # ast.literal_eval raise SyntaxError; treat them as "no genres".
    if isinstance(tobecleaned, str) and not tobecleaned.strip():
        return []
    return [item['name'] for item in ast.literal_eval(tobecleaned)]
# convert every 'GENRES' cell from its text form into a list of genre names
raw_genres = tvseries['GENRES']
tvseries['GENRES'] = raw_genres.apply(tocleangenres)

In [None]:
# inspect the same 'GENRES' cell after cleaning
tvseries.iloc[0]['GENRES']

##Cleaning cast

In [None]:
# inspect the 'CAST' cell of the row at position 10 before cleaning
tvseries.iloc[10]['CAST']

In [None]:
def tocleancast(tobecleaned):
    """Return the cast member names held in one stringified 'CAST' cell.

    Args:
        tobecleaned: Text of a Python list literal whose items are dicts
            with a 'name' key.

    Returns:
        The 'name' value of every item, in order; ``[]`` for a blank cell.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when the text is
            not a valid Python literal.
    """
    # Blank cells (missing values are filled with '') would make
    # ast.literal_eval raise SyntaxError; treat them as "no cast".
    if isinstance(tobecleaned, str) and not tobecleaned.strip():
        return []
    return [item['name'] for item in ast.literal_eval(tobecleaned)]
# convert every 'CAST' cell from its text form into a list of cast names
raw_cast = tvseries['CAST']
tvseries['CAST'] = raw_cast.apply(tocleancast)

In [None]:
# spot check: cleaned cast of the rows titled 'Friends'
is_friends = tvseries['TITLE'] == 'Friends'
tvseries[is_friends]['CAST']

##Cleaning networks

In [None]:
# inspect the 'NETWORKS' cell of the first row before cleaning
print(tvseries.iloc[0]["NETWORKS"])

In [None]:
def tocleannetworks(tobecleaned):
    """Return the network names held in one stringified 'NETWORKS' cell.

    Args:
        tobecleaned: Text of a Python list literal whose items are dicts
            with a 'name' key.

    Returns:
        The 'name' value of every item, in order; ``[]`` for a blank cell.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when the text is
            not a valid Python literal.
    """
    # Blank cells (missing values are filled with '') would make
    # ast.literal_eval raise SyntaxError; treat them as "no networks".
    if isinstance(tobecleaned, str) and not tobecleaned.strip():
        return []
    return [item['name'] for item in ast.literal_eval(tobecleaned)]
# convert every 'NETWORKS' cell from its text form into a list of network names
raw_networks = tvseries['NETWORKS']
tvseries['NETWORKS'] = raw_networks.apply(tocleannetworks)

In [None]:
# NOTE(review): this only lists the matching index labels; no rows are dropped here
tvseries[tvseries['TITLE'].isnull()].index.tolist() # index labels of tv series with no title

##Popularity and rating were found to be clean:

In [None]:
# print both types explicitly: a notebook cell only echoes its last bare
# expression, so the first result would otherwise be discarded
print(type(tvseries.iloc[0]['POPULARITY']))
print(type(tvseries.iloc[0]['RATING']))

In [None]:
# print both null counts explicitly: a notebook cell only echoes its last bare
# expression, so the first count would otherwise be discarded
print(tvseries['POPULARITY'].isnull().sum())
print(tvseries['RATING'].isnull().sum())

In [None]:
# preview the first rows after all cleaning steps
tvseries.head() # dataset is clean

#__Store the cleaned dataset__:

In [None]:
# write the cleaned frame to CSV; to_csv is called with its defaults, so the
# index is written as well and list-valued cells are stored in their text form
tvseries.to_csv('cleaned_tvseries.csv')