### Importing the Libraries

In [1]:
import pandas as pd
import numpy as np

### Extracting the 2020 movies data from the Wikipedia

In [2]:
link = 'https://en.wikipedia.org/wiki/List_of_American_films_of_2020'
table1 = pd.read_html(link, header= 0)[3]
table2 = pd.read_html(link, header= 0)[4]
table3 = pd.read_html(link, header= 0)[5]
table4 = pd.read_html(link, header= 0)[6]

In [3]:
movies_df = table1.append(table2.append(table3.append(table4,ignore_index= True),ignore_index= True),ignore_index= True)

In [4]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,JANUARY,3.0,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2]
1,JANUARY,10.0,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3]
2,JANUARY,10.0,Like a Boss,Paramount Pictures,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4]
3,JANUARY,10.0,Inherit the Viper,Barry Films / Tycor International Film Company,Anthony Jerjen (director); Andrew Crabtree (sc...,[5]
4,JANUARY,10.0,The Sonata,Screen Media Films,Andrew Desmond (director/screenplay); Arthur M...,[6]


In [5]:
movies_df.shape

(201, 6)

In [6]:
# Getting genre column from TMDB
from tmdbv3api import TMDb
tmdb = TMDb()
tmdb.api_key = ''

In [7]:
from tmdbv3api import Movie
import requests
import json
tmdb_movie = Movie()

In [8]:
def get_genre(x):
    genres = []
    result = tmdb_movie.search(x)
    movie_id = result[0].id
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}' .format(movie_id, tmdb.api_key))
    data = response.json()
    if data['genres']:
        genre_names = ' '
        for i in range(len(data['genres'])):
            genres.append(data['genres'][i]['name'])
        return genre_names.join(genres)
    else:
        np.NaN

In [9]:
movies_df['genres'] = movies_df['Title'].map(lambda x: get_genre(str(x)))

In [10]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,JANUARY,3.0,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2],Horror Mystery
1,JANUARY,10.0,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3],Action Horror Science Fiction Thriller
2,JANUARY,10.0,Like a Boss,Paramount Pictures,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4],Comedy
3,JANUARY,10.0,Inherit the Viper,Barry Films / Tycor International Film Company,Anthony Jerjen (director); Andrew Crabtree (sc...,[5],Drama Thriller Crime
4,JANUARY,10.0,The Sonata,Screen Media Films,Andrew Desmond (director/screenplay); Arthur M...,[6],Horror Thriller Mystery


In [11]:
movies_df.isnull().sum()

Opening               0
Opening.1             2
Title                 2
Production company    2
Cast and crew         2
Ref.                  9
genres                3
dtype: int64

In [12]:
movies_df = movies_df.dropna()

In [13]:
movies_df.isnull().sum()

Opening               0
Opening.1             0
Title                 0
Production company    0
Cast and crew         0
Ref.                  0
genres                0
dtype: int64

In [14]:
print(type(movies_df.loc[0, 'genres']))

<class 'str'>


In [15]:
# Converting string to list in genres column
def convert(string):
    li = list(string.split(" ")) 
    return li

In [16]:
movies_df['genres'] = movies_df['genres'].map(lambda x: convert(x))

In [17]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,JANUARY,3.0,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2],"[Horror, Mystery]"
1,JANUARY,10.0,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3],"[Action, Horror, Science, Fiction, Thriller]"
2,JANUARY,10.0,Like a Boss,Paramount Pictures,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4],[Comedy]
3,JANUARY,10.0,Inherit the Viper,Barry Films / Tycor International Film Company,Anthony Jerjen (director); Andrew Crabtree (sc...,[5],"[Drama, Thriller, Crime]"
4,JANUARY,10.0,The Sonata,Screen Media Films,Andrew Desmond (director/screenplay); Arthur M...,[6],"[Horror, Thriller, Mystery]"


In [18]:
movies_df.loc[0, 'genres']

['Horror', 'Mystery']

In [19]:
movies_df.loc[0, 'Cast and crew']

'Nicolas Pesce (director/screenplay); Andrea Riseborough, Demián Bichir, John Cho, Betty Gilpin, Lin Shaye, Jacki Weaver'

In [20]:
# Getting the cast of the movie
def get_cast(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0:3])

In [21]:
movies_df['cast'] = movies_df['Cast and crew'].map(lambda x: get_cast(x))

In [22]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres,cast
0,JANUARY,3.0,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2],"[Horror, Mystery]","[Andrea Riseborough, Demián Bichir, John Cho]"
1,JANUARY,10.0,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3],"[Action, Horror, Science, Fiction, Thriller]","[Kristen Stewart, Vincent Cassel, Jessica Henw..."
2,JANUARY,10.0,Like a Boss,Paramount Pictures,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4],[Comedy],"[Tiffany Haddish, Rose Byrne, Salma Hayek]"
3,JANUARY,10.0,Inherit the Viper,Barry Films / Tycor International Film Company,Anthony Jerjen (director); Andrew Crabtree (sc...,[5],"[Drama, Thriller, Crime]","[Josh Hartnett, Margarita Levieva, Chandler Ri..."
4,JANUARY,10.0,The Sonata,Screen Media Films,Andrew Desmond (director/screenplay); Arthur M...,[6],"[Horror, Thriller, Mystery]","[Freya Tingley, Simon Abkarian, Rutger Hauer]"


In [23]:
movies_df.loc[18, 'cast']

['Winslow Fegley', 'Ophelia Lovibond', 'Craig Robinson']

In [24]:
movies_df.columns

Index(['Opening', 'Opening.1', 'Title', 'Production company', 'Cast and crew',
       'Ref.', 'genres', 'cast'],
      dtype='object')

In [25]:
movies_df = movies_df.rename(columns= {'Title': 'original_title'})

In [26]:
# Getting the movie id from TMDB api
def get_id(x):
    result = tmdb_movie.search(x)
    movie_id = result[0].id
    return movie_id

In [27]:
movies_df['movie_id'] = movies_df['original_title'].map(lambda x: get_id(str(x)))

In [28]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,original_title,Production company,Cast and crew,Ref.,genres,cast,movie_id
0,JANUARY,3.0,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2],"[Horror, Mystery]","[Andrea Riseborough, Demián Bichir, John Cho]",465086
1,JANUARY,10.0,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3],"[Action, Horror, Science, Fiction, Thriller]","[Kristen Stewart, Vincent Cassel, Jessica Henw...",443791
2,JANUARY,10.0,Like a Boss,Paramount Pictures,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4],[Comedy],"[Tiffany Haddish, Rose Byrne, Salma Hayek]",526019
3,JANUARY,10.0,Inherit the Viper,Barry Films / Tycor International Film Company,Anthony Jerjen (director); Andrew Crabtree (sc...,[5],"[Drama, Thriller, Crime]","[Josh Hartnett, Margarita Levieva, Chandler Ri...",634904
4,JANUARY,10.0,The Sonata,Screen Media Films,Andrew Desmond (director/screenplay); Arthur M...,[6],"[Horror, Thriller, Mystery]","[Freya Tingley, Simon Abkarian, Rutger Hauer]",477036


In [29]:
def get_overview(x):
    result = tmdb_movie.search(x)
    movie_id = result[0].id
    movie_overview = result[0].overview
    return movie_overview

In [30]:
movies_df['overview'] = movies_df['original_title'].map(lambda x: get_overview(str(x)))

In [31]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,original_title,Production company,Cast and crew,Ref.,genres,cast,movie_id,overview
0,JANUARY,3.0,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2],"[Horror, Mystery]","[Andrea Riseborough, Demián Bichir, John Cho]",465086,After a young mother murders her family in her...
1,JANUARY,10.0,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3],"[Action, Horror, Science, Fiction, Thriller]","[Kristen Stewart, Vincent Cassel, Jessica Henw...",443791,After an earthquake destroys their underwater ...
2,JANUARY,10.0,Like a Boss,Paramount Pictures,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4],[Comedy],"[Tiffany Haddish, Rose Byrne, Salma Hayek]",526019,Two female friends with very different ideals ...
3,JANUARY,10.0,Inherit the Viper,Barry Films / Tycor International Film Company,Anthony Jerjen (director); Andrew Crabtree (sc...,[5],"[Drama, Thriller, Crime]","[Josh Hartnett, Margarita Levieva, Chandler Ri...",634904,"Since the death of their father, the Riley sib..."
4,JANUARY,10.0,The Sonata,Screen Media Films,Andrew Desmond (director/screenplay); Arthur M...,[6],"[Horror, Thriller, Mystery]","[Freya Tingley, Simon Abkarian, Rutger Hauer]",477036,After a gifted musician inherits a mansion aft...


In [32]:
movies_2020 = movies_df.loc[:, ['genres','movie_id','original_title','overview','cast']]
movies_2020.head()

Unnamed: 0,genres,movie_id,original_title,overview,cast
0,"[Horror, Mystery]",465086,The Grudge,After a young mother murders her family in her...,"[Andrea Riseborough, Demián Bichir, John Cho]"
1,"[Action, Horror, Science, Fiction, Thriller]",443791,Underwater,After an earthquake destroys their underwater ...,"[Kristen Stewart, Vincent Cassel, Jessica Henw..."
2,[Comedy],526019,Like a Boss,Two female friends with very different ideals ...,"[Tiffany Haddish, Rose Byrne, Salma Hayek]"
3,"[Drama, Thriller, Crime]",634904,Inherit the Viper,"Since the death of their father, the Riley sib...","[Josh Hartnett, Margarita Levieva, Chandler Ri..."
4,"[Horror, Thriller, Mystery]",477036,The Sonata,After a gifted musician inherits a mansion aft...,"[Freya Tingley, Simon Abkarian, Rutger Hauer]"


### Combining with old dataset

In [33]:
old = pd.read_csv("new_data19.csv")

In [34]:
new_data = old.append(movies_2020)

In [35]:
new_data.shape

(5975, 5)

In [36]:
new_data.drop_duplicates(subset= 'original_title', keep= 'last', inplace= True)

In [37]:
new_data.shape

(5969, 5)

In [39]:
new_data.isnull().sum()

genres            0
movie_id          0
original_title    0
overview          0
cast              0
dtype: int64

In [40]:
new_data.to_csv("main_data.csv", index= False)