### Importing the Libraries

In [1]:
import pandas as pd
import numpy as np

### Extracting the 2019 movies data from the Wikipedia

In [2]:
link = 'https://en.wikipedia.org/wiki/List_of_American_films_of_2019'
table1 = pd.read_html(link, header= 0)[3]
table2 = pd.read_html(link, header= 0)[4]
table3 = pd.read_html(link, header= 0)[5]
table4 = pd.read_html(link, header= 0)[6]

In [3]:
movies_df = table1.append(table2.append(table3.append(table4,ignore_index= True),ignore_index= True),ignore_index= True)

In [4]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,JANUARY,4,Escape Room,Columbia Pictures,"Adam Robitel (director); Bragi F. Schut, Maria...",[2]
1,JANUARY,4,Rust Creek,IFC Films,Jen McGowan (director); Julie Lipson (screenpl...,[3]
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4]
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5]
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6]


In [5]:
movies_df.shape

(241, 6)

In [6]:
# Getting genre column from TMDB
from tmdbv3api import TMDb
tmdb = TMDb()
tmdb.api_key = ''

In [7]:
from tmdbv3api import Movie
import requests
import json
tmdb_movie = Movie()

In [8]:
def get_genre(x):
    genres = []
    result = tmdb_movie.search(x)
    movie_id = result[0].id
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}' .format(movie_id, tmdb.api_key))
    data = response.json()
    if data['genres']:
        genre_names = ' '
        for i in range(len(data['genres'])):
            genres.append(data['genres'][i]['name'])
        return genre_names.join(genres)
    else:
        np.NaN

In [9]:
movies_df['genres'] = movies_df['Title'].map(lambda x: get_genre(str(x)))

In [10]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,JANUARY,4,Escape Room,Columbia Pictures,"Adam Robitel (director); Bragi F. Schut, Maria...",[2],Thriller Action Mystery Horror
1,JANUARY,4,Rust Creek,IFC Films,Jen McGowan (director); Julie Lipson (screenpl...,[3],Thriller Drama
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4],Thriller
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5],Drama Adventure Family
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6],Comedy Drama


In [11]:
movies_df.isnull().sum()

Opening               0
Opening.1             0
Title                 0
Production company    0
Cast and crew         0
Ref.                  7
genres                1
dtype: int64

In [12]:
movies_df = movies_df.dropna()

In [13]:
movies_df.isnull().sum()

Opening               0
Opening.1             0
Title                 0
Production company    0
Cast and crew         0
Ref.                  0
genres                0
dtype: int64

In [14]:
print(type(movies_df.loc[0, 'genres']))

<class 'str'>


In [15]:
# Converting string to list in genres column
def convert(string):
    li = list(string.split(" ")) 
    return li

In [16]:
movies_df['genres'] = movies_df['genres'].map(lambda x: convert(x))

In [17]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,JANUARY,4,Escape Room,Columbia Pictures,"Adam Robitel (director); Bragi F. Schut, Maria...",[2],"[Thriller, Action, Mystery, Horror]"
1,JANUARY,4,Rust Creek,IFC Films,Jen McGowan (director); Julie Lipson (screenpl...,[3],"[Thriller, Drama]"
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4],[Thriller]
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5],"[Drama, Adventure, Family]"
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6],"[Comedy, Drama]"


In [18]:
movies_df.loc[0, 'genres']

['Thriller', 'Action', 'Mystery', 'Horror']

In [19]:
movies_df.loc[0, 'Cast and crew']

'Adam Robitel (director); Bragi F. Schut, Maria Melnik (screenplay); Taylor Russell, Logan Miller, Deborah Ann Woll, Tyler Labine, Jay Ellis, Nik Dodani, Yorick van Wageningen'

In [20]:
# Getting the cast of the movie
def get_cast(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0:3])

In [21]:
movies_df['cast'] = movies_df['Cast and crew'].map(lambda x: get_cast(x))

In [22]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres,cast
0,JANUARY,4,Escape Room,Columbia Pictures,"Adam Robitel (director); Bragi F. Schut, Maria...",[2],"[Thriller, Action, Mystery, Horror]","[Taylor Russell, Logan Miller, Deborah Ann Woll]"
1,JANUARY,4,Rust Creek,IFC Films,Jen McGowan (director); Julie Lipson (screenpl...,[3],"[Thriller, Drama]","[Hermione Corfield, Jay Paulson, Sean O'Bryan]"
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4],[Thriller],"[Donald Sutherland, Vincent Kartheiser, Oliver..."
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5],"[Drama, Adventure, Family]","[Bryce Dallas Howard, Edward James Olmos, Alex..."
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6],"[Comedy, Drama]","[Bryan Cranston, Kevin Hart, Nicole Kidman]"


In [23]:
movies_df.loc[9, 'cast']

['James Badge Dale', 'Brian Geraghty']

In [24]:
movies_df.columns

Index(['Opening', 'Opening.1', 'Title', 'Production company', 'Cast and crew',
       'Ref.', 'genres', 'cast'],
      dtype='object')

In [25]:
movies_df = movies_df.rename(columns= {'Title': 'original_title'})

In [26]:
# Getting the movie id from TMDB api
def get_id(x):
    result = tmdb_movie.search(x)
    movie_id = result[0].id
    return movie_id

In [27]:
movies_df['movie_id'] = movies_df['original_title'].map(lambda x: get_id(str(x)))

In [28]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,original_title,Production company,Cast and crew,Ref.,genres,cast,movie_id
0,JANUARY,4,Escape Room,Columbia Pictures,"Adam Robitel (director); Bragi F. Schut, Maria...",[2],"[Thriller, Action, Mystery, Horror]","[Taylor Russell, Logan Miller, Deborah Ann Woll]",522681
1,JANUARY,4,Rust Creek,IFC Films,Jen McGowan (director); Julie Lipson (screenpl...,[3],"[Thriller, Drama]","[Hermione Corfield, Jay Paulson, Sean O'Bryan]",561362
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4],[Thriller],"[Donald Sutherland, Vincent Kartheiser, Oliver...",567738
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5],"[Drama, Adventure, Family]","[Bryce Dallas Howard, Edward James Olmos, Alex...",508763
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6],"[Comedy, Drama]","[Bryan Cranston, Kevin Hart, Nicole Kidman]",440472


In [29]:
def get_overview(x):
    result = tmdb_movie.search(x)
    movie_id = result[0].id
    movie_overview = result[0].overview
    return movie_overview

In [30]:
movies_df['overview'] = movies_df['original_title'].map(lambda x: get_overview(str(x)))

In [31]:
movies_df.head()

Unnamed: 0,Opening,Opening.1,original_title,Production company,Cast and crew,Ref.,genres,cast,movie_id,overview
0,JANUARY,4,Escape Room,Columbia Pictures,"Adam Robitel (director); Bragi F. Schut, Maria...",[2],"[Thriller, Action, Mystery, Horror]","[Taylor Russell, Logan Miller, Deborah Ann Woll]",522681,Six strangers find themselves in circumstances...
1,JANUARY,4,Rust Creek,IFC Films,Jen McGowan (director); Julie Lipson (screenpl...,[3],"[Thriller, Drama]","[Hermione Corfield, Jay Paulson, Sean O'Bryan]",561362,When an overachieving college senior makes a w...
2,JANUARY,4,American Hangman,Hangman Justice Productions,Wilson Coneybeare (director/screenplay); Donal...,[4],[Thriller],"[Donald Sutherland, Vincent Kartheiser, Oliver...",567738,An unidentified man posts a live feed on socia...
3,JANUARY,11,A Dog's Way Home,Columbia Pictures,Charles Martin Smith (director); W. Bruce Came...,[5],"[Drama, Adventure, Family]","[Bryce Dallas Howard, Edward James Olmos, Alex...",508763,A Dog’s Way Home chronicles the heartwarming a...
4,JANUARY,11,The Upside,STX Entertainment,Neil Burger (director); Jon Hartmere (screenpl...,[6],"[Comedy, Drama]","[Bryan Cranston, Kevin Hart, Nicole Kidman]",440472,Phillip is a wealthy quadriplegic who needs a ...


In [32]:
movies_2019 = movies_df.loc[:, ['genres','movie_id','original_title','overview','cast']]
movies_2019.head()

Unnamed: 0,genres,movie_id,original_title,overview,cast
0,"[Thriller, Action, Mystery, Horror]",522681,Escape Room,Six strangers find themselves in circumstances...,"[Taylor Russell, Logan Miller, Deborah Ann Woll]"
1,"[Thriller, Drama]",561362,Rust Creek,When an overachieving college senior makes a w...,"[Hermione Corfield, Jay Paulson, Sean O'Bryan]"
2,[Thriller],567738,American Hangman,An unidentified man posts a live feed on socia...,"[Donald Sutherland, Vincent Kartheiser, Oliver..."
3,"[Drama, Adventure, Family]",508763,A Dog's Way Home,A Dog’s Way Home chronicles the heartwarming a...,"[Bryce Dallas Howard, Edward James Olmos, Alex..."
4,"[Comedy, Drama]",440472,The Upside,Phillip is a wealthy quadriplegic who needs a ...,"[Bryan Cranston, Kevin Hart, Nicole Kidman]"


### Combining with old dataset

In [33]:
old = pd.read_csv("new_data18.csv")

In [34]:
new_data = old.append(movies_2019)

In [35]:
new_data.shape

(5799, 5)

In [36]:
new_data.drop_duplicates(subset= 'original_title', keep= 'last', inplace= True)

In [37]:
new_data.shape

(5784, 5)

In [40]:
new_data.isnull().sum()

genres            0
movie_id          0
original_title    0
overview          0
cast              0
dtype: int64

In [41]:
new_data.to_csv("new_data19.csv", index= False)