## Extracting features of 2021 movies from Wikipedia

In [1]:
import pandas as pd
import numpy as np
import requests
import bs4 as bs
import urllib.request

In [2]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2021"

In [3]:
source = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(source, 'lxml')

In [4]:
tables = soup.find_all('table', class_='wikitable sortable')

In [5]:
len(tables)

4

In [6]:
type(tables[0])

bs4.element.Tag

In [7]:
df1 = pd.read_html(str(tables[0]))[0]
df2 = pd.read_html(str(tables[1]))[0]
df3 = pd.read_html(str(tables[2]))[0]
df4 = pd.read_html(str(tables[3]))[0]

In [8]:
df = df1.append(df2.append(df3.append(df4, ignore_index=True), ignore_index=True), ignore_index=True)

In [9]:
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.,Ref.
0,JANUARY,1.0,Shadow in the Cloud,Vertical Entertainment,Roseanne Liang (director/screenplay); Max Land...,[2],
1,JANUARY,13.0,The White Tiger,Netflix,Ramin Bahrani (director/screenplay); Adarsh Go...,,
2,JANUARY,14.0,Locked Down,HBO Max / Warner Bros. Pictures,Doug Liman (director); Steven Knight (screenpl...,[3],
3,JANUARY,15.0,The Dig,Netflix / Clerkenwell Films,Simon Stone (director); Moira Buffini (screenp...,[4],
4,JANUARY,15.0,Outside the Wire,Netflix,"Mikael Håfström (director); Rob Yescombe, Rowa...",[5],
...,...,...,...,...,...,...,...
352,DECEMBER,25.0,The Tragedy of Macbeth,Apple TV+ / A24 / IAC Films,Joel Coen (director/screenplay); Denzel Washin...,,[270]
353,DECEMBER,25.0,A Journal for Jordan,Columbia Pictures / Escape Artists / Bron Studios,Denzel Washington (director); Virgil Williams ...,,[271]
354,DECEMBER,25.0,American Underdog,Lionsgate,"Erwin brothers (directors); Jon Erwin, David A...",,[272]
355,DECEMBER,26.0,Memoria,Neon,Apichatpong Weerasethakul (director/acreenplay...,,[273]


In [10]:
df_2021 = df[['Title','Cast and crew']]

In [12]:
df_2021

Unnamed: 0,Title,Cast and crew
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...
3,The Dig,Simon Stone (director); Moira Buffini (screenp...
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa..."
...,...,...
352,The Tragedy of Macbeth,Joel Coen (director/screenplay); Denzel Washin...
353,A Journal for Jordan,Denzel Washington (director); Virgil Williams ...
354,American Underdog,"Erwin brothers (directors); Jon Erwin, David A..."
355,Memoria,Apichatpong Weerasethakul (director/acreenplay...


In [14]:
!pip install tmdbv3api

Collecting tmdbv3api
  Downloading tmdbv3api-1.7.6-py2.py3-none-any.whl (17 kB)
Installing collected packages: tmdbv3api
Successfully installed tmdbv3api-1.7.6


In [15]:
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()
tmdb.api_key = 'bce8b94968a79cd90c7d3cbe5005203b'

In [26]:
from tmdbv3api import Movie
tmdb_movie = Movie()
def get_genre(x):
  genres = []
  result = tmdb_movie.search(x)
  if not result:
    return np.NaN
  else:
    movie_id = result[0].id
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id, tmdb.api_key))
    data_json = response.json()
    if data_json['genres']:
      genre_str = " "
      for i in range(0,len(data_json['genres'])):
        genres.append(data_json['genres'][i]['name'])
      return genre_str.join(genres)
    else:
      return np.NaN

In [27]:
df_2021['genres'] = df_2021['Title'].map(lambda x: get_genre(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
df_2021

Unnamed: 0,Title,Cast and crew,genres
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...,Horror Action War
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...,Drama
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...,Comedy Crime Drama
3,The Dig,Simon Stone (director); Moira Buffini (screenp...,Drama History
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa...",Thriller Action Science Fiction
...,...,...,...
352,The Tragedy of Macbeth,Joel Coen (director/screenplay); Denzel Washin...,Drama War
353,A Journal for Jordan,Denzel Washington (director); Virgil Williams ...,Drama Romance
354,American Underdog,"Erwin brothers (directors); Jon Erwin, David A...",Drama
355,Memoria,Apichatpong Weerasethakul (director/acreenplay...,Drama Fantasy Mystery


In [39]:
df['Cast and crew'].head()

0    Roseanne Liang (director/screenplay); Max Land...
1    Ramin Bahrani (director/screenplay); Adarsh Go...
2    Doug Liman (director); Steven Knight (screenpl...
3    Simon Stone (director); Moira Buffini (screenp...
4    Mikael Håfström (director); Rob Yescombe, Rowa...
Name: Cast and crew, dtype: object

In [30]:
def get_director(x):
  if " (director)" in x:
    return x.split(" (director)")[0]
  elif " (directors)" in x:
    return x.split(" (directors)")[0]
  else:
    return x.split(" (director/screenplay)")[0]

In [31]:
df_2021['director_name'] = df_2021['Cast and crew'].map(lambda x: get_director(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
df_2021['director_name']

0                                         Roseanne Liang
1                                          Ramin Bahrani
2                                             Doug Liman
3                                            Simon Stone
4                                        Mikael Håfström
                             ...                        
352                                            Joel Coen
353                                    Denzel Washington
354                                       Erwin brothers
355    Apichatpong Weerasethakul (director/acreenplay...
356                                                  nan
Name: director_name, Length: 357, dtype: object

In [49]:
df['Cast and crew'].head(1)[0]

'Roseanne Liang (director/screenplay); Max Landis (screenplay); Chloë Grace Moretz, Taylor John Smith, Beulah Koale, Callan Mulvey, Nick Robinson'

In [50]:
df['Cast and crew'][355]

'Apichatpong Weerasethakul (director/acreenplay); Tilda Swinton, Elkin Díaz, Jeanne Balibar, Juan Pablo Urrego, Daniel Giménez Cacho'

In [54]:
def get_actor1(x):
  if "screenplay); " in x:
    return ((x.split("screenplay); ")[-1]).split(", ")[0])
  else:
    return ((x.split("acreenplay); ")[-1]).split(", ")[0])

In [55]:
df_2021['actor_1_name'] = df_2021['Cast and crew'].map(lambda x: get_actor1(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [56]:
df_2021['actor_1_name']

0      Chloë Grace Moretz
1           Adarsh Gourav
2           Anne Hathaway
3          Carey Mulligan
4          Anthony Mackie
              ...        
352     Denzel Washington
353     Michael B. Jordan
354          Zachary Levi
355         Tilda Swinton
356                   nan
Name: actor_1_name, Length: 357, dtype: object

In [57]:
df['Cast and crew'].head(1)[0]

'Roseanne Liang (director/screenplay); Max Landis (screenplay); Chloë Grace Moretz, Taylor John Smith, Beulah Koale, Callan Mulvey, Nick Robinson'

In [58]:
df['Cast and crew'][355]

'Apichatpong Weerasethakul (director/acreenplay); Tilda Swinton, Elkin Díaz, Jeanne Balibar, Juan Pablo Urrego, Daniel Giménez Cacho'

In [59]:
def get_actor2(x):
  if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
    return np.NaN
  elif "screenplay); " in x:
    return ((x.split("screenplay); ")[-1]).split(", ")[1])
  else:
    return ((x.split("acreenplay); ")[-1]).split(", ")[1])

In [60]:
df_2021['actor_2_name'] = df_2021['Cast and crew'].map(lambda x: get_actor2(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [62]:
df_2021['actor_2_name']

0      Taylor John Smith
1          Rajkummar Rao
2       Chiwetel Ejiofor
3          Ralph Fiennes
4           Damson Idris
             ...        
352    Frances McDormand
353         Chanté Adams
354          Anna Paquin
355           Elkin Díaz
356                  NaN
Name: actor_2_name, Length: 357, dtype: object

In [63]:
df['Cast and crew'].head(1)[0]

'Roseanne Liang (director/screenplay); Max Landis (screenplay); Chloë Grace Moretz, Taylor John Smith, Beulah Koale, Callan Mulvey, Nick Robinson'

In [64]:
df['Cast and crew'][355]

'Apichatpong Weerasethakul (director/acreenplay); Tilda Swinton, Elkin Díaz, Jeanne Balibar, Juan Pablo Urrego, Daniel Giménez Cacho'

In [65]:
def get_actor3(x):
  if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
    return np.NaN
  elif "screenplay); " in x:
    return ((x.split("screenplay); ")[-1]).split(", ")[2])
  else:
    return ((x.split("acreenplay); ")[-1]).split(", ")[2])

In [66]:
df_2021['actor_3_name'] = df_2021['Cast and crew'].map(lambda x: get_actor3(str(x)))

In [67]:
df_2021

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...,Horror Action War,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...,Drama,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...,Comedy Crime Drama,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant
3,The Dig,Simon Stone (director); Moira Buffini (screenp...,Drama History,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa...",Thriller Action Science Fiction,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham
...,...,...,...,...,...,...,...
352,The Tragedy of Macbeth,Joel Coen (director/screenplay); Denzel Washin...,Drama War,Joel Coen,Denzel Washington,Frances McDormand,Bertie Carvel
353,A Journal for Jordan,Denzel Washington (director); Virgil Williams ...,Drama Romance,Denzel Washington,Michael B. Jordan,Chanté Adams,Jalon Christian
354,American Underdog,"Erwin brothers (directors); Jon Erwin, David A...",Drama,Erwin brothers,Zachary Levi,Anna Paquin,Dennis Quaid
355,Memoria,Apichatpong Weerasethakul (director/acreenplay...,Drama Fantasy Mystery,Apichatpong Weerasethakul (director/acreenplay...,Tilda Swinton,Elkin Díaz,Jeanne Balibar


In [68]:
df_2021 = df_2021.rename(columns={'Title':'movie_title'})

In [69]:
new_df21 = df_2021.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [70]:
new_df21

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale,Horror Action War,Shadow in the Cloud
1,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas,Drama,The White Tiger
2,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant,Comedy Crime Drama,Locked Down
3,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James,Drama History,The Dig
4,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham,Thriller Action Science Fiction,Outside the Wire
...,...,...,...,...,...,...
352,Joel Coen,Denzel Washington,Frances McDormand,Bertie Carvel,Drama War,The Tragedy of Macbeth
353,Denzel Washington,Michael B. Jordan,Chanté Adams,Jalon Christian,Drama Romance,A Journal for Jordan
354,Erwin brothers,Zachary Levi,Anna Paquin,Dennis Quaid,Drama,American Underdog
355,Apichatpong Weerasethakul (director/acreenplay...,Tilda Swinton,Elkin Díaz,Jeanne Balibar,Drama Fantasy Mystery,Memoria


In [71]:
new_df21['comb'] = new_df21['actor_1_name'] + ' ' + new_df21['actor_2_name'] + ' ' + new_df21['actor_3_name'] + ' ' + new_df21['director_name'] + ' ' + new_df21['genres']

In [82]:
new_df21.isnull().sum()

director_name     0
actor_1_name      0
actor_2_name      9
actor_3_name     26
genres            1
movie_title       1
comb             27
dtype: int64

In [83]:
new_df21 = new_df21.dropna(how='any')

In [84]:
new_df21.isnull().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
comb             0
dtype: int64

In [85]:
new_df21['movie_title'] = new_df21['movie_title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [86]:
new_df21

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale,Horror Action War,shadow in the cloud,Chloë Grace Moretz Taylor John Smith Beulah Ko...
1,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas,Drama,the white tiger,Adarsh Gourav Rajkummar Rao Priyanka Chopra Jo...
2,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant,Comedy Crime Drama,locked down,Anne Hathaway Chiwetel Ejiofor Stephen Merchan...
3,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James,Drama History,the dig,Carey Mulligan Ralph Fiennes Lily James Simon ...
4,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham,Thriller Action Science Fiction,outside the wire,Anthony Mackie Damson Idris Emily Beecham Mika...
...,...,...,...,...,...,...,...
351,Matthew Vaughn,Ralph Fiennes,Gemma Arterton,Rhys Ifans,Action Adventure Thriller War,the king's man,Ralph Fiennes Gemma Arterton Rhys Ifans Matthe...
352,Joel Coen,Denzel Washington,Frances McDormand,Bertie Carvel,Drama War,the tragedy of macbeth,Denzel Washington Frances McDormand Bertie Car...
353,Denzel Washington,Michael B. Jordan,Chanté Adams,Jalon Christian,Drama Romance,a journal for jordan,Michael B. Jordan Chanté Adams Jalon Christian...
354,Erwin brothers,Zachary Levi,Anna Paquin,Dennis Quaid,Drama,american underdog,Zachary Levi Anna Paquin Dennis Quaid Erwin br...


In [87]:
old_df = pd.read_csv('main_data.csv')

In [88]:
old_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
6117,Pete Docter,Jamie Foxx,Tina Fey,Graham Norton,Animation Comedy Fantasy Family,soul,Jamie Foxx Tina Fey Graham Norton Pete Docter ...
6118,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,Action Fantasy Family Comedy,we can be heroes,Priyanka Chopra Jonas Pedro Pascal YaYa Gossel...
6119,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Drama,one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regina...
6120,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Thriller Crime Drama,promising young woman,Carey Mulligan Bo Burnham Alison Brie Emerald ...


In [89]:
#ignore_index : bool, default False
#If True, the resulting axis will be labeled 0, 1, …, n - 1.

final_df = old_df.append(new_df21, ignore_index=True)

In [90]:
final_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
6447,Matthew Vaughn,Ralph Fiennes,Gemma Arterton,Rhys Ifans,Action Adventure Thriller War,the king's man,Ralph Fiennes Gemma Arterton Rhys Ifans Matthe...
6448,Joel Coen,Denzel Washington,Frances McDormand,Bertie Carvel,Drama War,the tragedy of macbeth,Denzel Washington Frances McDormand Bertie Car...
6449,Denzel Washington,Michael B. Jordan,Chanté Adams,Jalon Christian,Drama Romance,a journal for jordan,Michael B. Jordan Chanté Adams Jalon Christian...
6450,Erwin brothers,Zachary Levi,Anna Paquin,Dennis Quaid,Drama,american underdog,Zachary Levi Anna Paquin Dennis Quaid Erwin br...


In [91]:
final_df.to_csv('main_data_till21.csv', index=False)