In [1]:
import pandas as pd
import numpy as np
import requests
import bs4 as bs
import urllib.request

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Extracting features of 2023 movies from Wikipedia

In [2]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2023"

In [3]:
source = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(source,'lxml')

In [4]:
tables = soup.find_all('table',class_='wikitable sortable')

In [5]:
len(tables)

4

In [6]:
df1 = pd.read_html(str(tables[0]))[0]
df2 = pd.read_html(str(tables[1]))[0]
df3 = pd.read_html(str(tables[2]))[0]
df4 = pd.read_html(str(tables[3]).replace("'1\"\'",'"1"'))[0] # avoided "ValueError: invalid literal for int() with base 10: '1"'

  df1 = pd.read_html(str(tables[0]))[0]
  df2 = pd.read_html(str(tables[1]))[0]
  df3 = pd.read_html(str(tables[2]))[0]
  df4 = pd.read_html(str(tables[3]).replace("'1\"\'",'"1"'))[0] # avoided "ValueError: invalid literal for int() with base 10: '1"'


In [7]:
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [8]:
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,J A N U A R Y,6,M3GAN,Universal Pictures / Blumhouse Productions / A...,Gerard Johnstone (director); Akela Cooper (scr...,[3]
1,J A N U A R Y,6,The Old Way,Saban Films / Saturn Films,Brett Donowho (director); Carl W. Lucas (scree...,[4]
2,J A N U A R Y,11,The Devil Conspiracy,Samuel Goldwyn Films,Nathan Frankowski (director); Ed Alan (screenp...,[5]
3,J A N U A R Y,13,Plane,Lionsgate / MadRiver Pictures / Di Bonaventura...,Jean-François Richet (director); Charles Cummi...,[6]
4,J A N U A R Y,13,House Party,Warner Bros. Pictures / New Line Cinema,"Calmatic (director); Jamal Olori, Stephen Glov...",[7]
...,...,...,...,...,...,...
337,D E C E M B E R,22,Memory,Ketchup Entertainment / Mubi,Michel Franco (director/screenplay); Jessica C...,[324]
338,D E C E M B E R,25,The Color Purple,Warner Bros. Pictures / Amblin Entertainment /...,"Blitz Bazawule (director), Marcus Gardley (scr...",[325]
339,D E C E M B E R,25,The Boys in the Boat,Metro-Goldwyn-Mayer / Smokehouse Pictures,"George Clooney (director), Mark L. Smith (scre...",[326]
340,D E C E M B E R,25,Ferrari,Neon / STXfilms / Ketchup Entertainment,"Michael Mann (director), Troy Kennedy Martin (...",[327]


In [9]:
df_2023 = df[['Title','Cast and crew']]

In [10]:
df_2023

Unnamed: 0,Title,Cast and crew
0,M3GAN,Gerard Johnstone (director); Akela Cooper (scr...
1,The Old Way,Brett Donowho (director); Carl W. Lucas (scree...
2,The Devil Conspiracy,Nathan Frankowski (director); Ed Alan (screenp...
3,Plane,Jean-François Richet (director); Charles Cummi...
4,House Party,"Calmatic (director); Jamal Olori, Stephen Glov..."
...,...,...
337,Memory,Michel Franco (director/screenplay); Jessica C...
338,The Color Purple,"Blitz Bazawule (director), Marcus Gardley (scr..."
339,The Boys in the Boat,"George Clooney (director), Mark L. Smith (scre..."
340,Ferrari,"Michael Mann (director), Troy Kennedy Martin (..."


In [11]:
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()
tmdb.api_key = 'YOUR_API_KEY'

In [12]:
from tmdbv3api import Movie
tmdb_movie = Movie() 
def get_genre(x):
    try:
        genres = []
        result = tmdb_movie.search(x)
        if not result:
            return np.NaN
        else:
            movie_id = result[0].id
            response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id, tmdb.api_key))
            data_json = response.json()
            if data_json['genres']:
                genre_str = " " 
                for i in range(0, len(data_json['genres'])):
                    genres.append(data_json['genres'][i]['name'])
                return genre_str.join(genres)
            else:
                return np.NaN
    except Exception as e:
        return np.NaN


In [13]:
df_2023['genres'] =df_2023['Title'].map(lambda x: get_genre(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2023['genres'] =df_2023['Title'].map(lambda x: get_genre(str(x)))


In [14]:
df_2023

Unnamed: 0,Title,Cast and crew,genres
0,M3GAN,Gerard Johnstone (director); Akela Cooper (scr...,Science Fiction Horror
1,The Old Way,Brett Donowho (director); Carl W. Lucas (scree...,Western Drama
2,The Devil Conspiracy,Nathan Frankowski (director); Ed Alan (screenp...,Horror Fantasy Science Fiction Thriller
3,Plane,Jean-François Richet (director); Charles Cummi...,Action
4,House Party,"Calmatic (director); Jamal Olori, Stephen Glov...",Comedy
...,...,...,...
337,Memory,Michel Franco (director/screenplay); Jessica C...,Action Thriller Crime
338,The Color Purple,"Blitz Bazawule (director), Marcus Gardley (scr...",Drama
339,The Boys in the Boat,"George Clooney (director), Mark L. Smith (scre...",Drama History
340,Ferrari,"Michael Mann (director), Troy Kennedy Martin (...",History Drama


In [15]:
def get_director(x):
    if " (director)" in x:
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]

In [16]:
df_2023['director_name'] = df_2023['Cast and crew'].map(lambda x: get_director(str(x)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2023['director_name'] = df_2023['Cast and crew'].map(lambda x: get_director(str(x)))


In [17]:
def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])

In [18]:
df_2023['actor_1_name'] = df_2023['Cast and crew'].map(lambda x: get_actor1(str(x)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2023['actor_1_name'] = df_2023['Cast and crew'].map(lambda x: get_actor1(str(x)))


In [19]:
def get_actor2(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[1])

In [20]:
df_2023['actor_2_name'] = df_2023['Cast and crew'].map(lambda x: get_actor2(str(x)))


In [21]:
def get_actor3(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[2])

In [22]:
df_2023['actor_3_name'] = df_2023['Cast and crew'].map(lambda x: get_actor3(str(x)))


In [23]:
df_2023

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,M3GAN,Gerard Johnstone (director); Akela Cooper (scr...,Science Fiction Horror,Gerard Johnstone,Allison Williams,Violet McGraw,Amie Donald
1,The Old Way,Brett Donowho (director); Carl W. Lucas (scree...,Western Drama,Brett Donowho,Nicolas Cage,Ryan Kiera Armstrong,
2,The Devil Conspiracy,Nathan Frankowski (director); Ed Alan (screenp...,Horror Fantasy Science Fiction Thriller,Nathan Frankowski,Alice Orr-Ewing,Joe Doyle,Eveline Hall
3,Plane,Jean-François Richet (director); Charles Cummi...,Action,Jean-François Richet,Gerard Butler,Mike Colter,Yoson An
4,House Party,"Calmatic (director); Jamal Olori, Stephen Glov...",Comedy,Calmatic,Tosin Cole,Jacob Latimore,Karen Obilom
...,...,...,...,...,...,...,...
337,Memory,Michel Franco (director/screenplay); Jessica C...,Action Thriller Crime,Michel Franco,Jessica Chastain,Peter Sarsgaard,Merritt Wever
338,The Color Purple,"Blitz Bazawule (director), Marcus Gardley (scr...",Drama,Blitz Bazawule,Fantasia Barrino,Taraji P. Henson,Danielle Brooks
339,The Boys in the Boat,"George Clooney (director), Mark L. Smith (scre...",Drama History,George Clooney,Callum Turner,Joel Edgerton,Peter Guinness
340,Ferrari,"Michael Mann (director), Troy Kennedy Martin (...",History Drama,Michael Mann,Adam Driver,Penelope Cruz,Shailene Woodley


In [24]:
df_2023 = df_2023.rename(columns={'Title':'movie_title'})

In [25]:
new_df23 = df_2023.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [26]:
new_df23

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,Gerard Johnstone,Allison Williams,Violet McGraw,Amie Donald,Science Fiction Horror,M3GAN
1,Brett Donowho,Nicolas Cage,Ryan Kiera Armstrong,,Western Drama,The Old Way
2,Nathan Frankowski,Alice Orr-Ewing,Joe Doyle,Eveline Hall,Horror Fantasy Science Fiction Thriller,The Devil Conspiracy
3,Jean-François Richet,Gerard Butler,Mike Colter,Yoson An,Action,Plane
4,Calmatic,Tosin Cole,Jacob Latimore,Karen Obilom,Comedy,House Party
...,...,...,...,...,...,...
337,Michel Franco,Jessica Chastain,Peter Sarsgaard,Merritt Wever,Action Thriller Crime,Memory
338,Blitz Bazawule,Fantasia Barrino,Taraji P. Henson,Danielle Brooks,Drama,The Color Purple
339,George Clooney,Callum Turner,Joel Edgerton,Peter Guinness,Drama History,The Boys in the Boat
340,Michael Mann,Adam Driver,Penelope Cruz,Shailene Woodley,History Drama,Ferrari


In [27]:
new_df23['comb'] = new_df23['actor_1_name'] + ' ' + new_df23['actor_2_name'] + ' '+ new_df23['actor_3_name'] + ' '+ new_df23['director_name'] +' ' + new_df23['genres']

In [28]:
new_df23.isna().sum()

director_name     0
actor_1_name      0
actor_2_name      3
actor_3_name     20
genres            0
movie_title       0
comb             20
dtype: int64

In [29]:
new_df23 = new_df23.dropna(how='any')

In [30]:
new_df23.isna().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
comb             0
dtype: int64

In [31]:
new_df23['movie_title'] = new_df23['movie_title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df23['movie_title'] = new_df23['movie_title'].str.lower()


In [32]:
new_df23

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Gerard Johnstone,Allison Williams,Violet McGraw,Amie Donald,Science Fiction Horror,m3gan,Allison Williams Violet McGraw Amie Donald Ger...
2,Nathan Frankowski,Alice Orr-Ewing,Joe Doyle,Eveline Hall,Horror Fantasy Science Fiction Thriller,the devil conspiracy,Alice Orr-Ewing Joe Doyle Eveline Hall Nathan ...
3,Jean-François Richet,Gerard Butler,Mike Colter,Yoson An,Action,plane,Gerard Butler Mike Colter Yoson An Jean-Franço...
4,Calmatic,Tosin Cole,Jacob Latimore,Karen Obilom,Comedy,house party,Tosin Cole Jacob Latimore Karen Obilom Calmati...
5,John Hyams,Gideon Adlon,Bethlehem Million,Marc Menchaca,Horror Thriller Mystery,sick,Gideon Adlon Bethlehem Million Marc Menchaca J...
...,...,...,...,...,...,...,...
337,Michel Franco,Jessica Chastain,Peter Sarsgaard,Merritt Wever,Action Thriller Crime,memory,Jessica Chastain Peter Sarsgaard Merritt Wever...
338,Blitz Bazawule,Fantasia Barrino,Taraji P. Henson,Danielle Brooks,Drama,the color purple,Fantasia Barrino Taraji P. Henson Danielle Bro...
339,George Clooney,Callum Turner,Joel Edgerton,Peter Guinness,Drama History,the boys in the boat,Callum Turner Joel Edgerton Peter Guinness Geo...
340,Michael Mann,Adam Driver,Penelope Cruz,Shailene Woodley,History Drama,ferrari,Adam Driver Penelope Cruz Shailene Woodley Mic...


In [33]:
old_df = pd.read_csv('2022_and_prior.csv')

In [34]:
old_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
6709,Kasi Lemmons,Naomi Ackie,Stanley Tucci,Ashton Sanders,Music History Drama,whitney houston: i wanna dance with somebody,Naomi Ackie Stanley Tucci Ashton Sanders Kasi ...
6710,Scott Cooper,Christian Bale,Harry Melling,Gillian Anderson,Thriller Crime Horror Mystery,the pale blue eye,Christian Bale Harry Melling Gillian Anderson ...
6711,Sarah Polley,Rooney Mara,Claire Foy,Jessie Buckley,Drama Crime,women talking,Rooney Mara Claire Foy Jessie Buckley Sarah Po...
6712,Marc Forster,Tom Hanks,Mariana Treviño,Rachel Keller,Comedy Drama,a man called otto,Tom Hanks Mariana Treviño Rachel Keller Marc F...


In [35]:
final_df = pd.concat([old_df, new_df23], ignore_index=True)

In [36]:
final_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
7031,Michel Franco,Jessica Chastain,Peter Sarsgaard,Merritt Wever,Action Thriller Crime,memory,Jessica Chastain Peter Sarsgaard Merritt Wever...
7032,Blitz Bazawule,Fantasia Barrino,Taraji P. Henson,Danielle Brooks,Drama,the color purple,Fantasia Barrino Taraji P. Henson Danielle Bro...
7033,George Clooney,Callum Turner,Joel Edgerton,Peter Guinness,Drama History,the boys in the boat,Callum Turner Joel Edgerton Peter Guinness Geo...
7034,Michael Mann,Adam Driver,Penelope Cruz,Shailene Woodley,History Drama,ferrari,Adam Driver Penelope Cruz Shailene Woodley Mic...


In [37]:
final_df.to_csv('2023_and_prior.csv',index=False)