## Extracting the movies of 2020

In [1]:
import pandas as pd
import numpy as np
import requests
import bs4 as bs
import urllib.request

In [2]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"

source = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(source,'lxml')

tables = soup.find_all('table',class_='wikitable sortable')

len(tables)

4

In [3]:
type(tables[0])

bs4.element.Tag

In [4]:
df1 = pd.read_html(str(tables[0]))[0]
df2 = pd.read_html(str(tables[1]))[0]
df3 = pd.read_html(str(tables[2]))[0]
df4 = pd.read_html(str(tables[3]).replace("'1\"\'",'"1"'))[0] # avoided "ValueError: invalid literal for int() with base 10: '1"'

In [5]:
df = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)
df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,JANUARY,3,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2]
1,JANUARY,10,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3]
2,JANUARY,10,Like a Boss,Paramount Pictures,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4]
3,JANUARY,10,Three Christs,IFC Films,Jon Avnet (director/screenplay); Eric Nazarian...,
4,JANUARY,10,Inherit the Viper,Barry Films / Tycor International Film Company,Anthony Jerjen (director); Andrew Crabtree (sc...,[5]


In [6]:
df_20 = df[['Title','Cast and crew']]
df_20.head()

Unnamed: 0,Title,Cast and crew
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...
1,Underwater,"William Eubank (director); Brian Duffield, Ada..."
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col..."
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...


In [7]:
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()
tmdb.api_key = '31370eb33aa123089b518141fd2f4347'

In [9]:
from tmdbv3api import Movie
tmdb_movie = Movie() 
def get_genre(x):
    genres = []
    result = tmdb_movie.search(x)
    if not result:
      return np.NaN
    else:
      movie_id = result[0].id
      response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))
      data_json = response.json()
      if data_json['genres']:
          genre_str = " " 
          for i in range(0,len(data_json['genres'])):
              genres.append(data_json['genres'][i]['name'])
          return genre_str.join(genres)
      else:
          return np.NaN

In [10]:
df_20['genres'] = df_20['Title'].map(lambda x: get_genre(str(x)))
df.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_20['genres'] = df_20['Title'].map(lambda x: get_genre(str(x)))


Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
267,DECEMBER,25,We Can Be Heroes,Netflix / Troublemaker Studios,Robert Rodriguez (director/screenplay); Priyan...,[239]
268,DECEMBER,25,News of the World,Universal Pictures / Perfect World Pictures,Paul Greengrass (director/screenplay); Luke Da...,[240]
269,DECEMBER,25,One Night in Miami...,Amazon Studios,Regina King (director); Kemp Powers (screenpla...,[241]
270,DECEMBER,25,Promising Young Woman,Focus Features / FilmNation Entertainment,Emerald Fennell (director/screenplay); Carey M...,[242]
271,DECEMBER,30,Pieces of a Woman,Netflix / Bron Studios,Kornél Mundruczó (director); Kata Wéber (scree...,[243]


In [11]:
def get_director(x):
    if " (director)" in x:
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]

In [12]:
df_20['director_name'] = df_20['Cast and crew'].map(lambda x: get_director(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_20['director_name'] = df_20['Cast and crew'].map(lambda x: get_director(str(x)))


In [13]:
def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])

In [14]:
df_20['actor_1_name'] = df_20['Cast and crew'].map(lambda x: get_actor1(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_20['actor_1_name'] = df_20['Cast and crew'].map(lambda x: get_actor1(str(x)))


In [15]:
def get_actor2(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[1])

In [16]:
df_20['actor_2_name'] = df_20['Cast and crew'].map(lambda x: get_actor2(str(x)))

In [17]:
def get_actor3(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[2])

In [18]:
df_20['actor_3_name'] = df_20['Cast and crew'].map(lambda x: get_actor3(str(x)))

In [19]:
df_20.tail()

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
267,We Can Be Heroes,Robert Rodriguez (director/screenplay); Priyan...,Action Fantasy Family Comedy,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin
268,News of the World,Paul Greengrass (director/screenplay); Luke Da...,Action Adventure Drama Western,Paul Greengrass,Tom Hanks,Helena Zengel,
269,One Night in Miami...,Regina King (director); Kemp Powers (screenpla...,Drama,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge
270,Promising Young Woman,Emerald Fennell (director/screenplay); Carey M...,Thriller Crime Drama,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie
271,Pieces of a Woman,Kornél Mundruczó (director); Kata Wéber (scree...,Drama,Kornél Mundruczó,Vanessa Kirby,Shia LaBeouf,Molly Parker


In [20]:
df_20 = df_20.rename(columns={'Title':'movie_title'})
df_20.head()

Unnamed: 0,movie_title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,Horror Mystery Thriller,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho
1,Underwater,"William Eubank (director); Brian Duffield, Ada...",Action Horror Science Fiction Thriller,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...,Drama,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,Drama Thriller Crime,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs


In [21]:
df_20['comb'] = df_20['actor_1_name'] + ' ' + df_20['actor_2_name'] + ' '+ df_20['actor_3_name'] + ' '+ df_20['director_name'] +' ' + df_20['genres']

## Extracting the movies of 2021

In [22]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2021"

source = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(source,'lxml')

tables = soup.find_all('table',class_='wikitable sortable')

len(tables)

4

In [23]:
df1 = pd.read_html(str(tables[0]))[0]
df2 = pd.read_html(str(tables[1]))[0]
df3 = pd.read_html(str(tables[2]))[0]
df4 = pd.read_html(str(tables[3]).replace("'1\"\'",'"1"'))[0] # avoided "ValueError: invalid literal for int() with base 10: '1"'

In [24]:
df_21 = df1.append(df2.append(df3.append(df4,ignore_index=True),ignore_index=True),ignore_index=True)
df_21.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref,Ref.
0,JANUARY,1.0,Shadow in the Cloud,Vertical Entertainment,Roseanne Liang (director/screenplay); Max Land...,[2],
1,JANUARY,13.0,The White Tiger,Netflix,Ramin Bahrani (director/screenplay); Adarsh Go...,,
2,JANUARY,14.0,Locked Down,HBO Max / Warner Bros. Pictures,Doug Liman (director); Steven Knight (screenpl...,[3],
3,JANUARY,15.0,The Dig,Netflix / Clerkenwell Films,Simon Stone (director); Moira Buffini (screenp...,[4],
4,JANUARY,15.0,Outside the Wire,Netflix,"Mikael Håfström (director); Rob Yescombe, Rowa...",[5],


In [25]:
df_21 = df_21[['Title','Cast and crew']]
df_21.head()

Unnamed: 0,Title,Cast and crew
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...
3,The Dig,Simon Stone (director); Moira Buffini (screenp...
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa..."


In [26]:
df_21['genres'] = df_21['Title'].map(lambda x: get_genre(str(x)))
df_21.tail()

Unnamed: 0,Title,Cast and crew,genres
223,The Matrix 4,Lana Wachowski (director/screenplay); Aleksand...,Action Science Fiction
224,Sing 2,Garth Jennings (director/screenplay); Matthew ...,Music Animation Comedy Family
225,The King's Man,Matthew Vaughn (director/screenplay); Karl Gaj...,Action Adventure Comedy
226,Downton Abbey 2,Simon Curtis (director); Julian Fellowes (scre...,Drama
227,Cyrano,Joe Wright (director); Erica Schmidt (screenpl...,Drama Comedy History Romance


In [27]:
def get_director(x):
    if " (director)" in x:
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]

In [28]:
df_21['director_name'] = df_21['Cast and crew'].map(lambda x: get_director(str(x)))

In [29]:
def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])

In [32]:
df_21['actor_1_name'] = df_21['Cast and crew'].map(lambda x: get_actor1(str(x)))

In [31]:
def get_actor2(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[1])

In [33]:
df_21['actor_2_name'] = df_21['Cast and crew'].map(lambda x: get_actor2(str(x)))

In [34]:
def get_actor3(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[2])

In [35]:
df_21['actor_3_name'] = df_21['Cast and crew'].map(lambda x: get_actor3(str(x)))

In [36]:
df_21 = df_21.rename(columns={'Title':'movie_title'})
df_21.tail(100)

Unnamed: 0,movie_title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
128,Fatherhood,Paul Weitz (director/screenplay); Dana Stevens...,Drama Family Comedy,Paul Weitz,Kevin Hart,Alfre Woodard,Lil Rel Howery
129,The Birthday Cake,Jimmy Giannopoulos (director/screenplay); Diom...,Crime Thriller,Jimmy Giannopoulos,Shiloh Fernandez,Val Kilmer,Ewan McGregor
130,Rita Moreno: Just a Girl Who Decided to Go for It,Mariem Pérez Riera (director/screenplay); Rita...,Documentary,Mariem Pérez Riera,Rita Moreno,Eva Longoria,George Chakiris
131,Truman & Tennessee: An Intimate Conversation,Lisa Immordino Vreeland (director/screenplay);...,Documentary,Lisa Immordino Vreeland,Jim Parsons,Zachary Quinto,
132,A Crime on the Bayou,Nancy Buirski (director/screenplay); Gary Dunc...,Documentary,Nancy Buirski,Gary Duncan,Richard Sobol,
...,...,...,...,...,...,...,...
223,The Matrix 4,Lana Wachowski (director/screenplay); Aleksand...,Action Science Fiction,Lana Wachowski,Keanu Reeves,Carrie-Anne Moss,Yahya Abdul-Mateen II
224,Sing 2,Garth Jennings (director/screenplay); Matthew ...,Music Animation Comedy Family,Garth Jennings,Matthew McConaughey,Reese Witherspoon,Scarlett Johansson
225,The King's Man,Matthew Vaughn (director/screenplay); Karl Gaj...,Action Adventure Comedy,Matthew Vaughn,Ralph Fiennes,Gemma Arterton,Rhys Ifans
226,Downton Abbey 2,Simon Curtis (director); Julian Fellowes (scre...,Drama,Simon Curtis,Hugh Bonneville,Jim Carter,Michelle Dockery


In [37]:
df_21['comb'] = df_21['actor_1_name'] + ' ' + df_21['actor_2_name'] + ' '+ df_21['actor_3_name'] + ' '+ df_21['director_name'] +' ' + df_21['genres']
df_21.head()

Unnamed: 0,movie_title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name,comb
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...,Horror Action War,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale,Chloë Grace Moretz Taylor John Smith Beulah Ko...
1,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...,Crime Drama,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas,Adarsh Gourav Rajkummar Rao Priyanka Chopra Jo...
2,Locked Down,Doug Liman (director); Steven Knight (screenpl...,Comedy Crime Drama,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant,Anne Hathaway Chiwetel Ejiofor Stephen Merchan...
3,The Dig,Simon Stone (director); Moira Buffini (screenp...,Drama History,Simon Stone,Carey Mulligan,Ralph Fiennes,Lily James,Carey Mulligan Ralph Fiennes Lily James Simon ...
4,Outside the Wire,"Mikael Håfström (director); Rob Yescombe, Rowa...",Thriller Action Science Fiction,Mikael Håfström,Anthony Mackie,Damson Idris,Emily Beecham,Anthony Mackie Damson Idris Emily Beecham Mika...


In [38]:
new_df = df_20.append(df_21,ignore_index=True)
new_df.head()

Unnamed: 0,movie_title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name,comb
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,Horror Mystery Thriller,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho,Andrea Riseborough Demián Bichir John Cho Nico...
1,Underwater,"William Eubank (director); Brian Duffield, Ada...",Action Horror Science Fiction Thriller,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick,Kristen Stewart Vincent Cassel Jessica Henwick...
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",Comedy,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek,Tiffany Haddish Rose Byrne Salma Hayek Miguel ...
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...,Drama,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins,Richard Gere Peter Dinklage Walton Goggins Jon...
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,Drama Thriller Crime,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs,Josh Hartnett Margarita Levieva Chandler Riggs...


In [39]:
old_df = pd.read_csv('C:/Users/SANTHOSH/Desktop/practise/flask/movie_recommandation/datasets/final_data.csv')
old_df.tail(100)

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
5757,André Øvredal,Zoe Colletti,Michael Garza,Gabriel Rush,Horror Thriller,scary stories to tell in the dark,Zoe Colletti Michael Garza Gabriel Rush André ...
5758,"Tyler Nilson, Michael Schwartz",Shia LaBeouf,Dakota Johnson,John Hawkes,Adventure Drama Comedy,the peanut butter falcon,Shia LaBeouf Dakota Johnson John Hawkes Tyler ...
5759,Bart Freundlich,Michelle Williams,Julianne Moore,Billy Crudup,Drama,after the wedding,Michelle Williams Julianne Moore Billy Crudup ...
5760,Casey Affleck,Casey Affleck,Anna Pniowsky,Tom Bower,Science Fiction Drama,light of my life,Casey Affleck Anna Pniowsky Tom Bower Casey Af...
5761,Thurop Van Orman,Jason Sudeikis,Josh Gad,Leslie Jones,Animation Comedy Adventure Family,the angry birds movie 2,Jason Sudeikis Josh Gad Leslie Jones Thurop Va...
...,...,...,...,...,...,...,...
5852,"Nick Bruno, Troy Quane",Will Smith,Tom Holland,Rashida Jones,Animation Action Adventure Comedy Family,spies in disguise,Will Smith Tom Holland Rashida Jones Nick Brun...
5853,Greta Gerwig,Saoirse Ronan,Emma Watson,Florence Pugh,Drama Romance,little women,Saoirse Ronan Emma Watson Florence Pugh Greta ...
5854,Sam Mendes,George MacKay,Dean-Charles Chapman,Mark Strong,War Drama Action Thriller,1917,George MacKay Dean-Charles Chapman Mark Strong...
5855,Destin Daniel Cretton,Michael B. Jordan,Jamie Foxx,Brie Larson,Drama Crime History,just mercy,Michael B. Jordan Jamie Foxx Brie Larson Desti...


In [40]:
final_df = old_df.append(new_df,ignore_index=True)
final_df.head(200)

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb,Cast and crew
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...,
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...,
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...,
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...,
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...,
...,...,...,...,...,...,...,...,...
195,Tony Gilroy,Jeremy Renner,Scott Glenn,Stacy Keach,Action Adventure Thriller,the bourne legacy,Jeremy Renner Scott Glenn Stacy Keach Tony Gil...,
196,Joel Schumacher,Michael Gough,Vivica A. Fox,John Glover,Action,batman & robin,Michael Gough Vivica A. Fox John Glover Joel S...,
197,Ron Howard,Clint Howard,T.J. Thyne,Molly Shannon,Comedy Family Fantasy,how the grinch stole christmas,Clint Howard T.J. Thyne Molly Shannon Ron Howa...,
198,Roland Emmerich,Jake Gyllenhaal,Dennis Quaid,Sela Ward,Action Adventure Sci-Fi Thriller,the day after tomorrow,Jake Gyllenhaal Dennis Quaid Sela Ward Roland ...,


In [41]:
final_df.isna().sum()

director_name       0
actor_1_name        0
actor_2_name        8
actor_3_name       41
genres              3
movie_title         2
comb               42
Cast and crew    5859
dtype: int64

In [42]:
final_df = final_df.dropna(how='any')

In [43]:
final_df.isna().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
comb             0
Cast and crew    0
dtype: int64

In [44]:
final_df.to_csv('C:/Users/SANTHOSH/Desktop/practise/flask/movie_recommandation/datasets/finalz_data.csv',index=False)