#### *Mounting Drive*

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Data Cleaning**

---



In [6]:
import pandas as pd
import numpy as np

In [None]:
data  = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train_tmdb.csv")

In [None]:
data.head(3)

Unnamed: 0,id,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,...,release_date,runtime,spoken_languages,status,tagline,title,keywords,cast,crew,revenue
0,653346,160000000,"Science Fiction, Adventure, Action",https://www.20thcenturystudios.com/movies/king...,tt11389872,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,6245.898,/gKkl37BQuKTanygYQG1pyYgLVgf.jpg,...,2024-05-08,145,English,Released,No one can stop the reign.,Kingdom of the Planet of the Apes,"empire, kingdom, gorilla, dystopia, eagle, seq...","Owen Teague, Freya Allan, Kevin Durand, Peter ...","Wes Ball, Joe Hartwick Jr., Peter Chernin, Ric...",359039904
1,929590,50000000,"War, Action, Drama",https://a24films.com/films/civil-war,tt17279496,en,Civil War,"In the near future, a group of war journalists...",2730.901,/sh7Rg8Er3tFcN9BpKIPOMvALgZd.jpg,...,2024-04-10,109,English,Released,Welcome to the frontline.,Civil War,"sniper, new york city, race against time, wash...","Kirsten Dunst, Wagner Moura, Cailee Spaeny, St...","Andrew Macdonald, Allon Reich, Gregory Goodman...",114097977
2,823464,150000000,"Science Fiction, Action, Adventure",https://www.godzillaxkongmovie.com,tt14539740,en,Godzilla x Kong: The New Empire,"Following their explosive showdown, Godzilla a...",2726.153,/z1p34vh7dEOnLDmyCrlUVLuoDzd.jpg,...,2024-03-27,115,English,Released,Rise together or fall alone.,Godzilla x Kong: The New Empire,"giant monster, sequel, dinosaur, monkey, kaiju...","Rebecca Hall, Brian Tyree Henry, Dan Stevens, ...","Adam Wingard, Thomas Tull, Jon Jashni, Mary Pa...",567156493


## Rename

**homepage -> Movie_url**

**original_title -> movie_title**

In [None]:
data.rename(columns= {'homepage':'movie_url', 'original_title':'movie_title'}, inplace= True)

## Find the duplicates in data

In [None]:
data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
4002     True
4003     True
4004     True
4005    False
4006     True
Length: 4007, dtype: bool

In [None]:
# checking the data_duplicates

print(f' # duplicates exist in data : {sum(data.duplicated())}')

 # duplicates exist in data : 34


drop duplicates

In [None]:
data.drop_duplicates(inplace= True)

In [None]:
# verify
sum(data.duplicated())

0

In [None]:
data.describe()

Unnamed: 0,id,budget,popularity,runtime,revenue
count,3973.0,3973.0,3973.0,3973.0,3973.0
mean,168098.6,43185730.0,57.799893,110.02366,132965500.0
std,245984.8,51166840.0,147.626042,21.431616,218213600.0
min,5.0,0.0,16.329,0.0,0.0
25%,8909.0,9000000.0,25.94,95.0,11600000.0
50%,19724.0,25000000.0,36.333,107.0,53300000.0
75%,301351.0,60000000.0,54.786,121.0,159814500.0
max,1260040.0,460000000.0,6245.898,248.0,2923706000.0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3973 entries, 0 to 4005
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3973 non-null   int64  
 1   budget                3973 non-null   int64  
 2   genres                3971 non-null   object 
 3   movie_url             2026 non-null   object 
 4   imdb_id               3973 non-null   object 
 5   original_language     3973 non-null   object 
 6   movie_title           3972 non-null   object 
 7   overview              3973 non-null   object 
 8   popularity            3973 non-null   float64
 9   poster_path           3973 non-null   object 
 10  production_companies  3965 non-null   object 
 11  production_countries  3972 non-null   object 
 12  release_date          3973 non-null   object 
 13  runtime               3973 non-null   int64  
 14  spoken_languages      3971 non-null   object 
 15  status                3973

### Use the API calls to replacing the missing values in data

In [None]:
async def main(movie_id,column):
    async with s.aiohttp.ClientSession() as Session:
        # fetching the tmdb movie page

        # fetching the movie details
        movie_details = [s.fetch_movie_details(Session, i) for i in movie_id]

        details_response = await s.asyncio.gather(*movie_details)

        # fetching keywords
        task = [s.keywords(Session,i) for i in movie_id]

        key_response = await s.asyncio.gather(*task)
        # fetching the credits
        credit_task = [s.credits(Session,i) for i in movie_id]

        credit_response = await s.asyncio.gather(*credit_task)

        # list for storing the row
        details_list = []

        for response, key, credit, movie_id in zip(details_response,key_response, credit_response,movie_id):
            if response['budget'] > 0:
                top_cast = credit.get('cast', [])[:5]
                top_crew = credit.get('crew', [])[:5]

                try:
                    new_rows = {
                        'id': movie_id,
                        'imdb_id': response.get('imdb_id'),
                        'title': response.get('title'),
                        'original_language': response.get('original_language'),
                        'movie_title': response.get('original_title'),
                        'origin_country': ', '.join(response.get('origin_country')),
                        'genres': ', '.join([genre.get('name') for genre in response.get('genres')]),
                        'production_companies': ', '.join([company.get('name') for company in response.get('production_companies')]),
                        'production_countries': ', '.join([country.get('name') for country in response.get('production_countries')]),
                        'runtime_in_mins': response.get('runtime'),
                        'spoken_languages': ', '.join([lang.get('name') for lang in response.get('spoken_languages')]),
                        'status': response.get('status'),
                        'tagline': response.get('tagline'),
                        'movie_url': response.get('homepage'),
                        'popularity': response.get('popularity'),
                        'release_date': response.get('release_date'),
                        'overview': response.get('overview'),
                        'vote_count': response.get('vote_count'),
                        'vote_average': response.get('vote_average'),
                        'keywords': ", ".join([keyword.get('name') for keyword in key.get('keywords')]),
                        'cast': ", ".join([cast.get("name") for cast in top_cast]),
                        'crew': ", ".join([crew.get("name") for crew in top_crew]),
                        'revenue': response.get('revenue')
                    }
                    # appending the rows to details_list
                    details_list.append(new_rows)
                except Exception as e:
                    s.logging.error(f'error in proccesing movie_id {movie_id};  {s.traceback.format_exc()}"')
        # creating the DataFrame using the details_list

        return ''.join([movie[column] for movie in details_list])


Categorical and numerical columns


In [None]:
col = data.columns.to_list()
col

['id',
 'budget',
 'genres',
 'movie_url',
 'imdb_id',
 'original_language',
 'movie_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'keywords',
 'cast',
 'crew',
 'revenue']

In [None]:
cols = data[data[col].isnull().any(axis=1)]
cols

Unnamed: 0,id,budget,genres,movie_url,imdb_id,original_language,movie_title,overview,popularity,poster_path,...,release_date,runtime,spoken_languages,status,tagline,title,keywords,cast,crew,revenue
8,1011985,80000000,"Animation, Action, Family, Comedy, Fantasy",https://www.dreamworks.com/movies/kung-fu-panda-4,tt21692408,en,Kung Fu Panda 4,Po is gearing up to become the spiritual leade...,1061.125,/kDp1vUBnMpe8ak4rjgl3cLELqjU.jpg,...,2024-03-02,94,English,Released,,Kung Fu Panda 4,"martial arts, kung fu, china, sequel, panda, a...","Jack Black, Awkwafina, Viola Davis, Dustin Hof...","Natalia Cronembold, Stephanie Ma Stine, Rebecc...",539271347
10,882059,18000000,,https://www.boykillsworldmovie.com,tt13923084,en,Boy Kills World,"When his family is murdered, a deaf-mute named...",934.001,/25JskXmchcYwj3jHRmcPm738MpB.jpg,...,2024-04-24,110,English,Released,,Boy Kills World,"resistance, dystopia, deaf-mute, gore, halluci...","Bill Skarsgård, Jessica Rothe, Michelle Docker...","Moritz Mohr, Sam Raimi, Roy Lee, Zainab Azizi,...",3128540
19,385687,340000000,,https://fastxmovie.com,tt5433140,en,Fast X,Over many missions and against impossible odds...,420.774,/fiVW06jE7z9YnO4trhaMEdclSiC.jpg,...,2023-05-17,142,English,Released,The end of the road begins.,Fast X,"sequel, revenge, betrayal, racing, family, cli...","Vin Diesel, Michelle Rodriguez, Tyrese Gibson,...","Gary Scott Thompson, Stephen F. Windon, Neal H...",704709660
29,934632,83000000,"Science Fiction, Action, Drama",https://www.netflix.com/title/81624666,tt23137904,en,Rebel Moon - Part Two: The Scargiver,The rebels gear up for battle against the ruth...,413.025,/cxevDYdeFkiixRShbObdwAHBZry.jpg,...,2024-04-19,123,English,Released,,Rebel Moon - Part Two: The Scargiver,"space war, space, female protagonist, space op...","Sofia Boutella, Michiel Huisman, Ed Skrein, Dj...","Zack Snyder, Dody Dorn, Zack Snyder, Zack Snyd...",0
51,475557,55000000,"Crime, Thriller, Drama",http://www.jokermovie.net/,tt7286456,en,,"During the 1980s, a failed stand-up comedian i...",228.080,/udDclJoHjfjb8Ekgsd4FDteOkCU.jpg,...,2019-10-01,122,English,Released,Put on a happy face.,Joker,"dream, street gang, society, psychopath, clown...","Joaquin Phoenix, Robert De Niro, Zazie Beetz, ...","Todd Phillips, Bradley Cooper, Emma Tillinger ...",1078958629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3993,693827,5000000,"Crime, Drama, Thriller",,tt5078852,en,Lansky,When the aging Meyer Lansky is investigated on...,33.843,/weNMzi7WXuzoml2Wvm1VMEE7Ipq.jpg,...,2021-06-24,119,"עִבְרִית, English",Released,The man who turned crime into a billion dollar...,Lansky,"gangster, crime boss, mafia boss, organized cr...","Harvey Keitel, Sam Worthington, John Magaro, M...","Peter Flinckenberg, Laura Belle, Eytan Rockawa...",136579
3994,13920,35000000,Drama,,tt0316465,en,Radio,"High school football coach, Harold Jones befri...",23.826,/uQ6ci4iFHhB6TWB2f4wftR7AEly.jpg,...,2003-10-24,109,"English, Pусский",Released,His courage made them champions.,Radio,"friendship, sports, biography","Cuba Gooding Jr., Ed Harris, Alfre Woodard, S....","Mike Rich, Michael Tollin, Herb Gains, Clay A....",53293628
3995,10060,40000000,"Drama, Crime, Music",,tt0430308,en,Get Rich or Die Tryin',A tale of an inner city drug dealer who turns ...,23.576,/aaEJu8vFKtrAoSRtw3xjCf1aM5d.jpg,...,2005-11-09,117,"English, Español",Released,If you think you know the story... You don't k...,Get Rich or Die Tryin',"career, street gang, loss of loved one, rapper...","50 Cent, Joy Bryant, Adewale Akinnuoye-Agbaje,...","Chris Lighty, Arthur Lappin, Jim Sheridan, Con...",46442528
3996,71700,1600000,"Drama, Action, Crime, Science Fiction",,tt1580426,en,Repeaters,A gritty mind-bending thriller about three twe...,33.078,/9nQuL87o5AcUayeZM3cOXw1VbIv.jpg,...,2011-04-22,89,English,Released,,Repeaters,"time loop, recovering addict, group of friends","Dustin Milligan, Amanda Crew, Richard de Klerk...","Carl Bessai, Andrew Herwitz, Irene Nelson, Arn...",0


In [None]:
na_ids = cols['id'].to_list()

In [None]:
def update_missing_values(data, missing_ids, cols):
  for id in missing_ids:
    for col_ in cols:
      idx = data.index[data['id'] == id].to_list()[0]
      if pd.isnull(data.at[idx, col_]):
        data.at[idx, col_] = s.asyncio.run(main([id], col_))
  return data


In [None]:
new_data = update_missing_values(data, na_ids, cols)

### saving the updated data file (where features.values are missing in data)

In [None]:
new_data.to_csv('/content/drive/MyDrive/Colab Notebooks/new_data.csv')

In [None]:
Data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/new_data.csv')

In [None]:
Data.head()

Unnamed: 0.1,Unnamed: 0,id,budget,genres,movie_url,imdb_id,original_language,movie_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,keywords,cast,crew,revenue
0,0,653346,160000000,"Science Fiction, Adventure, Action",https://www.20thcenturystudios.com/movies/king...,tt11389872,en,Kingdom of the Planet of the Apes,Several generations in the future following Ca...,6245.898,...,2024-05-08,145,English,Released,No one can stop the reign.,Kingdom of the Planet of the Apes,"empire, kingdom, gorilla, dystopia, eagle, seq...","Owen Teague, Freya Allan, Kevin Durand, Peter ...","Wes Ball, Joe Hartwick Jr., Peter Chernin, Ric...",359039904
1,1,929590,50000000,"War, Action, Drama",https://a24films.com/films/civil-war,tt17279496,en,Civil War,"In the near future, a group of war journalists...",2730.901,...,2024-04-10,109,English,Released,Welcome to the frontline.,Civil War,"sniper, new york city, race against time, wash...","Kirsten Dunst, Wagner Moura, Cailee Spaeny, St...","Andrew Macdonald, Allon Reich, Gregory Goodman...",114097977
2,2,823464,150000000,"Science Fiction, Action, Adventure",https://www.godzillaxkongmovie.com,tt14539740,en,Godzilla x Kong: The New Empire,"Following their explosive showdown, Godzilla a...",2726.153,...,2024-03-27,115,English,Released,Rise together or fall alone.,Godzilla x Kong: The New Empire,"giant monster, sequel, dinosaur, monkey, kaiju...","Rebecca Hall, Brian Tyree Henry, Dan Stevens, ...","Adam Wingard, Thomas Tull, Jon Jashni, Mary Pa...",567156493
3,3,719221,8000000,"Horror, Thriller",https://www.tarotmovie.com,tt14088510,en,Tarot,When a group of friends recklessly violate the...,2349.378,...,2024-05-01,92,English,Released,Your fate is in the cards.,Tarot,"tarot cards, fate, slasher, group of friends, ...","Harriet Slater, Adain Bradley, Avantika, Jacob...","Anna Halberg, Spenser Cohen, Spenser Cohen, An...",41989392
4,4,573435,100000000,"Action, Crime, Thriller",https://www.badboys.movie,tt4919268,en,Bad Boys: Ride or Die,"After their late former Captain is framed, Low...",2486.516,...,2024-06-05,115,English,Released,Miami's finest are now its most wanted.,Bad Boys: Ride or Die,"miami, florida, on the run, police detective, ...","Will Smith, Martin Lawrence, Vanessa Hudgens, ...","George Gallo, Jerry Bruckheimer, Adil El Arbi,...",104600000


In [None]:
Data.drop(columns='Unnamed: 0', axis=1, inplace = True)

In [None]:
sum(Data.duplicated())

1

In [None]:
Data.drop_duplicates(inplace= True)

In [None]:
categorical_data = Data.select_dtypes(include= ['object'])

categorical_cols = [col for col in categorical_data.columns]

numerical_data = Data.select_dtypes(include= ['float64', 'int64'])

numerical_cols =[ num for num in numerical_data.columns]

In [None]:
categorical_cols

['genres',
 'movie_url',
 'imdb_id',
 'original_language',
 'movie_title',
 'overview',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'keywords',
 'cast',
 'crew']

## replacing the revenue of 0 to Na and empty string to Na

In [None]:
Data.replace('', np.nan, inplace= True)
Data['revenue'].replace(0, np.nan,inplace= True)

In [None]:
# pd.set_option('display.max_rows', 500)

## Mean Imputation & Mode Imputation


In [None]:
for cat_column, num_column in zip(categorical_cols, numerical_cols):
  Data[cat_column].fillna(Data[cat_column].mode()[0], inplace= True)
  Data[num_column].fillna(Data[num_column].mean(), inplace= True)

In [None]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3972 entries, 0 to 3971
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3972 non-null   int64  
 1   budget                3972 non-null   int64  
 2   genres                3972 non-null   object 
 3   movie_url             3972 non-null   object 
 4   imdb_id               3972 non-null   object 
 5   original_language     3972 non-null   object 
 6   movie_title           3972 non-null   object 
 7   overview              3972 non-null   object 
 8   popularity            3972 non-null   float64
 9   poster_path           3972 non-null   object 
 10  production_companies  3964 non-null   object 
 11  production_countries  3971 non-null   object 
 12  release_date          3972 non-null   object 
 13  runtime               3972 non-null   int64  
 14  spoken_languages      3970 non-null   object 
 15  status                3972

In [None]:
Data.isnull().sum()

id                        0
budget                    0
genres                    0
movie_url                 0
imdb_id                   0
original_language         0
movie_title               0
overview                  0
popularity                0
poster_path               0
production_companies      8
production_countries      1
release_date              0
runtime                   0
spoken_languages          2
status                    0
tagline                 149
title                     0
keywords                 31
cast                      1
crew                      0
revenue                   0
dtype: int64

In [None]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3972 entries, 0 to 3971
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3972 non-null   int64  
 1   budget                3972 non-null   int64  
 2   genres                3972 non-null   object 
 3   movie_url             3972 non-null   object 
 4   imdb_id               3972 non-null   object 
 5   original_language     3972 non-null   object 
 6   movie_title           3972 non-null   object 
 7   overview              3972 non-null   object 
 8   popularity            3972 non-null   float64
 9   poster_path           3972 non-null   object 
 10  production_companies  3964 non-null   object 
 11  production_countries  3971 non-null   object 
 12  release_date          3972 non-null   object 
 13  runtime               3972 non-null   int64  
 14  spoken_languages      3970 non-null   object 
 15  status                3972

In [None]:
Data['tagline'].replace(np.nan, 'No Tagline', inplace= True)
Data['keywords'].replace(np.nan, 'No Keywords', inplace= True)


In [None]:
Data.isnull().sum()

id                      0
budget                  0
genres                  0
movie_url               0
imdb_id                 0
original_language       0
movie_title             0
overview                0
popularity              0
poster_path             0
production_companies    8
production_countries    1
release_date            0
runtime                 0
spoken_languages        2
status                  0
tagline                 0
title                   0
keywords                0
cast                    1
crew                    0
revenue                 0
dtype: int64

In [None]:
# delete rows  where values are null following features.

Data = Data[~Data['production_companies'].isnull()]
Data = Data[~Data['production_countries'].isnull()]
Data = Data[~Data['spoken_languages'].isnull()]
Data = Data[~Data['cast'].isnull()]


In [None]:
Data.isnull().sum()

id                      0
budget                  0
genres                  0
movie_url               0
imdb_id                 0
original_language       0
movie_title             0
overview                0
popularity              0
poster_path             0
production_companies    0
production_countries    0
release_date            0
runtime                 0
spoken_languages        0
status                  0
tagline                 0
title                   0
keywords                0
cast                    0
crew                    0
revenue                 0
dtype: int64

In [None]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3961 entries, 0 to 3971
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3961 non-null   int64  
 1   budget                3961 non-null   int64  
 2   genres                3961 non-null   object 
 3   movie_url             3961 non-null   object 
 4   imdb_id               3961 non-null   object 
 5   original_language     3961 non-null   object 
 6   movie_title           3961 non-null   object 
 7   overview              3961 non-null   object 
 8   popularity            3961 non-null   float64
 9   poster_path           3961 non-null   object 
 10  production_companies  3961 non-null   object 
 11  production_countries  3961 non-null   object 
 12  release_date          3961 non-null   object 
 13  runtime               3961 non-null   int64  
 14  spoken_languages      3961 non-null   object 
 15  status                3961

### Save the data to csv file <Cleaned.csv>

In [None]:
Data.to_csv(r'/content/drive/MyDrive/Colab Notebooks/Cleaned.csv')

# **Exploratory Data Analysis --EDA**

---







In [61]:
Data = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/Cleaned.csv')

In [65]:
Data.drop('Unnamed: 0', axis =1, inplace= True)

In [8]:
! pip install plotly



In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

### **Distribution of Box Office Revenue**

In [10]:
Data['log_revenue'] = np.log1p((Data['revenue']/ 1e9))

fig = px.histogram(Data, x= 'log_revenue', nbins= 30,
                   title = "Distribution of Box Office Revenue",
                   labels = {'log_revenue':'revenue'})
fig.update_layout(bargap= 0.1)
fig.show()


### **Scatter Plot: Revenue vs. Budget**

In [11]:
fig = px.scatter(Data, x= 'budget', y= 'revenue',
                 title = 'Revenue vs. Budget',
                 labels = {
                      'budget': 'budget',
                      'revenue' : 'revenue'
                 })
fig.show()

### **Bar plot: Number of Movies vs. Genres**

In [21]:
Data['genres'] = Data['genres'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])

# Flatten the list of genres and create a DataFrame with the counts
genres_list = Data['genres'].explode()
genre_counts = genres_list.value_counts()


In [53]:
Data_ = pd.DataFrame(genre_counts)
df =Data_.reset_index()
df.head()

Unnamed: 0,genres,count
0,Drama,1532
1,Comedy,1254
2,Action,1248
3,Thriller,1203
4,Adventure,924


In [55]:
df.shape[0]

19

In [57]:

# Plot the bar plot of genre counts with Plotly
fig = px.bar(df, x= 'count', y= 'genres', orientation='h', color= 'genres',
             title='Number of Movies by Genre',
             color_discrete_sequence=px.colors.qualitative.Plotly)

fig.update_layout(xaxis_title='Number of Movies', yaxis_title='Genre', yaxis={'categoryorder':'total ascending'})

fig.show()

### **Correlation Matrix**

In [66]:
categorical_data = Data.select_dtypes(include= ['object'])

categorical_cols = [col for col in categorical_data.columns]

numerical_data = Data.select_dtypes(include= ['float64', 'int64'])

numerical_cols =[ num for num in numerical_data.columns]

In [67]:
corr_matrix = numerical_data.corr()

fig = px.imshow(
    corr_matrix,
    labels = dict(color = 'Correlation'),
    x= corr_matrix.columns,
    y= corr_matrix.columns,
    color_continuous_scale= 'Viridis',
    title = 'Correlation Heatmap'
)

fig.show()