<a href="https://colab.research.google.com/github/salehgondal/movie_success_prediction/blob/main/Project_Saira.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Fetching

In [None]:
# Attach Google Drive to the Colab runtime at /content/drive.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
import os
import polars as pl
import plotly.express as px
# Display settings applied to every printed polars table in this notebook.
pl.Config.set_tbl_width_chars(300)  # table width limit, in characters
pl.Config.set_fmt_float("full")  # show floats in full rather than scientific notation


In [None]:
# The TMDB CSV is expected in this Drive folder; the folder is created when missing.
project_dir = '/content/drive/MyDrive/project_dva'
os.makedirs(project_dir, exist_ok=True)
# Switch into the project folder so the relative CSV path below resolves.
os.chdir(project_dir)
# Load the TMDB movie dataset into a polars DataFrame.
df = pl.read_csv("TMDB_movie_dataset_v11.csv")

In [None]:
# Keep a copy of the data as loaded; `df` itself is filtered in a later cell.
df_orig = df.clone()

In [None]:
# Print the first five rows; tbl_cols=-1 applies only inside this `with` block.
with pl.Config(tbl_cols=-1):
    print(df.head(n=5))


shape: (5, 24)
┌────────┬─────────────┬─────────────┬────────────┬──────────┬─────────────┬────────────┬─────────┬───────┬────────────┬───────────┬────────────┬───────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ id     ┆ title       ┆ vote_averag ┆ vote_count ┆ status   ┆ release_dat ┆ revenue    ┆ runtime ┆ adult ┆ backdrop_p ┆ budget    ┆ homepage   ┆ imdb_id   ┆ original_l ┆ original_t ┆ overview   ┆ popularity ┆ poster_pat ┆ tagline    ┆ genres     ┆ production ┆ production ┆ spoken_lan ┆ keywords   │
│ ---    ┆ ---         ┆ e           ┆ ---        ┆ ---      ┆ e           ┆ ---        ┆ ---     ┆ ---   ┆ ath        ┆ ---       ┆ ---        ┆ ---       ┆ anguage    ┆ itle       ┆ ---        ┆ ---        ┆ h          ┆ ---        ┆ ---        ┆ _companies ┆ _countries ┆ guages     ┆ ---        │
│ i64    ┆ str         ┆ ---         ┆ i64        ┆ str      ┆ ---         ┆ i64  

# Data Sanity and Cleaning

In [None]:
# Count null values per column.
df.null_count()
# No nulls found in any column. Zeros and empty strings are not nulls and are
# checked separately in the cells that follow.

id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Collect every column that is not a text column (Utf8/Categorical) for EDA.
numeric_columns = []
for name in df.columns:
    if df[name].dtype in (pl.Utf8, pl.Categorical):
        continue  # text column: skip
    numeric_columns.append(name)
numeric_columns.remove('id')  # id not needed


In [None]:

# Percentage of rows equal to 0 in each selected column, rounded to 2 decimals.
# pl.len() replaces the deprecated pl.count() as the row-count expression.
zero_counts = df.select(numeric_columns).select([
    (pl.col(col).eq(0).sum() / pl.len() * 100).round(2).alias(f"{col}_zero_percentage")
    for col in numeric_columns
])

print(zero_counts)



shape: (1, 7)
┌──────────────────────────────┬────────────────────────────┬─────────────────────────┬─────────────────────────┬───────────────────────┬────────────────────────┬────────────────────────────┐
│ vote_average_zero_percentage ┆ vote_count_zero_percentage ┆ revenue_zero_percentage ┆ runtime_zero_percentage ┆ adult_zero_percentage ┆ budget_zero_percentage ┆ popularity_zero_percentage │
│ ---                          ┆ ---                        ┆ ---                     ┆ ---                     ┆ ---                   ┆ ---                    ┆ ---                        │
│ f64                          ┆ f64                        ┆ f64                     ┆ f64                     ┆ f64                   ┆ f64                    ┆ f64                        │
╞══════════════════════════════╪════════════════════════════╪═════════════════════════╪═════════════════════════╪═══════════════════════╪════════════════════════╪════════════════════════════╡
│ 70.53                   


`pl.count()` is deprecated. Please use `pl.len()` instead.



**Too many zeros: removing rows where `vote_average` is 0.**

In [None]:
# Keep only rows with a positive vote_average, then show the remaining shape.
df = df.filter(pl.col("vote_average").gt(0))
df.shape

(352005, 24)

In [None]:
# Summary statistics for the selected columns, rounded for readability.
desc = pl.DataFrame(df.select(numeric_columns).describe())
summary_df_rounded = desc.with_columns(
    pl.col(pl.Float64).round(2)  # every Float64 column -> 2 decimal places
)

print(summary_df_rounded)



shape: (9, 8)
┌────────────┬──────────────┬────────────┬─────────────┬─────────┬────────┬───────────┬────────────┐
│ statistic  ┆ vote_average ┆ vote_count ┆ revenue     ┆ runtime ┆ adult  ┆ budget    ┆ popularity │
│ ---        ┆ ---          ┆ ---        ┆ ---         ┆ ---     ┆ ---    ┆ ---       ┆ ---        │
│ str        ┆ f64          ┆ f64        ┆ f64         ┆ f64     ┆ f64    ┆ f64       ┆ f64        │
╞════════════╪══════════════╪════════════╪═════════════╪═════════╪════════╪═══════════╪════════════╡
│ count      ┆ 352005       ┆ 352005     ┆ 352005      ┆ 352005  ┆ 352005 ┆ 352005    ┆ 352005     │
│ null_count ┆ 0            ┆ 0          ┆ 0           ┆ 0       ┆ 0      ┆ 0         ┆ 0          │
│ mean       ┆ 6.11         ┆ 60.94      ┆ 2082805.63  ┆ 70      ┆ 0.06   ┆ 782979.73 ┆ 2.59       │
│ std        ┆ 1.97         ┆ 570.11     ┆ 29807651.61 ┆ 65.09   ┆ null   ┆ 7784833.2 ┆ 13.38      │
│ min        ┆ 0.5          ┆ 0          ┆ -12         ┆ 0       ┆ 0      ┆ 0

In [None]:
# Percentage of empty-string values in each text (Utf8/Categorical) column.
char_columns = [name for name in df.columns if df[name].dtype in (pl.Utf8, pl.Categorical)]

# One expression per text column: share of rows equal to "", in percent.
empty_pct_exprs = [
    (pl.col(name).eq("").sum() / pl.len() * 100).round(2).alias(name)
    for name in char_columns
]
empty_counts = df.select(char_columns).select(empty_pct_exprs)

with pl.Config(tbl_cols=-1):
    print(empty_counts)

shape: (1, 16)
┌───────┬────────┬──────────────┬───────────────┬──────────┬─────────┬───────────────────┬────────────────┬──────────┬─────────────┬─────────┬────────┬──────────────────────┬──────────────────────┬──────────────────┬──────────┐
│ title ┆ status ┆ release_date ┆ backdrop_path ┆ homepage ┆ imdb_id ┆ original_language ┆ original_title ┆ overview ┆ poster_path ┆ tagline ┆ genres ┆ production_companies ┆ production_countries ┆ spoken_languages ┆ keywords │
│ ---   ┆ ---    ┆ ---          ┆ ---           ┆ ---      ┆ ---     ┆ ---               ┆ ---            ┆ ---      ┆ ---         ┆ ---     ┆ ---    ┆ ---                  ┆ ---                  ┆ ---              ┆ ---      │
│ f64   ┆ f64    ┆ f64          ┆ f64           ┆ f64      ┆ f64     ┆ f64               ┆ f64            ┆ f64      ┆ f64         ┆ f64     ┆ f64    ┆ f64                  ┆ f64                  ┆ f64              ┆ f64      │
╞═══════╪════════╪══════════════╪═══════════════╪══════════╪═════════╪═══

# EDA

In [None]:
# EDA plots: row count and share of zero vote_average per production_countries value.
#
# The aggregation reads df_orig (the copy taken right after loading) rather than
# df: df has already been filtered to vote_average > 0, so a zero-percentage
# computed on it would be 0 for every group. Counts therefore cover all loaded rows.
agg_df = df_orig.group_by("production_countries").agg([
    pl.col("id").count().alias("count"),
    # pl.len() replaces the deprecated pl.count() as the per-group row count.
    (pl.col("vote_average").eq(0).sum() / pl.len() * 100).alias("vote_average_zeros_percentage"),
])
# Keep the 10 largest groups by row count.
agg_df = agg_df.sort("count", descending=True).head(10)
pandas_df = agg_df.to_pandas()

# Bar chart of row counts for the top groups.
fig = px.bar(pandas_df, x="production_countries", y="count", title="Count by production_countries")
fig.show()

# Bar chart of the share of rows whose vote_average is 0, for the same groups.
fig2 = px.bar(pandas_df, x="production_countries", y="vote_average_zeros_percentage", title="vote_average zeros percentage by production_countries")
fig2.show()




`pl.count()` is deprecated. Please use `pl.len()` instead.



**Why are there so many empty values in `production_countries`?**

In [None]:
# Correlation of vote_average with the revenue-to-budget ratio (not raw revenue).
# Rows are restricted to positive revenue and budget and more than 100 votes;
# budget > 0 also keeps the division below away from zero denominators.
df_filtered = df.filter(
    (pl.col("vote_average") > 0)
    & (pl.col("revenue") > 0)
    & (pl.col("budget") > 0)
    & (pl.col("vote_count") > 100)
)
correlation = df_filtered.select([
    pl.col("vote_average"),
    # Explicit alias: without it the ratio column is labelled "revenue",
    # which misreads as a correlation against raw revenue.
    (pl.col("revenue") / pl.col("budget")).alias("revenue_to_budget_ratio"),
]).corr()
correlation


vote_average,revenue
f64,f64
1.0,0.0071096264014686
0.0071096264014686,1.0
