In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import gradio as gr


ModuleNotFoundError: No module named 'gradio'

In [None]:
pip install gradio

In [None]:
# Read the movie metadata CSV (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="movie_metadata.csv")

df.head(n=5)


In [None]:
# Print column dtypes and non-null counts, then show summary statistics of
# the numeric columns as the cell's output.
df.info()
df.describe()


In [None]:
# Keep only the target (imdb_score), the four numeric predictors and the raw
# genres string. The explicit .copy() makes the result an independent frame,
# so the column assignments in later cells modify it directly instead of
# triggering pandas' SettingWithCopyWarning.
df = df[[
    'imdb_score',
    'budget',
    'gross',
    'num_voted_users',
    'movie_facebook_likes',
    'genres'
]].copy()


In [None]:
# Fill missing values in one non-mutating pass:
#   - budget, gross, num_voted_users -> median of the column
#   - movie_facebook_likes           -> 0
#   - genres                         -> 'Unknown'
# fillna() returns a new frame, so nothing is assigned column-by-column into
# a frame that was obtained by slicing (the pattern that raises
# SettingWithCopyWarning). The medians are computed before any filling, as
# in the per-column version.
df = df.fillna({
    'budget': df['budget'].median(),
    'gross': df['gross'].median(),
    'num_voted_users': df['num_voted_users'].median(),
    'movie_facebook_likes': 0,
    'genres': 'Unknown',
})


In [None]:
# Primary genre = the text before the first '|' in `genres`. The vectorised
# .str accessor replaces a Python-level apply(lambda ...); a single-character
# pattern is treated as a literal separator, not a regex.
df = df.assign(first_genre=df['genres'].str.split('|').str[0])


In [None]:
# Histogram of the target variable (IMDb score) with 20 bins.
fig, ax = plt.subplots()
sns.histplot(data=df, x='imdb_score', bins=20, ax=ax)
ax.set_title("IMDb Score Distribution")
plt.show()


In [None]:
# Scatter plot of budget (x) against IMDb score (y).
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='budget', y='imdb_score', ax=ax)
ax.set_title("Budget vs IMDb Score")
plt.show()


In [None]:
# Mean IMDb score per primary genre, highest first.
genre_avg = (
    df.groupby('first_genre')['imdb_score']
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of the ten genres with the highest mean score.
ax = genre_avg.head(10).plot.bar(figsize=(10, 5))
ax.set_title("Average IMDb Score by Genre")
plt.show()


In [None]:
# Build the modelling table: discard the raw genres string and one-hot encode
# first_genre. drop_first=True omits the first category, which therefore
# acts as the all-zeros baseline.
df_model = pd.get_dummies(
    df.drop(columns=['genres']),
    columns=['first_genre'],
    drop_first=True,
)

# Target is the IMDb score; every remaining column is a feature.
y = df_model['imdb_score']
X = df_model.drop(columns=['imdb_score'])


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [None]:
# Baseline model: linear regression fitted on the training split and scored
# on the held-out split.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print("Linear Regression R² Score:", r2_score(y_test, y_pred_lr))
print("Linear Regression MSE:", mean_squared_error(y_test, y_pred_lr))


In [None]:
# Random forest with 100 trees; random_state fixes the seed so repeated runs
# give the same model. Fitted on the training split, scored on the held-out
# split.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest R² Score:", r2_score(y_test, y_pred_rf))
print("Random Forest MSE:", mean_squared_error(y_test, y_pred_rf))


In [None]:
# Genres offered in the UI. They are read from the data rather than from the
# dummy column names in X: get_dummies(..., drop_first=True) leaves the first
# genre without a column, so deriving the list from X would hide that genre
# from the dropdown. A genre without a dummy column is still predictable: it
# is encoded as all dummy columns set to 0.
genre_columns = sorted(df['first_genre'].dropna().unique())


In [None]:
def predict_rating(budget, gross, votes, likes, genre):
    """Predict an IMDb rating for a single movie with the fitted forest `rf`.

    Parameters
    ----------
    budget, gross, votes, likes : number
        Values for the 'budget', 'gross', 'num_voted_users' and
        'movie_facebook_likes' features, in that order.
    genre : str
        Genre name. It is matched against the ``first_genre_<genre>`` dummy
        columns of ``X``; a genre with no matching column leaves every dummy
        at 0.

    Returns
    -------
    float
        The model's prediction rounded to two decimals.

    Raises
    ------
    ValueError
        If any of the numeric inputs is None (e.g. an empty input field).
    """
    numeric_inputs = {
        'budget': budget,
        'gross': gross,
        'num_voted_users': votes,
        'movie_facebook_likes': likes,
    }

    # Fail early with a readable message instead of handing None to the model.
    missing = [name for name, value in numeric_inputs.items() if value is None]
    if missing:
        raise ValueError("Missing value for: " + ", ".join(missing))

    # One-hot encode the genre against the dummy columns used in training.
    genre_flag = f"first_genre_{genre}"
    input_data = dict(numeric_inputs)
    for col in X.columns:
        if col.startswith("first_genre_"):
            input_data[col] = int(col == genre_flag)

    # Lay the row out exactly like the training matrix so feature names and
    # their order always match what the model was fitted on.
    input_df = pd.DataFrame([input_data]).reindex(columns=X.columns, fill_value=0)

    prediction = rf.predict(input_df)[0]
    # float() turns the numpy scalar into a plain Python float.
    return round(float(prediction), 2)


In [None]:
# Gradio UI: four numeric inputs and a genre dropdown are passed to
# predict_rating; the returned rating is shown in a text output.
app = gr.Interface(
    fn=predict_rating,
    inputs=[
        gr.Number(label="Budget"),
        gr.Number(label="Gross"),
        gr.Number(label="Number of Votes"),
        gr.Number(label="Facebook Likes"),
        gr.Dropdown(genre_columns, label="Genre")
    ],
    outputs=gr.Text(label="Predicted IMDb Rating"),
    title="🎬 Movie Rating Prediction App",
    description="Beginner Machine Learning Project using Random Forest"
)

app.launch()
