### Prerequisites

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

#### Data and Feature Extraction

In [26]:
# Read the movies CSV and attach a log-scaled vote count in one chained step.
# np.log1p computes log(1 + x), so a vote count of 0 maps to 0 instead of -inf.
df = pd.read_csv('../data/movies_cleaned.csv').assign(
    Log_Votes=lambda frame: np.log1p(frame['Votes'])
)

# Predictor columns; rows missing any predictor or the target are discarded.
features = ['Duration', 'Log_Votes', 'Year']
model_data = df[[*features, 'Rating']].dropna()

# Separate the predictor matrix from the target vector.
X, y = model_data[features], model_data['Rating']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

#### Training and Evaluation

##### Linear Regression Model

In [27]:
# Fit a linear regression on the training split. fit() returns the fitted
# estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Predict ratings for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R-Squared Score: {r2:.2f}")

MSE: 1.72
R-Squared Score: 0.11


##### Random Forest Model

In [28]:
# Random forest with 100 trees. random_state pins the seed so results are
# repeatable; n_jobs=-1 lets scikit-learn use every available CPU core.
# NOTE: this rebinds `model`, `y_pred`, `mse` and `r2`.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=13,
    n_jobs=-1,
).fit(X_train, y_train)

# Predict ratings for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R-Squared Score: {r2:.2f}")

MSE: 1.50
R-Squared Score: 0.23


##### One-hot Encode Genre


In [29]:
# TODO (if there's time): one-hot encode the genre feature