In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# 1. Load and prepare data
print("Loading data...")
# NOTE(review): absolute, machine-specific path -- this cell only runs where the
# file exists at exactly this location. Consider a path relative to the project.
df = pd.read_csv('/Users/razedori/BYU/Stat 486/project movies/imdb.csv')
# Rows without a target value cannot be used for supervised learning.
df = df.dropna(subset=['Your Rating'])

# Select features
features = [
    'Runtime (mins)',
    'IMDb Rating',
    'Year',
    'Num Votes'
]

X = df[features].copy()
y = df['Your Rating'].copy()

# 2. Basic preprocessing for PCA and clustering
# Median-impute missing values, then standardise every feature.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_processed = pd.DataFrame(
    scaler.fit_transform(imputer.fit_transform(X)),
    columns=X.columns,
    # Keep the row labels of X/y: dropna() above may leave gaps in the index,
    # and a fresh RangeIndex would silently misalign any later join with X or y.
    index=X.index,
)

# 3. Apply PCA
print("\nApplying PCA...")
# A float n_components keeps as many components as needed to reach that share
# of explained variance, so the number of components is data dependent.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_processed)
print(f"Number of components selected by PCA: {pca.n_components_}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")

# 4. Perform clustering
print("\nPerforming cluster analysis...")
# n_init is pinned explicitly: its default changed between scikit-learn
# releases, which would otherwise make the clustering version dependent.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# Visualize clusters
# The scatter plot needs two principal components; PCA may have kept only one.
if X_pca.shape[1] >= 2:
    plt.figure(figsize=(10, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis')
    plt.title('Movie Clusters based on PCA')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.colorbar()
    plt.show()
else:
    print("PCA kept fewer than two components; skipping the 2-D cluster plot.")

# 5. Nested CV function
def perform_nested_cv(X, y, model, param_grid, name, outer_cv=3, inner_cv=2):
    """
    Score ``model`` with nested cross-validation.

    The model is wrapped in a pipeline (median imputation -> standard scaling
    -> model). For each outer fold, a grid search using the inner folds picks
    the hyper-parameters on the training part, and the refitted search is then
    scored on the held-out part. Per-fold and summary results are printed.

    Parameters:
        X: Feature table; indexed positionally through ``.iloc``.
        y: Target values; indexed positionally through ``.iloc``.
        model: Estimator placed in the final ``'model'`` pipeline step.
        param_grid (dict): Hyper-parameter grid keyed by the model's own
            parameter names (the ``model__`` prefix is added here).
        name (str): Label used in the printed output.
        outer_cv (int): Number of outer (evaluation) folds.
        inner_cv (int): Number of inner (tuning) folds.

    Returns:
        tuple: (mean R² over the outer folds, mean MSE over the outer folds).
    """
    estimator = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', model),
    ])

    # Grid-search addresses parameters of a pipeline step as '<step>__<param>'.
    search_space = {}
    for param, candidates in param_grid.items():
        search_space[f'model__{param}'] = candidates

    def _new_search():
        # A fresh inner splitter and grid search for every outer fold.
        return GridSearchCV(
            estimator=estimator,
            param_grid=search_space,
            cv=KFold(n_splits=inner_cv, shuffle=True, random_state=42),
            scoring='r2',
            n_jobs=-1,
        )

    outer_splitter = KFold(n_splits=outer_cv, shuffle=True, random_state=42)
    fold_r2, fold_mse = [], []

    fold = 0
    for train_rows, test_rows in outer_splitter.split(X):
        fold += 1

        tuned = _new_search()
        tuned.fit(X.iloc[train_rows], y.iloc[train_rows])

        # Evaluate the tuned pipeline on the rows held out from tuning.
        held_out_y = y.iloc[test_rows]
        held_out_pred = tuned.predict(X.iloc[test_rows])

        fold_r2.append(r2_score(held_out_y, held_out_pred))
        fold_mse.append(mean_squared_error(held_out_y, held_out_pred))

        print(f"{name} - Fold {fold} Score: R² = {fold_r2[-1]:.3f}, MSE = {fold_mse[-1]:.3f}")

    mean_r2 = np.mean(fold_r2)
    mean_mse = np.mean(fold_mse)

    print(f"\n{name} Final Results:")
    print(f"Average R²: {mean_r2:.3f} (±{np.std(fold_r2):.3f})")
    print(f"Average MSE: {mean_mse:.3f}")

    return mean_r2, mean_mse

# 6. Define models and parameters
# One row per candidate: (display name, estimator, hyper-parameter grid).
# Grid keys are the estimator's own parameter names.
_model_specs = [
    (
        'Random Forest',
        RandomForestRegressor(random_state=42),
        {'n_estimators': [100, 200], 'max_depth': [None, 10], 'min_samples_split': [2]},
    ),
    ('Ridge', Ridge(random_state=42), {'alpha': [0.1, 1.0, 10.0]}),
    ('Lasso', Lasso(random_state=42), {'alpha': [0.1, 1.0, 10.0]}),
    ('SVR', SVR(), {'C': [1.0, 10.0], 'kernel': ['rbf']}),
]

param_grids = {label: grid for label, _, grid in _model_specs}
models = {label: estimator for label, estimator, _ in _model_specs}

# 7. Perform nested CV
print("\nPerforming Nested Cross-Validation...")
# Maps each model name to its average outer-fold R² and MSE.
nested_cv_results = {}

for name, model in models.items():
    print(f"\nEvaluating {name}...")
    avg_r2, avg_mse = perform_nested_cv(X, y, model, param_grids[name], name)
    nested_cv_results[name] = dict(R2=avg_r2, MSE=avg_mse)

# 8. Train final Random Forest for SHAP analysis
rf_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42)),
])
# fit() returns the pipeline itself, so the fitted forest can be pulled out directly.
rf_model = rf_pipeline.fit(X, y).named_steps['rf']

# 9. SHAP Analysis
print("\nPerforming SHAP analysis...")
explainer = shap.TreeExplainer(rf_model)

# The forest was trained on imputed + scaled values, so explain it on the same
# representation by replaying the pipeline's fitted preprocessing steps.
fitted_imputer = rf_pipeline.named_steps['imputer']
fitted_scaler = rf_pipeline.named_steps['scaler']
X_processed = pd.DataFrame(
    fitted_scaler.transform(fitted_imputer.transform(X)),
    columns=X.columns,
)
shap_values = explainer.shap_values(X_processed)

plt.figure(figsize=(10, 6))
# show=False leaves the figure open so a title can be added before rendering.
shap.summary_plot(shap_values, X_processed, plot_type="bar", show=False)
plt.title("SHAP Feature Importance")
plt.tight_layout()
plt.show()

# 10. Visualize model comparison
# One bar per model, in the order the models were evaluated.
model_names = list(nested_cv_results)
r2_scores = [nested_cv_results[label]['R2'] for label in model_names]

plt.figure(figsize=(10, 6))
plt.bar(model_names, r2_scores)
plt.title('Model Performance Comparison (Nested CV)')
plt.xlabel('Models')
plt.ylabel('Average R² Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 11. Create prediction function
def predict_movie_rating(runtime, imdb_rating, year, num_votes):
    """
    Predict the rating of one movie with a Ridge regression pipeline.

    A new pipeline (median imputation -> standard scaling -> Ridge) is fitted
    on the module-level ``X`` and ``y`` each time this function is called.

    Parameters:
        runtime: Value for the 'Runtime (mins)' feature.
        imdb_rating: Value for the 'IMDb Rating' feature.
        year: Value for the 'Year' feature.
        num_votes: Value for the 'Num Votes' feature.

    Returns:
        The predicted rating, which is also printed with two decimals.
    """
    # Fit the Ridge pipeline on the entire dataset.
    ridge_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', Ridge(random_state=42)),
    ])
    ridge_pipeline.fit(X, y)

    # Single-row frame with the same column names used for training.
    feature_values = {
        'Runtime (mins)': runtime,
        'IMDb Rating': imdb_rating,
        'Year': year,
        'Num Votes': num_votes,
    }
    single_movie = pd.DataFrame({column: [value] for column, value in feature_values.items()})

    predicted_rating = ridge_pipeline.predict(single_movie)[0]
    print(f"Predicted Rating (Ridge): {predicted_rating:.2f}")
    return predicted_rating



In [None]:
def predict_multiple_movies(movies, model_choice='Ridge'):
    """
    Predict ratings for multiple movies using the specified model.

    A pipeline (median imputation -> standard scaling -> chosen model) is
    fitted on the module-level ``X`` and ``y`` on every call. The estimator is
    cloned first, so the shared entry in ``models`` is never fitted in place.

    Parameters:
        movies (list of dict): A list of dictionaries, each containing:
            - 'Title': Title of the movie
            - one entry per training feature column of ``X``
              ('Runtime (mins)', 'IMDb Rating', 'Year', 'Num Votes')
        model_choice (str): Key of the model in ``models`` to use
            ('Ridge', 'Lasso', etc.)

    Returns:
        dict: A dictionary with movie titles as keys and predicted ratings as
        values. An empty ``movies`` list yields an empty dictionary. If two
        movies share a title, the later one overwrites the earlier one.

    Raises:
        ValueError: If ``model_choice`` is not a key of ``models``.
        KeyError: If a movie lacks 'Title' or one of the feature columns.
    """
    # Check if the chosen model exists
    if model_choice not in models:
        raise ValueError(f"Model '{model_choice}' not found. Choose from: {list(models.keys())}")

    # Nothing to predict: avoid fitting and then failing on a column-less frame.
    if not movies:
        return {}

    # Take the feature names from the training frame so prediction inputs can
    # never drift out of sync with the columns the model was fitted on.
    feature_columns = list(X.columns)
    movie_features = pd.DataFrame(
        [{column: movie[column] for column in feature_columns} for movie in movies],
        columns=feature_columns,
    )

    # Titles for differentiation
    titles = [movie['Title'] for movie in movies]

    # Pipeline does not copy its steps, so fitting it would fit the estimator
    # object stored in `models`; clone() gives an unfitted copy with the same
    # hyper-parameters and leaves the shared object untouched.
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', clone(models[model_choice]))
    ])
    pipeline.fit(X, y)

    # Make predictions
    predictions = pipeline.predict(movie_features)

    # Combine titles with predictions
    return dict(zip(titles, predictions))



In [None]:
# Sample movies to score. Each row lists its values in the order of `_movie_fields`.
_movie_fields = ('Title', 'Runtime (mins)', 'IMDb Rating', 'Year', 'Num Votes')
_movie_rows = [
    # NOTE(review): 4,020,000 votes looks unusually high -- possibly an extra zero; confirm.
    ('About Time', 123, 7.4, 2013, 4020000),
    ('The Lighthouse', 149, 7.4, 2019, 269000),
    ('The Substance', 141, 7.4, 2024, 161000),
    ('A nightmare on elm street', 91, 7.4, 1984, 213000),
]
test_movies = [dict(zip(_movie_fields, row)) for row in _movie_rows]

predicted_ratings = predict_multiple_movies(test_movies, model_choice='Ridge')

# Report one "title: rating" line per movie, rounded to two decimals.
print("Predicted Ratings:")
for title, rating in predicted_ratings.items():
    print(f"{title}: {rating:.2f}")
