
The purpose of this file is to impute values for budget based on a round robin regression on all features of the dataset.
---
Source:
https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html


In [2]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pandas as pd
import csv
import numpy as np
from math import exp
import ast
from sklearn.ensemble import RandomForestRegressor

In [1]:
import sklearn
print(sklearn.__version__)

1.1.1


In [3]:
# Initialize csv file paths
csv_file_path= 'final_test_dataset_with_categorical.csv'
# Create DataFrames from both CSV files
df = pd.read_csv(csv_file_path)

In [4]:
# Assuming 'df' is your existing dataframe
df['Merged Budget'] = df['Merged Budget'].replace(100000, np.nan)

selected_columns = [
    'IMDB Budget', 'TMDB Budget', 'TMDB Revenue', 'Runtime', 'IMDB Domestic Revenue',
    'international_revenue', 'worldwide_revenue', 'Merged Budget', 'Merged Revenue',
    'Release Year', 'Unweighted Star Score', 'Simple Weight Star Score',
    'Log Weight Star Score', 'Exponential Weight Star Score',
    'Movie Contribution to Director and Production Scores', 'Total Director Score',
    'Avg Director Score', 'Total Production Company Score',
    'Avg Production Company Score', 'Genre Cluster', 'Domestic Distributor ID',
    'Domestic Distributor Score', 'G', 'NC-17', 'NR', 'PG', 'PG-13', 'R',
    'Adj Merged Budget', 'Adj Merged Revenue', 'Has Star Score',
    'Has Director Score', 'Has Production Company Score',
    'Has Domestic Distributor Score', 'Unweighted Star Score_normalized',
    'Simple Weight Star Score_normalized', 'Log Weight Star Score_normalized',
    'Exponential Weight Star Score_normalized', 'Total Director Score_normalized',
    'Avg Director Score_normalized', 'Total Production Company Score_normalized',
    'Avg Production Company Score_normalized', 'Domestic Distributor Score_normalized'
]
fit_df = df[selected_columns]
# Remove rows where 'Merged Budget' is NaN
fit_df = fit_df.dropna(subset=['Merged Budget'])

# Create a new dataframe with selected columns, skipping the first row
new_df = df[selected_columns]
# Replace values equal to 100000 in the 'Merged Budget' column with empty values

# Display the updated dataframe
new_df.head()  # Use .head() to display the first few rows for verification


Unnamed: 0,IMDB Budget,TMDB Budget,TMDB Revenue,Runtime,IMDB Domestic Revenue,international_revenue,worldwide_revenue,Merged Budget,Merged Revenue,Release Year,...,Has Domestic Distributor Score,Unweighted Star Score_normalized,Simple Weight Star Score_normalized,Log Weight Star Score_normalized,Exponential Weight Star Score_normalized,Total Director Score_normalized,Avg Director Score_normalized,Total Production Company Score_normalized,Avg Production Company Score_normalized,Domestic Distributor Score_normalized
0,150000000,150000000,1274219009,102,400953009,883587509,1284541000.0,150000000.0,400953009,2023,...,1,0.644086,-0.428853,-0.788576,-1.231288,0.921407,-1.003354,0.826698,1.138798,0.892038
1,0,150000000,690860472,107,248757044,438471864,687228900.0,150000000.0,248757044,2023,...,1,0.700711,0.814067,0.937757,0.892256,-0.198945,0.808867,0.826698,0.903856,0.892038
2,0,10000000,78966486,94,48958273,30028205,78986480.0,10000000.0,48958273,2023,...,1,0.862544,0.55847,0.056148,0.201369,0.706923,0.123777,0.205614,0.247684,0.676854
3,0,800000,0,93,325583,249596,575179.0,800000.0,325583,2023,...,1,-1.232445,-1.327973,-1.318686,-1.037391,0.024811,-0.966397,0.055264,0.047158,0.28875
4,0,0,0,91,1459337,1459337,,,1459337,2022,...,1,-0.449184,-0.357243,-0.049826,-0.996807,0.697694,-0.621639,-0.559957,-0.514511,0.46261


In [5]:
# DO NOT run this cell unless you are testing estimators
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

N_SPLITS = 5

# Assuming 'fit_df' is your existing DataFrame

# Dummy y values for the sake of the example
y_values = np.random.random(size=len(fit_df))

# Replace NaNs in the DataFrame with numpy NaNs
fit_df.replace('', np.nan, inplace=True)

fit_df = fit_df.iloc[:1000]  # Taking the first 1000 samples for training
y_values = y_values[:1000]

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
fit_df = imputer.fit_transform(fit_df)

n_samples, n_features = fit_df.shape

# Estimate the score on the entire dataset, with no missing values
br_estimator = BayesianRidge()
score_full_data = pd.DataFrame(
    cross_val_score(
        br_estimator, fit_df, y_values, scoring="neg_mean_squared_error", cv=N_SPLITS
    ),
    columns=["Full Data"],
)

# Add a single missing value to each row
fit_df_missing = fit_df.copy()
missing_samples = np.arange(n_samples)
missing_features = np.random.choice(n_features, n_samples, replace=True)
fit_df_missing[missing_samples, missing_features] = np.nan

# Estimate the score after imputation (mean and median strategies)
score_simple_imputer = pd.DataFrame()
for strategy in ("mean", "median"):
    estimator = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator
    )
    score_simple_imputer[strategy] = cross_val_score(
        estimator, fit_df_missing, y_values, scoring="neg_mean_squared_error", cv=N_SPLITS
    )

# Estimate the score after iterative imputation of the missing values
# with different estimators
estimators = [
    BayesianRidge(),
    RandomForestRegressor(n_estimators=4, max_depth=10, random_state=0),
    make_pipeline(Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3)),
    KNeighborsRegressor(n_neighbors=15),
]
score_iterative_imputer = pd.DataFrame()
tolerances = (1e-3, 1e-1, 1e-1, 1e-2)
for impute_estimator, tol in zip(estimators, tolerances):
    estimator = make_pipeline(
        IterativeImputer(
            random_state=0, estimator=impute_estimator, max_iter=50, tol=tol
        ),
        br_estimator,
    )
    score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score(
        estimator, fit_df_missing, y_values, scoring="neg_mean_squared_error", cv=N_SPLITS
    )

scores = pd.concat(
    [score_full_data, score_simple_imputer, score_iterative_imputer],
    keys=["Original", "SimpleImputer", "IterativeImputer"],
    axis=1,
)

# plot results
fig, ax = plt.subplots(figsize=(13, 6))
means = -scores.mean()
errors = scores.std()
means.plot.barh(xerr=errors, ax=ax)
ax.set_title("Regression with Different Imputation Methods")
ax.set_xlabel("MSE (smaller is better)")
ax.set_yticks(np.arange(means.shape[0]))
ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()])
plt.tight_layout(pad=1)
plt.show()


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 870, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\impute\_iterative.py", line 665, in fit_transform
    Xt, estimator = self._impute_one_feature(
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\impute\_iterative.py", line 318, in _impute_one_feature
    estimator.fit(X_train, y_train)
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_ridge.py", line 1130, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_ridge.py", line 889, in fit
    self.coef_, self.n_iter_ = _ridge_regression(
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_ridge.py", line 699, in _ridge_regression
    coef = _solve_cholesky(X, y, alpha)
  File "c:\Users\Sam Oberly\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_ridge.py", line 212, in _solve_cholesky
    return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
TypeError: solve() got an unexpected keyword argument 'sym_pos'


In [None]:
imp = IterativeImputer(min_value=0, max_iter=100, random_state=1, estimator=RandomForestRegressor())
imp.fit(fit_df)
IterativeImputer(random_state=1)
imp_df = imp.transform(new_df)
imp_df = pd.DataFrame(imp_df, columns=new_df.columns)
imp_df.head()

Unnamed: 0,IMDB Budget,TMDB Budget,TMDB Revenue,Runtime,IMDB Domestic Revenue,international_revenue,worldwide_revenue,Merged Budget,Merged Revenue,Release Year,...,Has Domestic Distributor Score,Unweighted Star Score_normalized,Simple Weight Star Score_normalized,Log Weight Star Score_normalized,Exponential Weight Star Score_normalized,Total Director Score_normalized,Avg Director Score_normalized,Total Production Company Score_normalized,Avg Production Company Score_normalized,Domestic Distributor Score_normalized
0,150000000.0,150000000.0,1274219000.0,102.0,400953009.0,883587509.0,1284541000.0,150000000.0,400953009.0,2023.0,...,1.0,0.644086,-0.428853,-0.788576,-1.231288,0.921407,-1.003354,0.826698,1.138798,0.892038
1,0.0,150000000.0,690860500.0,107.0,248757044.0,438471864.0,687228900.0,150000000.0,248757044.0,2023.0,...,1.0,0.700711,0.814067,0.937757,0.892256,-0.198945,0.808867,0.826698,0.903856,0.892038
2,0.0,10000000.0,78966490.0,94.0,48958273.0,30028205.0,78986480.0,10000000.0,48958273.0,2023.0,...,1.0,0.862544,0.55847,0.056148,0.201369,0.706923,0.123777,0.205614,0.247684,0.676854
3,0.0,800000.0,0.0,93.0,325583.0,249596.0,575179.0,800000.0,325583.0,2023.0,...,1.0,-1.232445,-1.327973,-1.318686,-1.037391,0.024811,-0.966397,0.055264,0.047158,0.28875
4,0.0,0.0,0.0,91.0,1459337.0,1459337.0,1968466.0,502500.0,1459337.0,2022.0,...,1.0,-0.449184,-0.357243,-0.049826,-0.996807,0.697694,-0.621639,-0.559957,-0.514511,0.46261


In [None]:
fin_df = pd.read_csv(csv_file_path)

# Selecting only the 'Merged Budget' column from imp_df
imp_df_merged_budget = imp_df[['Merged Budget']]

# Replace 'Merged Budget' column in 'fin_df' with the values from 'imp_df_merged_budget'
fin_df['Merged Budget'] = imp_df_merged_budget['Merged Budget']
imp_df.head()
# Save the modified DataFrame 'fin_df' as a CSV file

# Uncomment the next line to save
fin_df.to_csv('imputed_budget_test.csv', index=False)