In [19]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor

# Impute missing 'Budget_y' values with a Random Forest trained on runtime and
# revenue features, and report the error on a held-out set of known budgets.

# Load the CSV data into a Pandas DataFrame
df = pd.read_csv('../merged_data_with_revenues.csv')

# Features used both by the KNN imputer and by the regression model
features = ['Runtime', 'Revenue_y', 'domestic_revenue', 'international_revenue', 'worldwide_revenue']

# Rows usable for supervised learning: the same non-null 'Budget_x' filter as
# before, plus a known 'Budget_y' target (the regressor cannot fit NaN targets)
labeled_data = df.dropna(subset=['Budget_x', 'Budget_y'])

# Hold out 20% of the labeled rows for evaluation and REMOVE them from the
# training set; training on the rows being predicted makes any score meaningless
test_data = labeled_data.sample(frac=0.2, random_state=42).copy()
train_data = labeled_data.drop(index=test_data.index).copy()

# Create a K-nearest neighbors (KNN) imputer to replace missing feature values.
# It is fitted on the training rows only, so no hold-out information leaks in.
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
train_data[features] = knn_imputer.fit_transform(train_data[features])
test_data[features] = knn_imputer.transform(test_data[features])

# Prepare the training and test data
X_train = train_data[features]
y_train = train_data['Budget_y']
X_test = test_data[features]

# Create and train a Random Forest Regressor (seeded so runs are reproducible)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the hold-out rows, whose true 'Budget_y' was kept for comparison
predicted_budget_y = model.predict(X_test)
holdout_mae = (test_data['Budget_y'] - predicted_budget_y).abs().mean()
print(f"Hold-out MAE for Budget_y: {holdout_mae:,.2f} ({len(test_data)} rows)")

# Fill only the rows where 'Budget_y' is actually missing, so observed budgets
# in the original DataFrame are never overwritten by model output
missing_mask = df['Budget_y'].isna()
if missing_mask.any():  # transform/predict raise on zero-row input
    X_missing = pd.DataFrame(
        knn_imputer.transform(df.loc[missing_mask, features]),
        columns=features,
        index=df.index[missing_mask],
    )
    df.loc[missing_mask, 'Budget_y'] = model.predict(X_missing)

# Save the DataFrame with imputed values
df.to_csv('imputed_data.csv', index=False)

## Doesn't work all that well. Great!