## Import Libraries & Load Data

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/restaurant-revenue-prediction/train.csv.zip
/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv
/kaggle/input/restaurant-revenue-prediction/test.csv.zip


In [32]:
INPUT_DIR = '/kaggle/input/restaurant-revenue-prediction'
df = pd.read_csv(f'{INPUT_DIR}/train.csv.zip')
test_df = pd.read_csv(f'{INPUT_DIR}/test.csv.zip')

## Step 1: Handling Missing Data

In [46]:
# Check for missing values
print(df.isnull().sum())

Id            0
City          0
City Group    0
Type          0
P1            0
P2            0
P3            0
P4            0
P5            0
P6            0
P7            0
P8            0
P9            0
P10           0
P11           0
P12           0
P13           0
P14           0
P15           0
P16           0
P17           0
P18           0
P19           0
P20           0
P21           0
P22           0
P23           0
P24           0
P25           0
P26           0
P27           0
P28           0
P29           0
P30           0
P31           0
P32           0
P33           0
P34           0
P35           0
P36           0
P37           0
revenue       0
Year          0
Month         0
Day           0
dtype: int64


### There are no missing values in the train data

## Step 1: Data Preprocessing

In [34]:
# Convert 'Open Date' to datetime
df['Open Date'] = pd.to_datetime(df['Open Date'])

# Extract features from 'Open Date'
df['Year'] = df['Open Date'].dt.year
df['Month'] = df['Open Date'].dt.month
df['Day'] = df['Open Date'].dt.day

# Drop original 'Open Date' column
df = df.drop('Open Date', axis=1)

# Separate features and target
X = df.drop('revenue', axis=1)
y = df['revenue']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'), categorical_features)
    ])

## Step 2: Experiment with Regression Methods

In [35]:
# Define the models
rf = RandomForestRegressor(random_state=42)
dt = DecisionTreeRegressor(random_state=42)
svr = SVR()

# Define parameter grids for each model
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}
dt_params = {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
svr_params = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']}

# Create pipelines
rf_pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', rf)])
dt_pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', dt)])
svr_pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', svr)])

# Perform GridSearchCV for each model
rf_grid = GridSearchCV(rf_pipeline, {'regressor__' + k: v for k, v in rf_params.items()}, 
                       cv=5, scoring='neg_mean_squared_error')
dt_grid = GridSearchCV(dt_pipeline, {'regressor__' + k: v for k, v in dt_params.items()}, 
                       cv=5, scoring='neg_mean_squared_error')
svr_grid = GridSearchCV(svr_pipeline, {'regressor__' + k: v for k, v in svr_params.items()}, 
                        cv=5, scoring='neg_mean_squared_error')

# Fit the models
rf_grid.fit(X_train, y_train)
dt_grid.fit(X_train, y_train)
svr_grid.fit(X_train, y_train)



In [36]:
# Print best parameters and RMSE for each model
print("Random Forest - Best params:", rf_grid.best_params_)
print("Random Forest - RMSE:", np.sqrt(-rf_grid.best_score_))

print("Decision Tree - Best params:", dt_grid.best_params_)
print("Decision Tree - RMSE:", np.sqrt(-dt_grid.best_score_))

print("SVR - Best params:", svr_grid.best_params_)
print("SVR - RMSE:", np.sqrt(-svr_grid.best_score_))

Random Forest - Best params: {'regressor__max_depth': 20, 'regressor__n_estimators': 200}
Random Forest - RMSE: 2405639.764241817
Decision Tree - Best params: {'regressor__max_depth': 10, 'regressor__min_samples_split': 10}
Decision Tree - RMSE: 2856170.972389731
SVR - Best params: {'regressor__C': 0.1, 'regressor__kernel': 'rbf'}
SVR - RMSE: 2327952.5753197726


## Step 3: Ensemble Predictions (One-Layer Stacking)

In [37]:
# Create base models with best parameters
rf_best = RandomForestRegressor(**{k.split('__')[1]: v for k, v in rf_grid.best_params_.items()}, 
                                random_state=42)
dt_best = DecisionTreeRegressor(**{k.split('__')[1]: v for k, v in dt_grid.best_params_.items()}, 
                                random_state=42)
svr_best = SVR(**{k.split('__')[1]: v for k, v in svr_grid.best_params_.items()})

# Create the stacked model
stacked_model = StackingRegressor(
    estimators=[('rf', rf_best), ('dt', dt_best), ('svr', svr_best)],
    final_estimator=RandomForestRegressor(random_state=42)
)

# Create a pipeline with preprocessor and stacked model
stacked_pipeline = Pipeline([('preprocessor', preprocessor), ('stacked_model', stacked_model)])

## Step 4: Hyperparameter Tuning on Stacked Model

In [38]:
# Define parameter grid for the final estimator
stacked_params = {
    'stacked_model__final_estimator__n_estimators': [50, 100],
    'stacked_model__final_estimator__max_depth': [None, 10, 20]
}

# Perform GridSearchCV on the stacked model
stacked_grid = GridSearchCV(stacked_pipeline, stacked_params, cv=5, scoring='neg_mean_squared_error')
stacked_grid.fit(X_train, y_train)

# Print best parameters and RMSE for the stacked model
print("Stacked Model - Best params:", stacked_grid.best_params_)
print("Stacked Model - RMSE:", np.sqrt(-stacked_grid.best_score_))

# Evaluate the final model on the test set
y_pred = stacked_grid.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Final Test RMSE:", final_rmse)



Stacked Model - Best params: {'stacked_model__final_estimator__max_depth': 10, 'stacked_model__final_estimator__n_estimators': 50}
Stacked Model - RMSE: 2604483.643750342
Final Test RMSE: 3546816.0918153767




## Step 5: Make Train & Test data columns the same to fit the Ensemble model

In [39]:
date_column = [col for col in test_df.columns if 'date' in col.lower() or 'open' in col.lower()]
if date_column:
    date_column = date_column[0]
    test_df[date_column] = pd.to_datetime(test_df[date_column])
    test_df['Year'] = test_df[date_column].dt.year
    test_df['Month'] = test_df[date_column].dt.month
    test_df['Day'] = test_df[date_column].dt.day
    test_df = test_df.drop(date_column, axis=1)
else:
    print("No date column found in the test dataset.")

In [40]:
missing_cols = set(X.columns) - set(test_df.columns)
for col in missing_cols:
    test_df[col] = 0  # or any other appropriate default value

X_test = test_df[X.columns]

## Step 6: Generate Final Prediction Values & submission file

In [41]:
# Make predictions
y_pred_test = stacked_grid.predict(X_test)



In [42]:
# Create submission dataframe
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Prediction': y_pred_test
})

# Save to CSV
submission.to_csv('submission.csv', index=False)

## Step 7: Make Predictions for each model

In [43]:
# Make predictions for each model
rf_pred = rf_grid.predict(X_test)
dt_pred = dt_grid.predict(X_test)
svr_pred = svr_grid.predict(X_test)



In [44]:
# Create submission dataframes
rf_submission = pd.DataFrame({'Id': test_df['Id'], 'Prediction': rf_pred})
dt_submission = pd.DataFrame({'Id': test_df['Id'], 'Prediction': dt_pred})
svr_submission = pd.DataFrame({'Id': test_df['Id'], 'Prediction': svr_pred})

In [20]:
rf_submission.to_csv('submission.csv', index=False)

In [21]:
dt_submission.to_csv('submission.csv', index=False)

In [22]:
svr_submission.to_csv('submission.csv', index=False)

## EXTRA POINTS- SELECT TOP 10 FEATURES USING SelectFromModel

In [52]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

# Get the best estimator from the GridSearchCV result
best_estimator = rf_grid.best_estimator_

# Access the Random Forest Regressor from the pipeline
rf_regressor = best_estimator.named_steps['regressor']

# Create the SelectFromModel object
selector = SelectFromModel(rf_regressor, max_features=10, threshold=-np.inf)

# Fit the selector to the preprocessed training data
X_train_preprocessed = best_estimator.named_steps['preprocessor'].transform(X_train)
selector.fit(X_train_preprocessed, y_train)

# Get the selected feature indices
selected_feature_indices = selector.get_support(indices=True)

# Get the names of the selected features
feature_names = best_estimator.named_steps['preprocessor'].get_feature_names_out()
selected_features = feature_names[selected_feature_indices]

print("Top 10 selected features:")
for feature in selected_features:
    print(feature)

Top 10 selected features:
num__Id
num__P1
num__P12
num__P17
num__P20
num__P22
num__P23
num__P28
num__P29
cat__City_İstanbul
