In [8]:
# Step 1: Installing necessary libraries
!pip install pandas scikit-learn xgboost

# Step 2: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor
import numpy as np

# Step 3: Load the data
# Both CSV files are read relative to the current working directory.
train_data = pd.read_csv('Train_dataset.csv')
test_data = pd.read_csv('Test_dataset.csv')

# Step 4: Data Preprocessing
# Split the training frame into predictors (X) and the target (y).
X = train_data.drop(columns=['Annual Turnover'])
y = train_data['Annual Turnover']

# Keep 'Registration Number' in the test predictors. The training predictors
# keep it and it is multiplied into 'Interaction_Feature' later in this cell.
# Dropping it only from the test frame made the align() call below re-create
# the column filled with 0, so test rows were scored with a constant 0 where
# the models had been trained on real values.
# NOTE(review): 'Registration Number' looks like a row identifier; consider
# removing it from the training features as well -- TODO confirm.
X_test = test_data.copy()

# Drop non-important columns if they exist
X = X.drop(columns=['City', 'Cuisine'], errors='ignore')
X_test = X_test.drop(columns=['City', 'Cuisine'], errors='ignore')

# One-hot encode categorical variables
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)

# Align the columns of training and test datasets
# join='left' keeps exactly the training columns, in training order; columns
# the test frame lacks (e.g. categories unseen in the test file) are added
# and filled with 0, and test-only columns are discarded.
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

# Step 5: Splitting the Data
# 80/20 train/validation split with a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on explicit copies: the frames returned by the split are derived from X,
# and assigning columns into them can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_val = X_val.copy()

# Step 6: Impute missing values only for numeric columns
# 'Registration Number' is excluded from imputation and left untouched.
numeric_cols = X_train.select_dtypes(include=['number']).columns
numeric_cols = numeric_cols.drop('Registration Number', errors='ignore')
# Column means are learned from the training split only and then reused for
# the validation and test frames, so no statistics leak from held-out data.
imputer = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
X_val[numeric_cols] = imputer.transform(X_val[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

# Feature Engineering
# Create interaction features
print(X_train.columns)
# Add the same interaction column to the training and validation frames so
# both have an identical column layout before the selector is fitted/applied.
for features in (X_train, X_val):
    features['Interaction_Feature'] = features['Registration Number'] * features['Overall Restaurant Rating']

# Feature Selection
# Rank features by Gradient Boosting importance and keep the 10 most important.
# SelectFromModel.fit() fits its own clone of the estimator, so the estimator
# is passed in unfitted; fitting it beforehand only trained the same model twice.
gb_reg = GradientBoostingRegressor(random_state=42)

# threshold=-np.inf disables the default importance cut-off, so the selection
# is driven by max_features alone and always keeps the top 10 features.
selector = SelectFromModel(gb_reg, threshold=-np.inf, max_features=10)
selector.fit(X_train, y_train)

# Transform the data
X_train_selected = selector.transform(X_train)

# Check the columns of the validation set
print("Columns of X_val before transformation:", X_val.columns)

# Transform the validation set
X_val_selected = selector.transform(X_val)

# Print the number of columns of the transformed validation set
print("Columns of X_val after transformation:", X_val_selected.shape[1])

# Model Ensemble
# Two seeded regressors trained on the full training feature set; fit()
# returns the fitted estimator, so construction and training are chained.
rf_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)
xgb_reg = XGBRegressor(random_state=42).fit(X_train, y_train)

# Validation-set predictions from each model
rf_preds, xgb_preds = (model.predict(X_val) for model in (rf_reg, xgb_reg))

# Blend the two models with an unweighted average
prediction_sum = rf_preds + xgb_preds
ensemble_preds = prediction_sum / 2

# Score the blend: RMSE is the square root of the mean squared error
ensemble_mse = mean_squared_error(y_val, ensemble_preds)
ensemble_rmse = np.sqrt(ensemble_mse)
print("Ensemble Model Validation RMSE:", ensemble_rmse)

# Step 7: Model Building - Gradient Boosting Regression with Hyperparameter Tuning
# 3 x 3 x 3 = 27 candidate settings, each scored with 3-fold cross-validation
# on the selected training features.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

gb_reg = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(estimator=gb_reg, param_grid=param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=2)
grid_search.fit(X_train_selected, y_train)

best_params = grid_search.best_params_
# GridSearchCV refits the winning configuration on all of the data passed to
# fit() by default (refit=True), so reuse that estimator instead of training
# an identical model a second time.
gb_reg_best = grid_search.best_estimator_

# Step 8: Model Evaluation
train_predictions_best = gb_reg_best.predict(X_train_selected)
val_predictions_best = gb_reg_best.predict(X_val_selected)

# RMSE is the square root of the mean squared error.
train_rmse_best = np.sqrt(mean_squared_error(y_train, train_predictions_best))
val_rmse_best = np.sqrt(mean_squared_error(y_val, val_predictions_best))

print("Train RMSE with best parameters:", train_rmse_best)
print("Validation RMSE with best parameters:", val_rmse_best)

# Give the test frame the same interaction column the selector was fitted
# with; the step is skipped when the column is already present.
if 'Interaction_Feature' not in X_test:
    registration = X_test['Registration Number']
    overall_rating = X_test['Overall Restaurant Rating']
    X_test['Interaction_Feature'] = registration.mul(overall_rating)

# Step 9: Making Predictions
# Reduce the test frame with the fitted selector, then score it with the
# tuned Gradient Boosting model.
X_test_selected = selector.transform(X_test)
test_predictions_best = gb_reg_best.predict(X_test_selected)

# Step 10: Creating Submission File
# One row per test record: its registration number and the predicted turnover.
submission_columns = {
    'Registration Number': test_data['Registration Number'],
    'Annual Turnover': test_predictions_best,
}
submission_df = pd.DataFrame(submission_columns)
submission_path = 'submission_greadsearch_featureenginnering_hypertunning.csv'
submission_df.to_csv(submission_path, index=False)


You should consider upgrading via the 'c:\users\rahulbhave\code\gl_hackathon_restaurent_turnover_prediction\env\scripts\python.exe -m pip install --upgrade pip' command.


Index(['Registration Number', 'Facebook Popularity Quotient',
       'Instagram Popularity Quotient', 'Fire Audit',
       'Liquor License Obtained', 'Situated in a Multi Complex',
       'Dedicated Parking', 'Open Sitting Available', 'Resturant Tier',
       'Restaurant Zomato Rating',
       ...
       'Restaurant Theme_Parsi', 'Restaurant Theme_Petit',
       'Restaurant Theme_Picante', 'Restaurant Theme_Piquant',
       'Restaurant Theme_Resca', 'Restaurant Theme_Sage',
       'Restaurant Theme_Savory', 'Restaurant Theme_Spoon',
       'Restaurant Theme_TheGem', 'Restaurant Theme_Umami'],
      dtype='object', length=1847)
Columns of X_val before transformation: Index(['Registration Number', 'Facebook Popularity Quotient',
       'Instagram Popularity Quotient', 'Fire Audit',
       'Liquor License Obtained', 'Situated in a Multi Complex',
       'Dedicated Parking', 'Open Sitting Available', 'Resturant Tier',
       'Restaurant Zomato Rating',
       ...
       'Restaurant Theme_P