In [4]:
# Step 1: Installing necessary libraries
!pip install pandas scikit-learn

# Step 2: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Step 3: Load the data
train_data = pd.read_csv('Train_dataset.csv')
test_data = pd.read_csv('Test_dataset.csv')

# Step 4: Data Preprocessing
# Target column.
y = train_data['Annual Turnover']

# Feature frames. 'Registration Number' is a row identifier (it is only used as
# the key of the submission file), so it is removed from BOTH frames. Dropping
# it from the test frame only would leave it in the training features, and the
# left-join alignment below would then re-create it in X_test as a constant 0,
# i.e. the model would be trained on a column that never varies at prediction
# time.
X = train_data.drop(columns=['Annual Turnover'])
X = X.drop(columns=['Registration Number'], errors='ignore')
X_test = test_data.drop(columns=['Registration Number'])

# Drop non-important columns if they exist
X = X.drop(columns=['City', 'Cuisine'], errors='ignore')
X_test = X_test.drop(columns=['City', 'Cuisine'], errors='ignore')

# One-hot encode categorical variables
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)

# Align the columns of training and test datasets.
# join='left' keeps exactly the training columns: dummy columns that exist only
# in the test frame are discarded, and training columns absent from the test
# frame are added to it filled with 0.
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

# Step 5: Splitting the Data
# 20% of the training rows are held out for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Impute missing values only for numeric columns
# The column means are learned from the training split and then applied,
# unchanged, to the training, validation and test frames.
numeric_cols = (
    X_train.select_dtypes(include=['number'])
    .columns
    .drop('Registration Number', errors='ignore')
)
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train[numeric_cols])
X_train[numeric_cols] = imputer.transform(X_train[numeric_cols])
X_val[numeric_cols] = imputer.transform(X_val[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

# Step 7: Feature Selection
# SelectFromModel fits the gradient-boosting estimator on the training split;
# max_features=500 caps how many features may be kept.
gb_reg = GradientBoostingRegressor(random_state=42)
selector = SelectFromModel(gb_reg, max_features=500)
selector.fit(X_train, y_train)

# Apply the fitted selector to every split (train, validation, test - in that order)
X_train_selected, X_val_selected, X_test_selected = (
    selector.transform(frame) for frame in (X_train, X_val, X_test)
)

# Step 8: Feature Engineering with Polynomial Features
# Degree-2 expansion without the constant bias column; the expansion is fitted
# on the selected training features and reused for the other two splits.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train_selected)
X_train_poly, X_val_poly, X_test_poly = (
    poly.transform(selected)
    for selected in (X_train_selected, X_val_selected, X_test_selected)
)

# Step 9: Model Building - Gradient Boosting Regression with Hyperparameter Tuning
# Exhaustive search over 3 x 3 x 3 = 27 parameter combinations, each scored by
# 3-fold cross-validated (negated) RMSE on the training split.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

gb_reg = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(estimator=gb_reg, param_grid=param_grid, cv=3, scoring='neg_root_mean_squared_error', verbose=2)
grid_search.fit(X_train_poly, y_train)

best_params = grid_search.best_params_
# GridSearchCV refits by default (refit=True): after the search it fits a clone
# of gb_reg, configured with best_params, on the full X_train_poly / y_train.
# Reuse that fitted model instead of training an identical one a second time.
gb_reg_best = grid_search.best_estimator_

# Step 10: Model Evaluation
# Predict on the training and validation splits, then report RMSE for each.
train_predictions_best, val_predictions_best = (
    gb_reg_best.predict(features) for features in (X_train_poly, X_val_poly)
)

train_rmse_best = np.sqrt(mean_squared_error(y_train, train_predictions_best))
val_rmse_best = np.sqrt(mean_squared_error(y_val, val_predictions_best))

print("Train RMSE with best parameters:", train_rmse_best)
print("Validation RMSE with best parameters:", val_rmse_best)

# Step 11: Making Predictions
test_predictions_best = gb_reg_best.predict(X_test_poly)

# Step 12: Creating Submission File
# One row per test record: its registration number and the predicted turnover.
submission_df = pd.DataFrame(
    {
        'Registration Number': test_data['Registration Number'],
        'Annual Turnover': test_predictions_best,
    }
)
submission_df.to_csv(
    'submission_gradient_boosting_with_polyfeatures_and_feature_selection.csv',
    index=False,
)




You should consider upgrading via the 'c:\users\rahulbhave\code\gl_hackathon_restaurent_turnover_prediction\env\scripts\python.exe -m pip install --upgrade pip' command.


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=100; total time=  16.8s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=100; total time=  17.8s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=100; total time=  17.7s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=200; total time=  33.9s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=200; total time=  36.6s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=200; total time=  35.2s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=300; total time=  50.7s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=300; total time=  52.9s
[CV] END ..learning_rate=0.05, max_depth=3, n_estimators=300; total time=  53.5s
[CV] END ..learning_rate=0.05, max_depth=4, n_estimators=100; total time=  21.1s
[CV] END ..learning_rate=0.05, max_depth=4, n_estimators=100; total time=  22.4s
[CV] END ..learning_rate=0.05, max_depth=4, n_es