In [23]:
# Step 1: Installing necessary libraries
!pip install pandas scikit-learn

# Step 2: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

# Step 3: Load the data
train_data = pd.read_csv('Train_dataset.csv')
test_data = pd.read_csv('Test_dataset.csv')

# Step 4: Data Preprocessing
X = train_data.drop(columns=['Annual Turnover'])
y = train_data['Annual Turnover']
X_test = test_data.drop(columns=['Registration Number'])

# Drop non-important columns if they exist
X = X.drop(columns=['City', 'Cuisine'], errors='ignore')
X_test = X_test.drop(columns=['City', 'Cuisine'], errors='ignore')

# One-hot encode categorical variables
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)

# Align the columns of training and test datasets
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

# Step 5: Splitting the Data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Impute missing values only for numeric columns
numeric_cols = X_train.select_dtypes(include=['number']).columns
numeric_cols = numeric_cols.drop('Registration Number', errors='ignore')
imputer = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
X_val[numeric_cols] = imputer.transform(X_val[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

# Step 7: Model Building - Gradient Boosting Regression
gb_reg = GradientBoostingRegressor(random_state=42)
gb_reg.fit(X_train, y_train)

# Step 8: Model Evaluation
train_predictions = gb_reg.predict(X_train)
val_predictions = gb_reg.predict(X_val)

train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))

print("Train RMSE:", train_rmse)
print("Validation RMSE:", val_rmse)

# Step 9: Making Predictions
test_predictions = gb_reg.predict(X_test)

# Step 10: Creating Submission File
submission_df = pd.DataFrame({'Registration Number': test_data['Registration Number'], 'Annual Turnover': test_predictions})
submission_df.to_csv('submission_gradient_boosting.csv', index=False)




You should consider upgrading via the 'c:\users\rahulbhave\code\gl_hackathon_restaurent_turnover_prediction\env\scripts\python.exe -m pip install --upgrade pip' command.


Train RMSE: 13704622.272939535
Validation RMSE: 20610665.3648779
