In [40]:
# Installing necessary libraries
! pip install pandas scikit-learn

# Step 2: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

# Step 3: Load the data
# Assuming you have 'train.csv' and 'test.csv' files in your working directory
train_data = pd.read_csv('Train_dataset.csv')
test_data = pd.read_csv('Test_dataset.csv')

# Step 4: Data Preprocessing
X = train_data.drop(columns=['Annual Turnover'])
y = train_data['Annual Turnover']
X_test = test_data.drop(columns=['Registration Number'])

# Step 5: Splitting the Data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Feature Engineering
# Assuming 'Cuisine' is a categorical variable
X_train['Cuisine'] = X_train['Cuisine'].apply(lambda x: x.split(','))
X_val['Cuisine'] = X_val['Cuisine'].apply(lambda x: x.split(','))
X_test['Cuisine'] = X_test['Cuisine'].apply(lambda x: x.split(','))

# Step 7: One-Hot Encoding
cuisines_train = X_train['Cuisine'].apply(pd.Series)
cuisines_val = X_val['Cuisine'].apply(pd.Series)
cuisines_test = X_test['Cuisine'].apply(pd.Series)

X_train = pd.concat([X_train, pd.get_dummies(cuisines_train, prefix='Cuisine')], axis=1).drop(columns=['Cuisine'])
X_val = pd.concat([X_val, pd.get_dummies(cuisines_val, prefix='Cuisine')], axis=1).drop(columns=['Cuisine'])
X_test = pd.concat([X_test, pd.get_dummies(cuisines_test, prefix='Cuisine')], axis=1).drop(columns=['Cuisine'])

# Step 8: Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_numeric)
X_val_imputed = imputer.transform(X_val_numeric)
X_test_imputed = imputer.transform(X_test_numeric)

# Step 9: Model Building
model = LinearRegression()
model.fit(X_train_imputed, y_train)

# Step 10: Model Evaluation
train_predictions = model.predict(X_train_imputed)
val_predictions = model.predict(X_val_imputed)

train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))

print("Train RMSE:", train_rmse)
print("Validation RMSE:", val_rmse)

# Step 11: Making Predictions
test_predictions = model.predict(X_test_imputed)

# Step 12: Creating Submission File
submission_df = pd.DataFrame({'Registration Number': test_data['Registration Number'], 'Annual Turnover': test_predictions})
submission_df.to_csv('submission.csv', index=False)




You should consider upgrading via the 'c:\users\rahulbhave\code\gl_hackathon_restaurent_turnover_prediction\env\scripts\python.exe -m pip install --upgrade pip' command.


Train RMSE: 20684399.429881968
Validation RMSE: 20128061.810373373
