<a href="https://colab.research.google.com/github/r-autowired/AIMLWorks/blob/main/Notes/ML_Models/RestaurantTurnOver_Hacked_MOD2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 300)

# to split the data into train and test
from sklearn.model_selection import train_test_split

# to build linear regression_model
from sklearn.linear_model import LinearRegression

# to check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor

# to suppress warnings
import warnings

warnings.filterwarnings("ignore")

In [2]:
# loading the dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
data = pd.read_csv("/content/drive/MyDrive/AIML_UT/Colab_Files/Pi2_ML/W1_Linear/Hacky/Train_dataset.csv")

In [9]:
city_corrections = {
    'bengaluru': 'bangalore',
    'banglore': 'bangalore',
    'gandhi nagar': 'gandhinagar',
    'navi mumbai': 'mumbai',
    'new delhi': 'delhi',
    'greater noida': 'noida',
    'ncr':'noida',
    'nouda':'noida',
    'pondi': 'pondicherry',
    'pondy': 'pondicherry',
    'punr': 'pune',
    'siliguri': 'singaruli',
    'una': 'unnao',
    'trivandrum': 'thiruvananthapuram',
    'technopark, trivandrum': 'thiruvananthapuram',
    'tirupathi': 'tirupati',
    'gurgoan': 'gurgaon',
    'kochi/cochin': 'cochin',
    'bankura': 'bangalore',
    'kochi': 'cochin',
    'am': 'ambala',
    'bhubaneshwar': 'bhubaneswar',
    'bhubneshwar': 'bhubaneswar',
    'gajiabaad': 'ghaziabad',
    'gaziabaad':"ghaziabad",
    'gurga':'gurgaon',
    'hderabad': 'hyderabad',
    'hyderabad(bhadurpally)': 'hyderabad',
    'muzzafarpur': 'muzaffarpur',
    'indirapuram, ghaziabad': 'ghaziabad',
    'nasikcity': 'nashik',
    'vizag':'visakhapatnam',
    'vsakhapttnam': 'visakhapatnam',
    'kolkata`': 'kolkata',
    # Add more as needed
}

# Function to clean city names
def clean_cities(city_str, corrections):
    if pd.isna(city_str):
        return city_str
    # Split by comma or ampersand, clean each city, then join back
    #cities = [city.strip() for city in city_str.replace('&', ',').split(',')]
    #cities = city_str.strip()
    corrected_cities = corrections.get(city_str, city_str)
    return corrected_cities

In [15]:
# Preprocessing
df_train = data.copy()
df_train['City'] = df_train['City'].fillna('Other')
df_train['Cuisine'] = df_train['Cuisine'].fillna('Other')


Cuisine_1 =[]
Cuisine_2 = []
for cus in df_train['Cuisine']:
    list_cus = cus.split(',')
    Cuisine_1.append(list_cus[0])
    Cuisine_2.append(list_cus[1])

df_train['Cuisine_1'] = Cuisine_1
df_train['Cuisine_2'] = Cuisine_2


df_train['Restaurant Theme'] = df_train['Restaurant Theme'].fillna('Other')
for col in ['Facebook Popularity Quotient', 'Instagram Popularity Quotient', 'Overall Restaurant Rating', 'Ambience',
            'Fire Audit', 'Liquor License Obtained', 'Restaurant Zomato Rating', 'Order Wait Time',
            'Staff Responsivness', 'Value for Money', 'Hygiene Rating', 'Food Rating', 'Lively',
            'Service', 'Comfortablility', 'Privacy']:
    if col in df_train.columns:
        df_train[col] = df_train[col].fillna(df_train[col].median())
for col in ['Live Music Rating', 'Comedy Gigs Rating']:
    if col in df_train.columns:
        df_train[col] = df_train[col].fillna(0)
        df_train[f'Has_{col.split()[0]}_{col.split()[1]}'] = df_train[col].notna().astype(int)

df_train['Opening Day of Restaurant'] = pd.to_datetime(df_train['Opening Day of Restaurant'], errors='coerce')
median_date = df_train['Opening Day of Restaurant'].dropna().median()
df_train['Opening Day of Restaurant'] = df_train['Opening Day of Restaurant'].fillna(median_date)
reference_date = pd.to_datetime('2025-03-29')
df_train['Days_Since_Opening'] = (reference_date - df_train['Opening Day of Restaurant']).dt.days.astype(float)
df_train = df_train.drop(columns=['Opening Day of Restaurant'])

### Making cities names all lower cases
df_train['City'] = df_train['City'].str.strip()
df_train['City'] = df_train['City'].str.lower()
# Clean the 'Cities' column (replace 'Cities' with your actual column name)
df_train['City'] = df_train['City'].apply(lambda x: clean_cities(x, city_corrections))

#df_train['Opening Day of Restaurant'] = pd.to_datetime(df_train['Opening Day of Restaurant'], errors='coerce').fillna(df_train['Opening Day of Restaurant'].dropna().median())
#df_train['Days_Since_Opening'] = (pd.to_datetime('2025-03-29') - df_train['Opening Day of Restaurant']).dt.days.astype(float)

# Derived features
df_train['Overall_Rating_Squared'] = df_train['Overall Restaurant Rating'] ** 2
df_train['Ambience_Squared'] = df_train['Ambience'] ** 2
df_train['Social_Media_Interaction'] = df_train['Facebook Popularity Quotient'] * df_train['Instagram Popularity Quotient']
df_train['Entertainment_Score'] = df_train['Live Music Rating'] + df_train['Comedy Gigs Rating']


df_train.drop(['Live Music Rating'], axis=1, inplace=True)
df_train.drop(['Comedy Gigs Rating'], axis=1, inplace=True)
df_train.drop(['Value Deals Rating'], axis=1, inplace=True)
df_train.drop(['Registration Number'], axis=1, inplace=True)
df_train.drop(['Resturant Tier'], axis=1, inplace=True)
df_train.drop(['Live Sports Rating'], axis=1, inplace=True)
df_train.drop(['Facebook Popularity Quotient', 'Instagram Popularity Quotient'], axis=1, inplace=True)
df_train.drop(['Overall Restaurant Rating', 'Ambience'], axis=1, inplace=True)
df_train.drop(['Cuisine'], axis=1, inplace=True)
#df_train.drop(['Has_Live_Music', 'Has_Comedy_Gigs'], axis=1, inplace=True)


# New numeric columns (assuming present)
for col in ['Fire Audit', 'Liquor License Obtained', 'Restaurant Zomato Rating', 'Order Wait Time', 'Staff Responsivness', 'Value for Money', 'Hygiene Rating', 'Food Rating', 'Lively', 'Service', 'Comfortablility', 'Privacy']:
    if col in df_train.columns:
        df_train[col] = df_train[col].fillna(df_train[col].median())
threshold = 20
for col in ['City', 'Cuisine_1', 'Cuisine_2', 'Restaurant Theme']:
    counts = df_train[col].value_counts()
    rare = counts[counts < threshold].index
    df_train[col] = df_train[col].replace(rare, 'Other_' + col)

# One-hot encode
df_train = pd.get_dummies(df_train, columns=['City', 'Cuisine_1', 'Cuisine_2', 'Restaurant Theme', 'Restaurant Type', 'Endorsed By', 'Restaurant Location'],
                          prefix=['City', 'Cuisine_1', 'Cuisine_2', 'Theme', 'Type', 'Endorsed', 'Location'], drop_first=True)

# Drop 'Restaurant ID'
if 'Restaurant ID' in df_train.columns:
    df_train = df_train.drop(columns=['Restaurant ID'])

# Log-transform target
df_train['Log_Annual_Turnover'] = np.log1p(df_train['Annual Turnover'])

# Features and target
X = df_train.drop(columns=['Annual Turnover', 'Log_Annual_Turnover', 'Opening Day of Restaurant'], errors='ignore')
y = df_train['Log_Annual_Turnover']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Step 2: Train Random Forest ---
rf_model = RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_split=10, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate
y_train_pred = np.expm1(rf_model.predict(X_train))
y_test_pred = np.expm1(rf_model.predict(X_test))
train_rmse = np.sqrt(mean_squared_error(np.expm1(y_train), y_train_pred))
test_rmse = np.sqrt(mean_squared_error(np.expm1(y_test), y_test_pred))
train_r2 = r2_score(np.expm1(y_train), y_train_pred)
test_r2 = r2_score(np.expm1(y_test), y_test_pred)

print(f"Random Forest - Train RMSE: {train_rmse:.2f}")
print(f"Random Forest - Test RMSE: {test_rmse:.2f}")
print(f"Random Forest - Train R-squared: {train_r2:.4f}")
print(f"Random Forest - Test R-squared: {test_r2:.4f}")

# Feature importance
importances = rf_model.feature_importances_

for name, imp in sorted(zip(X_train.columns, importances), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{name}: {imp:.4f}")

# --- Step 3: Predict on Test Data ---
test_data = pd.read_csv("/content/drive/MyDrive/AIML_UT/Colab_Files/Pi2_ML/W1_Linear/Hacky/Test_dataset.csv")
df_test = test_data.copy()
# Preprocess test data
df_test['City'] = df_test['City'].fillna('Other')
df_test['Cuisine'] = df_test['Cuisine'].fillna('Other')

test_Cuisine_1 =[]
test_Cuisine_2 = []
for cus in df_test['Cuisine']:
    list_cus1 = cus.split(',')
    test_Cuisine_1.append(list_cus1[0])
    test_Cuisine_2.append(list_cus1[1])

df_test['test_Cuisine_1'] = test_Cuisine_1
df_test['test_Cuisine_2'] = test_Cuisine_2


df_test['Restaurant Theme'] = df_test['Restaurant Theme'].fillna('Other')
for col in ['Facebook Popularity Quotient', 'Instagram Popularity Quotient', 'Overall Restaurant Rating', 'Ambience',
            'Fire Audit', 'Liquor License Obtained', 'Restaurant Zomato Rating', 'Order Wait Time',
            'Staff Responsivness', 'Value for Money', 'Hygiene Rating', 'Food Rating', 'Lively',
            'Service', 'Comfortablility', 'Privacy']:
    if col in df_test.columns:
        df_test[col] = df_test[col].fillna(df_test[col].median())
for col in ['Live Music Rating', 'Comedy Gigs Rating']:
    if col in df_test.columns:
        df_test[col] = df_test[col].fillna(0)
        df_test[f'Has_{col.split()[0]}_{col.split()[1]}'] = df_test[col].notna().astype(int)
#df_test['Opening Day of Restaurant'] = pd.to_datetime(df_test['Opening Day of Restaurant'], errors='coerce').fillna(df_train['Opening Day of Restaurant'].dropna().median())
#df_test['Opening Day of Restaurant'] = pd.to_datetime(df_test['Opening Day of Restaurant'], errors='coerce')
df_test['Opening Day of Restaurant'] = pd.to_datetime(df_test['Opening Day of Restaurant'], errors='coerce')
median_date1 = df_test['Opening Day of Restaurant'].dropna().median()
df_test['Opening Day of Restaurant'] = df_test['Opening Day of Restaurant'].fillna(median_date1)
reference_date1 = pd.to_datetime('2025-03-29')
df_test['Days_Since_Opening'] = (reference_date1 - df_test['Opening Day of Restaurant']).dt.days.astype(float)
df_test = df_test.drop(columns=['Opening Day of Restaurant'])

# Derived features
df_test['Overall_Rating_Squared'] = df_test['Overall Restaurant Rating'] ** 2
df_test['Ambience_Squared'] = df_test['Ambience'] ** 2
df_test['Social_Media_Interaction'] = df_test['Facebook Popularity Quotient'] * df_test['Instagram Popularity Quotient']
df_test['Entertainment_Score'] = df_test['Live Music Rating'] + df_test['Comedy Gigs Rating']

### Making cities names all lower cases
df_test['City'] = df_test['City'].str.strip()
df_test['City'] = df_test['City'].str.lower()
# Clean the 'Cities' column (replace 'Cities' with your actual column name)
df_test['City'] = df_test['City'].apply(lambda x: clean_cities(x, city_corrections))


df_test.drop(['Live Music Rating'], axis=1, inplace=True)
df_test.drop(['Comedy Gigs Rating'], axis=1, inplace=True)
df_test.drop(['Value Deals Rating'], axis=1, inplace=True)
df_test.drop(['Registration Number'], axis=1, inplace=True)
df_test.drop(['Resturant Tier'], axis=1, inplace=True)
df_test.drop(['Live Sports Rating'], axis=1, inplace=True)
df_test.drop(['Facebook Popularity Quotient', 'Instagram Popularity Quotient'], axis=1, inplace=True)
df_test.drop(['Overall Restaurant Rating', 'Ambience'], axis=1, inplace=True)
df_test.drop(['Cuisine'], axis=1, inplace=True)
#df_test.drop(['Has_Live_Music', 'Has_Comedy_Gigs'], axis=1, inplace=True)


# New numeric columns (assuming present)
for col in ['Fire Audit', 'Liquor License Obtained', 'Restaurant Zomato Rating', 'Order Wait Time', 'Staff Responsivness', 'Value for Money', 'Hygiene Rating', 'Food Rating', 'Lively', 'Service', 'Comfortablility', 'Privacy']:
    if col in df_test.columns:
        df_test[col] = df_test[col].fillna(df_test[col].median())

# Group rare categories (use training threshold)
for col in ['City', 'test_Cuisine_1', 'test_Cuisine_2', 'Restaurant Theme']:
    counts = df_test[col].value_counts()
    rare = counts[counts < threshold].index
    df_test[col] = df_test[col].apply(lambda x: x if x in counts and counts[x] >= threshold else 'Other_' + col)

# One-hot encode and align
df_test = pd.get_dummies(df_test, columns=['City','test_Cuisine_1', 'test_Cuisine_2', 'Restaurant Theme', 'Restaurant Type', 'Endoresed By', 'Restaurant Location'],
                         prefix=['City', 'test_Cuisine_1', 'test_Cuisine_2', 'Theme', 'Type', 'Endorsed', 'Location'], drop_first=True)
missing_cols = set(X_train.columns) - set(df_test.columns)
for col in missing_cols:
    df_test[col] = 0
df_test = df_test[X_train.columns]

# Predict
predictions = np.expm1(rf_model.predict(df_test))
print("Sample predictions:", predictions[:10])
print("Mean prediction:", predictions.mean())

# Save


solution_df = pd.DataFrame(test_data['Registration Number'])
solution_df['Annual Turnover'] = predictions
solution_df

import os
os.chdir('/content/drive/MyDrive/AIML_UT/Colab_Files/Pi2_ML/W1_Linear/Hacky')
# Get the current working directory
current_dir = os.getcwd()
print("Current working directory:", current_dir)
solution_df.to_csv('SubmissionMod3.csv',index=False)


Random Forest - Train RMSE: 15390621.25
Random Forest - Test RMSE: 19648321.40
Random Forest - Train R-squared: 0.4983
Random Forest - Test R-squared: 0.1509
Days_Since_Opening: 0.1397
Hygiene Rating: 0.1377
Social_Media_Interaction: 0.1369
Service: 0.0333
Entertainment_Score: 0.0332
Ambience_Squared: 0.0328
Comfortablility: 0.0292
Order Wait Time: 0.0282
Lively: 0.0268
Privacy: 0.0244
Sample predictions: [24248285.66351976 30604208.08673704 28375324.21075739 47191345.13633797
 39978513.0059566  30818611.00789102 35001107.44878983 26035370.09838893
 19596022.70696332 22421830.06485058]
Mean prediction: 26836990.514901232
Current working directory: /content/drive/MyDrive/AIML_UT/Colab_Files/Pi2_ML/W1_Linear/Hacky
