In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [2]:
data = pd.read_csv("data.csv")
print(data.head())

   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La, Ortigas, Mandaluyong City   
3      SM 

In [3]:
categorical_columns = data.select_dtypes(include=['object']).columns
numerical_columns = data.select_dtypes(include=['number']).columns

In [4]:
categorical_imputer = SimpleImputer(strategy='most_frequent')
numerical_imputer = SimpleImputer(strategy='mean')

In [5]:
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

In [6]:
scaler = StandardScaler()

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_imputer, numerical_columns),
        ('cat', categorical_imputer, categorical_columns)
    ], remainder='passthrough')

In [8]:
data_transformed = preprocessor.fit_transform(data)

In [9]:
numerical_data = scaler.fit_transform(data[numerical_columns])
categorical_data = one_hot_encoder.fit_transform(data[categorical_columns]).toarray()
preprocessed_data = np.hstack((numerical_data, categorical_data))

In [10]:
target_column = "Cuisines"
X = preprocessed_data
y = data[target_column]

In [11]:
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

In [12]:
unique_counts = pd.Series(y).value_counts()
print("Class distribution before handling rare classes:\n", unique_counts)

threshold = 2
rare_classes = unique_counts[unique_counts < threshold].index
data[target_column] = data[target_column].apply(lambda x: 'Other' if target_encoder.transform([x])[0] in rare_classes else x)

y = target_encoder.fit_transform(data[target_column])

unique_counts = pd.Series(y).value_counts()
print("Class distribution after handling rare classes:\n", unique_counts)

Class distribution before handling rare classes:
 1306    936
1329    511
497     354
828     354
1514    334
       ... 
225       1
1548      1
599       1
200       1
1110      1
Name: count, Length: 1826, dtype: int64
Class distribution after handling rare classes:
 471    1278
373     936
380     511
129     354
225     354
       ... 
314       2
73        2
110       2
540       2
489       2
Name: count, Length: 549, dtype: int64


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [15]:
min_class_samples = unique_counts.min()
if min_class_samples < 2:
    print("Warning: Some classes still have fewer than 2 samples, consider removing them entirely.")

In [16]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [17]:
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.83      0.83      0.83         6
           3       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         1
           9       1.00      0.33      0.50         3
          10       0.00      0.00      0.00         1
          12       1.00      0.50      0.67         4
          13       1.00      1.00      1.00         2
          14       0.93      1.00      0.96        13
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         2
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
