# Task for Today  

***

## Michelin Restaurant Star Prediction  

Given *data about Michelin starred restaurants*, let's try to predict the **number of stars** of a given restaurant.  
  
We will train logistic regression models with several values of the regularization parameter `C` and compare their accuracy on a held-out test set.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [None]:
# Each star tier is stored in its own CSV file; read all three into separate frames
one_star_df, two_star_df, three_star_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/michelin-restaurants/one-star-michelin-restaurants.csv',
        '../input/michelin-restaurants/two-stars-michelin-restaurants.csv',
        '../input/michelin-restaurants/three-stars-michelin-restaurants.csv',
    )
)

In [None]:
# Preview the first rows of the one-star table
one_star_df.head()

In [None]:
# Preview the first rows of the two-star table
two_star_df.head()

In [None]:
# Preview the first rows of the three-star table
three_star_df.head()

# Preprocessing

In [None]:
# Encode the target as a zero-based class label:
# 0 = one star, 1 = two stars, 2 = three stars
one_star_df['stars'] = 0
two_star_df['stars'] = 1
three_star_df['stars'] = 2

# Stack the three tiers and shuffle the rows. The shuffle is seeded so that
# the row order (and therefore the later train/test split and all reported
# accuracies) is identical on every Restart & Run All.
combined_df = (
    pd.concat([one_star_df, two_star_df, three_star_df], axis=0)
    .sample(frac=1.0, random_state=1)
    .reset_index(drop=True)
)

In [None]:
# Display the combined, shuffled table (all three tiers plus the 'stars' label)
combined_df

In [None]:
# Separate the feature columns from the target ('stars' label)
X = combined_df.drop(columns='stars')
y = combined_df['stars'].copy()

## Unneeded Columns

In [None]:
# Remove the columns that will not be used as model features
X = X.drop(columns=['name', 'zipCode', 'url'])

In [None]:
# Display the remaining feature columns
X

## Missing Values

In [None]:
# Count missing values per column
X.isna().sum()

In [None]:
# Frequency of each distinct price value (missing values are not counted)
X['price'].value_counts()

In [None]:
# Replace missing prices with the most frequent price value
most_common_price = X['price'].mode().iloc[0]
X['price'] = X['price'].fillna(most_common_price)

In [None]:
# Re-count missing values per column after filling 'price'
X.isna().sum()

## Encoding

In [None]:
# List the distinct values of every object-typed (text) column
{
    name: list(X[name].unique())
    for name, dtype in X.dtypes.items()
    if dtype == 'object'
}

In [None]:
# Price levels are ordered, so encode each one as its position in this list
# (0 for '$' up to 4 for '$$$$$')
price_ordering = ['$', '$$', '$$$', '$$$$', '$$$$$']

X['price'] = X['price'].apply(price_ordering.index)

In [None]:
# Display the features after ordinal-encoding 'price'
X

In [None]:
# Remove a trailing " - <digits>" suffix (zip code) from the city column.
# The vectorized .str accessor skips missing values on its own, so no
# string-based NaN check is needed.
X['city'] = X['city'].str.replace(r' - \d+$', '', regex=True)

In [None]:
def onehot_encode(df, columns, prefixes):
    """Return a copy of ``df`` with the given columns replaced by indicator columns.

    Each column in ``columns`` is expanded with ``pd.get_dummies`` using the
    prefix at the same position in ``prefixes``. The untouched columns keep
    their original order and the indicator columns are appended after them,
    grouped in the order the source columns were listed. ``df`` itself is
    not modified.
    """
    pairs = list(zip(columns, prefixes))
    indicator_frames = [
        pd.get_dummies(df[name], prefix=tag) for name, tag in pairs
    ]
    remaining = df.drop([name for name, _ in pairs], axis=1)
    return pd.concat([remaining, *indicator_frames], axis=1)

In [None]:
# One-hot encode the nominal text columns, tagging each group with a short prefix
X = onehot_encode(X, columns=['city', 'region', 'cuisine'], prefixes=['CI', 'RE', 'CU'])

In [None]:
# Display the fully encoded feature table
X

## Scaling/Splitting

In [None]:
# Split BEFORE scaling so the scaler never sees the test rows; fitting it on
# the full dataset would leak test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=40)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training

In [None]:
# Fit one logistic regression per value of C; models[k] corresponds to Cs[k]
Cs = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
models = [
    LogisticRegression(C=strength).fit(X_train, y_train)
    for strength in Cs
]

# Results

In [None]:
# Score every fitted model on the held-out test set
model_acc = [model.score(X_test, y_test) for model in models]

# Pair each C with its own score so that a label can never be printed next to
# another model's accuracy (the hand-written prints showed the scores of
# C=0.1 and C=1.0 under the C=10.0 and C=100.0 labels).
labels = [f"Model Accuracy (C={C}):" for C in Cs]
width = max(map(len, labels), default=0)

# Right-align the labels so the accuracy values line up in one column
for label, acc in zip(labels, model_acc):
    print(label.rjust(width), acc)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/Nm8FO8_yHUI