In [90]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np

In [91]:
data = pd.read_csv('./ms_annual_data_2022.csv')

# filter data
columns_to_include = ['restaurant', 'calories', 'total_fat', 'cholesterol', 'sodium', 'carbohydrates', 'dietary_fiber','sugar', 'protein']
data = data[columns_to_include]
data = data.dropna()
data.to_csv('output.csv', index=False)

# have to reset the index after dropping the null rows or else next cell will error
data = data.reset_index(drop=True)
print(data)

       restaurant calories  total_fat cholesterol sodium carbohydrates  \
0      Applebee's      370       22.0          55   1250            26   
1      Applebee's      220       12.0          25   1270            22   
2      Applebee's      280       15.0          35    930            26   
3      Applebee's      130        7.0          10    230            14   
4      Applebee's     1560      103.0          65   1610           120   
...           ...      ...        ...         ...    ...           ...   
23725     Zaxby's      890       60.0         360   2910             9   
23726     Zaxby's     1150       62.0         315   6540            48   
23727     Zaxby's      680       33.0         100   2570            49   
23728     Zaxby's      780       45.0         115   3180            47   
23729     Zaxby's      160        0.0           0    150            40   

      dietary_fiber sugar protein  
0                 2     9      16  
1                 2     9       5  
2  

  data = pd.read_csv('./ms_annual_data_2022.csv')


In [92]:
def clean_monetary_value(data):
    if isinstance(data, str):
        cleaned_data = data.replace('$', '').strip()
        cleaned_data = clean_string_with_comma(cleaned_data)
        return float(cleaned_data)
    return data
    
def clean_string_with_comma(data):
    if isinstance(data, str):
        cleaned_data = data.replace(',', '')
        return float(cleaned_data)
    return data

def clean_percentage(data):
    if isinstance(data, str):
        return float(data.strip('%'))
    return data

def clean_restaurant_name(data):
    return data.replace('–', '')


In [93]:
# clean data

data = data.apply(clean_percentage)
data = data.apply(clean_monetary_value)
data = data.apply(clean_string_with_comma)
data = data.apply(clean_restaurant_name)

In [94]:
print(data.columns)

X = data.drop(columns=['restaurant'])
y = data['restaurant']
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=1 )
knn = KNeighborsClassifier(n_neighbors=30)

knn.fit(x_train,y_train)
knn.predict(x_test)


Index(['restaurant', 'calories', 'total_fat', 'cholesterol', 'sodium',
       'carbohydrates', 'dietary_fiber', 'sugar', 'protein'],
      dtype='object')


ValueError: could not convert string to float: ''

In [None]:

# visualization
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

clf = Pipeline(
    steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=11))]
)

from sklearn.inspection import DecisionBoundaryDisplay

_, axs = plt.subplots(ncols=2, figsize=(12, 5))

for ax, weights in zip(axs, ("uniform", "distance")):
    clf.set_params(knn__weights=weights).fit(x_train, y_train)
    disp = DecisionBoundaryDisplay.from_estimator(
        clf,
        x_test,
        response_method="predict",
        plot_method="pcolormesh",
        xlabel=data.feature_names[0],
        ylabel=data.feature_names[1],
        shading="auto",
        alpha=0.5,
        ax=ax,
    )
    scatter = disp.ax_.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors="k")
    disp.ax_.legend(
        scatter.legend_elements()[0],
        data.target_names,
        loc="lower left",
        title="Classes",
    )
    _ = disp.ax_.set_title(
        f"3-Class classification\n(k={clf[-1].n_neighbors}, weights={weights!r})"
    )

plt.show()