In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Future #

In [None]:
# Read the Future50 ranking table and preview its first rows.
FUTURE_CSV = '../input/restaurant-business-rankings-2020/Future50.csv'
future = pd.read_csv(FUTURE_CSV)
future.head()

## Transforming the data to create new variables ##

### The first thing to do is separate city and state into two new variables ###

In [None]:
# Split the Location strings into separate State and City columns, then drop Location.
# The guard keeps the cell safe to re-run after Location has already been removed.
if 'Location' in future.columns:
    # partition() always returns (before, separator, after) per row, so every row
    # keeps exactly one City and one State even if the comma is missing or repeated.
    # The previous list-split + .sum() flattened all rows into a single list and
    # only lined up when every Location contained exactly one ", ".
    location_parts = future['Location'].str.partition(',')
    # States lose every space, as before; later cells filter on values like 'N.Y.'.
    future['State'] = location_parts[2].str.replace(' ', '', regex=False)
    # Cities only lose surrounding whitespace so multi-word names stay readable
    # (removing all spaces turned e.g. "New York" into "NewYork").
    future['City'] = location_parts[0].str.strip()
    future = future.drop(columns='Location')

In [None]:
# Preview the frame again to check the result of the Location transformation.
future.head()

## Now we can analyze these two variables ##

In [None]:
# Number of rows per state (left panel) and per city (right panel).
# Both axes are created up front. The original drew plt.subplot() panels over the
# single axes made by plt.subplots(); recent matplotlib versions no longer remove
# that overlapped axes, so it stayed visible behind the panels.
counts_fig, (ax_states, ax_cities) = plt.subplots(1, 2, figsize=(25, 5))

future['State'].value_counts().plot(kind='bar', ax=ax_states).set_title('States')
future['City'].value_counts().plot(kind='bar', ax=ax_cities).set_title('Cities')

plt.show()

In [None]:
# Bar colours for the total-Sales charts: a group whose summed Sales is above the
# mean of all group totals is 'lightgreen', every other group is 'lightblue'.
# Each groupby total is computed once instead of being rebuilt on every loop pass.
state_sales_total = future.groupby('State').Sales.sum()
colors = np.where(
    state_sales_total > state_sales_total.mean(), 'lightgreen', 'lightblue'
).tolist()

city_sales_total = future.groupby('City').Sales.sum()
colors2 = np.where(
    city_sales_total > city_sales_total.mean(), 'lightgreen', 'lightblue'
).tolist()

In [None]:
# Summed Sales per city (left) and per state (right). The dashed red line marks the
# mean of the group totals; bar colours come from colors2 (cities) and colors (states).
# Explicit axes replace plt.subplot() panels layered over the axes from plt.subplots().
city_sales_total = future.groupby('City').Sales.sum()
state_sales_total = future.groupby('State').Sales.sum()

total_fig, (ax_city, ax_state) = plt.subplots(1, 2, figsize=(25, 5))

city_sales_total.plot(kind='bar', ax=ax_city, color=colors2, title='Absolute Sales by Cities')
ax_city.axhline(city_sales_total.mean(), ls='--', color='red', label='Mean')
ax_city.legend()

state_sales_total.plot(kind='bar', ax=ax_state, color=colors, title='Absolute Sales by State')
ax_state.axhline(state_sales_total.mean(), ls='--', color='red', label='Mean')
ax_state.legend()

plt.show()

In [None]:
# Bar colours for the mean-Sales charts: a group whose mean Sales is above the
# mean of all group means is 'lightgreen', every other group is 'lightblue'.
# Each groupby mean is computed once instead of being rebuilt on every loop pass.
state_sales_mean = future.groupby('State').Sales.mean()
colors3 = np.where(
    state_sales_mean > state_sales_mean.mean(), 'lightgreen', 'lightblue'
).tolist()

city_sales_mean = future.groupby('City').Sales.mean()
colors4 = np.where(
    city_sales_mean > city_sales_mean.mean(), 'lightgreen', 'lightblue'
).tolist()

In [None]:
# Mean Sales per city (left) and per state (right). The dashed red line marks the
# mean of the group means; bar colours come from colors4 (cities) and colors3 (states).
# Explicit axes replace plt.subplot() panels layered over the axes from plt.subplots().
city_sales_mean = future.groupby('City').Sales.mean()
state_sales_mean = future.groupby('State').Sales.mean()

mean_fig, (ax_city, ax_state) = plt.subplots(1, 2, figsize=(25, 5))

city_sales_mean.plot(kind='bar', ax=ax_city, color=colors4, title='Mean Sales by Cities')
ax_city.axhline(city_sales_mean.mean(), ls='--', color='red', label='Mean')
ax_city.legend()

state_sales_mean.plot(kind='bar', ax=ax_state, color=colors3, title='Mean Sales by State')
ax_state.axhline(state_sales_mean.mean(), ls='--', color='red', label='Mean')
ax_state.legend()

plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the Franchising labels as integers.
encoder = LabelEncoder()
future['Franchising'] = encoder.fit_transform(future['Franchising'])

# Strip the '%' sign from the year-over-year columns and convert them to float.
# Going through astype(str) keeps the cell re-runnable: the original x.replace()
# raised AttributeError once the columns already held floats.
for pct_col in ['YOY_Sales', 'YOY_Units']:
    future[pct_col] = (
        future[pct_col].astype(str).str.replace('%', '', regex=False).astype('float')
    )

In [None]:
# Preview the frame after encoding Franchising and converting the YOY columns.
future.head()

In [None]:
# Summed Sales per restaurant for rows with State 'N.Y.' (left) and 'Calif.' (right).
# The grid is created as 1x2 directly: the original asked plt.subplots() for four
# stacked axes and then drew plt.subplot(1,2,...) panels on top of them.
fig, axs = plt.subplots(1, 2, figsize=(25, 5))

future[future['State'] == 'N.Y.'].groupby('Restaurant').Sales.sum().plot(kind='bar', ax=axs[0]).set_title('Sales by Restaurant - State of New York')
future[future['State'] == 'Calif.'].groupby('Restaurant').Sales.sum().plot(kind='bar', ax=axs[1]).set_title('Sales by Restaurant - State of California')

# plt.show() renders under the inline notebook backend, where fig.show() only warns.
plt.show()

In [None]:
# Summed Sales per restaurant for rows with State 'Ohio' (left) and 'Texas' (right).
# The grid is created as 1x2 directly: the original asked plt.subplots() for four
# stacked axes and then drew plt.subplot(1,2,...) panels on top of them.
fig, axs = plt.subplots(1, 2, figsize=(25, 5))

future[future['State'] == 'Ohio'].groupby('Restaurant').Sales.sum().plot(kind='bar', ax=axs[0]).set_title('Sales by Restaurant - State of Ohio')
# Title corrected: this panel filters on 'Texas' but was labelled "State of California".
future[future['State'] == 'Texas'].groupby('Restaurant').Sales.sum().plot(kind='bar', ax=axs[1]).set_title('Sales by Restaurant - State of Texas')

# plt.show() renders under the inline notebook backend, where fig.show() only warns.
plt.show()

# Independence #

In [None]:
# Read the Independence100 ranking table and preview its first rows.
INDEPENDENCE_CSV = '../input/restaurant-business-rankings-2020/Independence100.csv'
independence = pd.read_csv(INDEPENDENCE_CSV)
independence.head()

In [None]:
# Count the missing values in each column of the independence frame.
independence.isnull().sum()

In [None]:
# Bar colours for the (City, State) mean-Sales chart: 'lightgreen' when a group's
# mean Sales is above the mean of all group means, 'lightblue' otherwise.
# The groupby mean is computed once instead of being rebuilt on every loop pass.
city_state_sales_mean = independence.groupby(['City','State']).Sales.mean()
colors5 = np.where(
    city_state_sales_mean > city_state_sales_mean.mean(), 'lightgreen', 'lightblue'
).tolist()

In [None]:
# Mean Sales for every (City, State) pair, coloured with colors5, plus a dashed
# red line at the mean of those group means.
city_state_sales_mean = independence.groupby(['City','State']).Sales.mean()

city_state_fig, city_state_ax = plt.subplots(figsize=(25,5))
city_state_sales_mean.plot(kind='bar', ax=city_state_ax, color=colors5, ylabel='Sales')
city_state_ax.set_title("Mean Sales")
city_state_ax.axhline(city_state_sales_mean.mean(), ls='--', color='red')
plt.show()

## Now, to create a machine learning model, I will analyze the correlation between the variables ##

In [None]:
# Sales against Rank (left, with a dashed red line at mean Sales) and Sales against
# 'Meals Served' (right).
# Explicit axes replace plt.subplot() panels layered over the axes from plt.subplots().
scatter_fig, (ax_rank, ax_meals) = plt.subplots(1, 2, figsize=(14, 7))

sns.scatterplot(x=independence.Sales, y=independence.Rank, ax=ax_rank)
ax_rank.axvline(independence.Sales.mean(), ls='--', color='red', label='Mean')
ax_rank.legend()

sns.scatterplot(x=independence.Sales, y=independence['Meals Served'], ax=ax_meals)

plt.show()

In [None]:
# Correlation heatmap over the numeric columns only. DataFrame.corr() in pandas 2.x
# no longer skips non-numeric columns and raises on them instead, so they are
# filtered out explicitly (presumably Restaurant/City/State are text — confirm).
sns.heatmap(independence.select_dtypes(include='number').corr(), annot=True)

### To create the machine learning model to predict Sales, I will use RandomForestRegressor ###

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Target is Sales; features are every column except Restaurant, Sales, City and State.
y = independence['Sales']
x = independence.drop(columns=['Restaurant','Sales','City','State'])

In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Seed the forest as well: without random_state the fitted trees differ between
# runs, so the metrics below were not reproducible even with a fixed split.
forest = RandomForestRegressor(random_state=0)
forest.fit(x_train, y_train)

predicao = forest.predict(x_test)

print('MAE:', mean_absolute_error(y_test, predicao))
print('MSE:', mean_squared_error(y_test, predicao))
print("R²:", r2_score(y_test, predicao))

In [None]:
# Compare the mean of the held-out Sales values with the mean of the predictions.
true_mean = y_test.mean()
predicted_mean = predicao.mean()

print('True values:', true_mean)
print("Prediction values:", predicted_mean)
print('Mean difference:', true_mean - predicted_mean)

# Top #

In [None]:
# Read the Top250 ranking table and preview its first rows.
TOP_CSV = '../input/restaurant-business-rankings-2020/Top250.csv'
top = pd.read_csv(TOP_CSV)
top.head()

In [None]:
# Dimensions of the top frame as (rows, columns).
top.shape

In [None]:
# Count the missing values in each column of the top frame.
top.isnull().sum()

In [None]:
# Remove the Content, Headquarters and Restaurant columns from the top frame.
unused_columns = ['Content','Headquarters','Restaurant']
top = top.drop(columns=unused_columns)

In [None]:
# Frequency of each Segment_Category value.
top['Segment_Category'].value_counts()

In [None]:
# Sales against Rank for all rows (left) and for the 'Varied Menu' segment only
# (right); each dashed red line marks the mean Sales of the rows in that panel.
# Explicit axes replace plt.subplot() panels layered over the axes from plt.subplots().
varied_menu = top[top['Segment_Category'] == 'Varied Menu']

rank_fig, (ax_all, ax_varied) = plt.subplots(1, 2, figsize=(14, 7))

sns.scatterplot(x=top['Sales'], y=top['Rank'], ax=ax_all).set_title('Sales')
ax_all.axvline(top['Sales'].mean(), ls='--', color='red')

sns.scatterplot(x=varied_menu.Sales, y=varied_menu.Rank, ax=ax_varied).set_title('Sales by Varied Menu Segment Category')
ax_varied.axvline(varied_menu.Sales.mean(), ls='--', color='red')

# The original called fig.show() on a `fig` left over from an earlier cell (and
# undefined if that cell was not run); show the figure built here instead.
plt.show()

In [None]:
# Strip the '%' sign from the year-over-year columns and convert them to float.
# Going through astype(str) keeps the cell re-runnable: the original x.replace()
# raised AttributeError once the columns already held floats.
for pct_col in ['YOY_Sales', 'YOY_Units']:
    top[pct_col] = (
        top[pct_col].astype(str).str.replace('%', '', regex=False).astype('float')
    )

In [None]:
# Preview the frame after converting the YOY columns to float.
top.head()

In [None]:
# Correlation heatmap over the numeric columns only. Segment_Category still holds
# text labels at this point, and DataFrame.corr() in pandas 2.x raises on
# non-numeric columns instead of skipping them.
sns.heatmap(top.select_dtypes(include='number').corr(), annot=True)

In [None]:
# One-hot encode Segment_Category and replace the original column with the
# indicator columns, appended on the right.
dummie = pd.get_dummies(top['Segment_Category'])
top = pd.concat((top.drop(columns="Segment_Category"), dummie), axis=1)

In [None]:
# Preview the frame after replacing Segment_Category with indicator columns.
top.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Target is Sales; features are all remaining columns of the top frame.
y = top['Sales']
x = top.drop(columns='Sales')

In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Seed the forest as well: without random_state the fitted trees differ between
# runs, so the metrics below were not reproducible even with a fixed split.
forest = RandomForestRegressor(random_state=0)
forest.fit(x_train, y_train)

predicao2 = forest.predict(x_test)

print("MSE:", mean_squared_error(y_test, predicao2))
print("MAE:", mean_absolute_error(y_test, predicao2))
print("R²:", r2_score(y_test, predicao2))

In [None]:
# Compare the mean of the held-out Sales values with the mean of the predictions.
true_mean = y_test.mean()
predicted_mean = predicao2.mean()

print('True value:', true_mean)
print('Prediction:', predicted_mean)