In [None]:
# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# read the order data from the CSV file into the DataFrame `df`, which the cells below work on
df = pd.read_csv('foodhub_order.csv')

# display the first 5 rows as a quick look at the columns and values
df.head()


In [None]:
# Number of rows and columns, returned as a (rows, columns) tuple
df.shape


In [None]:
(1898, 9)

In [None]:
# Use info() to print a concise summary of the DataFrame:
# each column's name, non-null count and dtype, plus memory usage
df.info()


In [None]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB


In [None]:
#column-level summary: names, non-null counts and dtypes
df.info()

#count the missing (null) entries in each column; a value of 0 means the column is complete
missing_data = df.isna().sum()
print(missing_data)


In [None]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB
order_id                 0
customer_id              0
restaurant_name          0
cuisine_type             0
cost_of_the_order        0
day_of_the_week          0
rating                   0
food_preparation_time    0
delivery_time            0
dtype: int64


In [None]:
#descriptive statistics for every column (numeric and non-numeric alike);
#transposed so each original column becomes one row, which is easier to read
df.describe(include='all').transpose()


In [None]:
#summary of the rating column on its own: count, number of unique values,
#most frequent value and how often it occurs
df['rating'].describe()


In [None]:
count          1898
unique            4
top       Not given
freq            736
Name: rating, dtype: object

In [None]:
#one univariate plot per column of interest:
#column name -> (seaborn plotting function, extra keyword arguments, rotate the x tick labels?)
univariate_plots = {
    #bar graph of cuisine types to see how frequently each one appears
    'cuisine_type': (sns.countplot, {}, True),
    #histogram (with density curve) to see which prices are most common
    'cost_of_the_order': (sns.histplot, {'kde': True}, True),
    #bar graph to see the popularity of weekday vs. weekend orders
    'day_of_the_week': (sns.countplot, {}, False),
    #histogram to see what the most common rating is
    'rating': (sns.histplot, {}, False),
    #histogram to get a better idea of the common preparation times
    'food_preparation_time': (sns.histplot, {}, False),
    #box plot to see common delivery times and how skewed they are
    'delivery_time': (sns.boxplot, {}, False),
}

#walk the columns in dataset order so the figures appear in the same order as the columns;
#columns without an entry in the table are skipped
for column in df.columns:
    if column not in univariate_plots:
        continue
    plot_function, extra_options, rotate_labels = univariate_plots[column]
    plot_function(data=df, x=column, **extra_options)
    if rotate_labels:
        plt.xticks(rotation=90)
    plt.show()


In [None]:
#number of orders per restaurant, most ordered-from first
orders_per_restaurant = df['restaurant_name'].value_counts()

#the 5 restaurants with the most orders
orders_per_restaurant.iloc[:5]


In [None]:
Shake Shack                  219
The Meatball Shop            132
Blue Ribbon Sushi            119
Blue Ribbon Fried Chicken     96
Parm                          68
Name: restaurant_name, dtype: int64

In [None]:
#keep only the orders placed on a weekend
is_weekend = df['day_of_the_week'].eq('Weekend')
df_weekend_popular = df.loc[is_weekend]

#order count per cuisine on weekends, most popular first
df_weekend_popular['cuisine_type'].value_counts()


In [None]:
American          415
Japanese          335
Italian           207
Chinese           163
Mexican            53
Indian             49
Mediterranean      32
Middle Eastern     32
Thai               15
French             13
Korean             11
Southern           11
Spanish            11
Vietnamese          4
Name: cuisine_type, dtype: int64

In [None]:
#total number of orders (one row per order)
total = len(df)

#number of orders that cost more than $20
above_20 = df['cost_of_the_order'].gt(20).sum()

#share of those orders among all orders, as a percentage rounded to two decimals
percentage_above_20 = round(above_20 / total * 100, 2)
print('The percentage above $20:', str(percentage_above_20) +'%')


In [None]:
The percentage above $20: 29.24%


In [None]:
#average delivery time over all orders, rounded to two decimals for reporting
average_delivery_time = df['delivery_time'].mean()
mean_delivery = round(average_delivery_time, 2)
print('The mean delivery time is:', mean_delivery, 'minutes')


In [None]:
The mean delivery time is: 24.16 minutes


In [None]:
#number of orders placed by each customer, most frequent customers first
orders_per_customer = df['customer_id'].value_counts()

#the 3 customers with the most orders
orders_per_customer.iloc[:3]


In [None]:
52832    13
47440    10
83287     9
Name: customer_id, dtype: int64

In [None]:
#comparing cuisine type vs. the cost of the order
sns.boxplot(data=df, x='cuisine_type', y='cost_of_the_order')
plt.xticks(rotation=90);
plt.show()

#ratings are stored as text and include 'Not given', so the rating plots need a numeric version.
#Build it as a separate frame instead of overwriting df: dropping the unrated orders from df itself
#would silently remove them from every later calculation that uses df (revenue, delivery times, ...)
df_rated = (
    df.assign(rating=pd.to_numeric(df['rating'], errors='coerce'))  #non-numeric ratings become NaN
      .dropna(subset=['rating'])                                    #keep only orders that were rated
)

#comparing the cuisine type vs. rating (rated orders only)
sns.pointplot(data=df_rated, x='cuisine_type', y='rating')
plt.xticks(rotation=90)
plt.show()

#comparing the rating vs. prep time (rated orders only)
sns.pointplot(data=df_rated, x='food_preparation_time', y='rating')
plt.show()

#comparing day of the week vs. delivery time (all orders)
sns.boxplot(data=df, x='day_of_the_week', y='delivery_time')
plt.show()

# comparing the rating vs. cost of the order (rated orders only)
sns.boxplot(data=df_rated, x='rating', y='cost_of_the_order')
plt.show()


In [None]:
#work on a copy that leaves out the orders whose rating is 'Not given'
was_rated = df['rating'].ne('Not given')
df_rate = df.loc[was_rated].copy()

#turn the rating column into numbers; anything that cannot be converted becomes NaN
df_rate['rating'] = pd.to_numeric(df_rate['rating'], errors='coerce')

#discard the rows whose rating could not be converted
df_rate = df_rate.dropna(subset=['rating'])

#ratings grouped per restaurant, shared by the two aggregations below
ratings_by_restaurant = df_rate.groupby('restaurant_name')['rating']

#number of ratings per restaurant, as a DataFrame with a 'rating_count' column
rating_count = ratings_by_restaurant.count().reset_index(name='rating_count')

#average rating per restaurant, as a DataFrame with an 'average_rating' column
average_rating = ratings_by_restaurant.mean().reset_index(name='average_rating')

#one row per restaurant holding both the rating count and the average rating
summary = pd.merge(rating_count, average_rating, on='restaurant_name')

#restaurants with more than 50 ratings and an average rating above 4
has_enough_ratings = summary['rating_count'].gt(50)
has_high_average = summary['average_rating'].gt(4)
eligible_restaurants = summary.loc[has_enough_ratings & has_high_average]

print(eligible_restaurants)


In [None]:
               restaurant_name  rating_count  average_rating
16   Blue Ribbon Fried Chicken            64        4.328125
17           Blue Ribbon Sushi            73        4.219178
117                Shake Shack           133        4.278195
132          The Meatball Shop            84        4.511905


In [None]:
#commission charged on a single order
def revenue(i):
    """Return the revenue earned on one order whose cost is ``i`` dollars.

    Orders costing more than $20 are charged 25%, orders costing more than $5
    (up to and including $20) are charged 15%, and all other orders earn nothing.
    """
    commission_rate = 0.25 if i > 20 else 0.15 if i > 5 else 0
    return i * commission_rate

#revenue earned on each individual order, using the revenue() function defined above
revenue_per_order = df['cost_of_the_order'].map(revenue)

#total revenue over all orders (note: this rebinds `total`, which an earlier cell used for the order count)
total = revenue_per_order.sum()
print('The total revenue generated is:',round(total,2), 'dollars')
        


In [None]:
The total revenue generated is: 3865.57 dollars


In [None]:
#number of orders in the data (one row per order)
total_orders = len(df)
#time from placing the order to delivery: preparation time plus delivery time (stored as a new column)
df['total_time'] = df['food_preparation_time'].add(df['delivery_time'])
#number of orders whose total time is longer than 60 minutes
df_greater_60 = df['total_time'].gt(60).sum()
#those orders as a percentage of all orders
percent_greater_60 = df_greater_60 / total_orders * 100
print('The percent that takes longer than 60 minutes is:',str(round(percent_greater_60,2))+'%')


In [None]:
The percent that takes longer than 60 minutes is: 10.24%


In [None]:
#mean delivery time for each part of the week, kept at full precision for the comparison below
weekday_mean_exact = df.loc[df['day_of_the_week'] == 'Weekday', 'delivery_time'].mean()
weekend_mean_exact = df.loc[df['day_of_the_week'] == 'Weekend', 'delivery_time'].mean()

#mean of the weekday delivery time, rounded to whole minutes for reporting
weekday_mean = round(weekday_mean_exact)
print('The mean delivery time on the weekday is', weekday_mean,'minutes')

#mean of the weekend delivery time, rounded to whole minutes for reporting
weekend_mean = round(weekend_mean_exact)
print('The mean delivery time on the weekend is', weekend_mean,'minutes')

#percent difference between the weekday and weekend delivery time, relative to the average of the two.
#It is computed from the unrounded means: using the whole-minute values would carry their rounding
#error into the percentage. Only the final figure is rounded (two decimals) for display.
mean_of_both = (weekday_mean_exact + weekend_mean_exact) / 2
percent_difference = round((weekday_mean_exact - weekend_mean_exact) / mean_of_both * 100, 2)
print('The percent difference between the two is', str(percent_difference)+'%')


In [None]:
The mean delivery time on the weekday is 28 minutes
The mean delivery time on the weekend is 22 minutes
The percent difference between the two is 24.0%
