## Getting and Knowing your Data

#### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

#### Step 2. Import the dataset from this address.

In [75]:
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'
chipo_df = pd.read_csv(url, sep='\t')
chipo_df.head(5)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


#### Step 3. What is the number of observations in the dataset?

In [4]:
chipo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


In [8]:
print(chipo_df.shape)
print("Rows: ",chipo_df.shape[0])
print("Columns: ", chipo_df.shape[1])

(4622, 5)
Rows:  4622
Columns:  5


#### Step 4. Print the name of all the columns.

In [9]:
chipo_df.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

#### Step 5. How is the dataset indexed?

In [10]:
chipo_df.index

RangeIndex(start=0, stop=4622, step=1)

#### Step 6. Which was the most-ordered item?

In [33]:
(chipo_df.groupby('item_name')['quantity'].sum())[:10]

item_name
6 Pack Soft Drink         55
Barbacoa Bowl             66
Barbacoa Burrito          91
Barbacoa Crispy Tacos     12
Barbacoa Salad Bowl       10
Barbacoa Soft Tacos       25
Bottled Water            211
Bowl                       4
Burrito                    6
Canned Soda              126
Name: quantity, dtype: int64

In [28]:
most_ordered = chipo_df.groupby('item_name').sum().sort_values(['quantity'], ascending=False)
most_ordered[:1]

Unnamed: 0_level_0,order_id,quantity
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicken Bowl,713926,761


#### Step 7. What was the most ordered item in the choice_description column?

In [34]:
choice_desc = chipo_df.groupby('choice_description').sum().sort_values(['quantity'], ascending=False)
choice_desc[:5]

Unnamed: 0_level_0,order_id,quantity
choice_description,Unnamed: 1_level_1,Unnamed: 2_level_1
[Diet Coke],123455,159
[Coke],122752,143
[Sprite],80426,89
"[Fresh Tomato Salsa, [Rice, Black Beans, Cheese, Sour Cream, Lettuce]]",43088,49
"[Fresh Tomato Salsa, [Rice, Black Beans, Cheese, Sour Cream]]",36041,42


#### Step 8. How many items were orderd in total?

In [37]:
total_items_ordered = chipo_df['quantity'].sum()
total_items_ordered

4972

#### Step 9. Turn the item price into a float
![title](slice.png)

In [58]:
# remove dollar symbol
remove_dollar = [x[1:] for x in chipo_df.item_price]
print(remove_dollar[:5])


['2.39 ', '3.39 ', '3.39 ', '2.39 ', '16.98 ']


In [72]:
len(remove_dollar)

4622

In [59]:
chipo_df.item_price = remove_dollar
chipo_df.info()
chipo_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39


In [61]:
type_df = chipo_df.astype({'item_price':float})
type_df.item_price.dtype

dtype('float64')

##### Step 9a.  Change datatype of column Using LAMDA function

In [76]:
dollarizer = lambda x: float(x[1:-1]) # not considering 0th char
chipo_df.item_price = chipo_df.item_price.apply(dollarizer)
print(chipo_df.item_price.dtype)
print(chipo_df.shape)

float64
(4622, 5)


#### Step 10. How much was the revenue for the period in the dataset?

In [80]:
revenue = (chipo_df['quantity'] * chipo_df['item_price']).sum()
print('Revenue was: $' + str(np.round(revenue,2)))

Revenue was: $39237.02


#### Step 11. How many orders were made in the period?

In [69]:
orders = chipo_df['order_id'].value_counts().count()
print("Orders: ", orders)

Orders:  1834


#### Step 12. What is the average revenue amount per order?

In [81]:
# solution 1:
average_revenue = (chipo_df['quantity'] * chipo_df['item_price']).sum()

print('Average revenue was: $' + str(np.round(average_revenue,5)))

Average revenue was: $39237.02


In [85]:
chipo_df['revenue'] = chipo_df['quantity'] * chipo_df['item_price']
order_grouped = chipo_df.groupby(by=['order_id']).sum()
# order_grouped.mean()['revenue']
order_grouped['revenue'].mean()

21.39423118865867

In [86]:
chipo_df.groupby(by=['order_id']).sum().mean()['revenue']

21.39423118865867

#### Step 13. How many different items are sold?

In [88]:
chipo_df['item_name'].value_counts().count()

50

## Step 14. For each item_name, what is mean/average item price?

In [95]:
chipo_df.groupby('item_name').item_price.mean()

item_name
6 Pack Soft Drink                         6.610185
Barbacoa Bowl                            10.187273
Barbacoa Burrito                          9.832418
Barbacoa Crispy Tacos                    10.928182
Barbacoa Salad Bowl                      10.640000
Barbacoa Soft Tacos                      10.018400
Bottled Water                             1.867654
Bowl                                     14.800000
Burrito                                   7.400000
Canned Soda                               1.320577
Canned Soft Drink                         1.457641
Carnitas Bowl                            10.833971
Carnitas Burrito                         10.132712
Carnitas Crispy Tacos                    11.137143
Carnitas Salad                            8.990000
Carnitas Salad Bowl                      11.056667
Carnitas Soft Tacos                       9.398500
Chicken Bowl                             10.113953
Chicken Burrito                          10.082857
Chicken Crispy Tacos 