In [51]:
import pandas as pd
import numpy as np

In [52]:
chipotle = pd.read_csv('chipotle.tsv', sep='\t')
chipotle

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75


## 1. Missing Values:


In [53]:
chipotle[chipotle.isna()]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
...,...,...,...,...,...
4617,,,,,
4618,,,,,
4619,,,,,
4620,,,,,


In [54]:
chipotle.loc[chipotle.choice_description.isna(), 'choice_description']

0       NaN
3       NaN
6       NaN
10      NaN
14      NaN
       ... 
4600    NaN
4605    NaN
4613    NaN
4614    NaN
4616    NaN
Name: choice_description, Length: 1246, dtype: object

We are not sure what to impute in missing values so we will remove the misssing values in `choice_description` column

In [55]:
chipotle.dropna(subset='choice_description', inplace=True)

## 2. Data Types:

In [56]:
chipotle.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3376 entries, 1 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            3376 non-null   int64 
 1   quantity            3376 non-null   int64 
 2   item_name           3376 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          3376 non-null   object
dtypes: int64(2), object(3)
memory usage: 158.2+ KB


Converting `order_id` col. to string

In [57]:
chipotle.order_id = chipotle.order_id.astype('str')

## 3. Duplicated Entries:

In [58]:
chipotle.duplicated().value_counts()

False    3335
True       41
Name: count, dtype: int64

We have 41 no. o duplicated entries

In [59]:
chipotle[chipotle.duplicated()]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
238,103,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Rice, Black Beans...",$11.75
248,108,1,Canned Soda,[Mountain Dew],$1.09
297,129,1,Steak Burrito,"[Tomatillo Green Chili Salsa, [Rice, Cheese, G...",$11.75
381,165,1,Canned Soft Drink,[Coke],$1.25
484,205,1,Chicken Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...",$8.75
567,233,1,Canned Soft Drink,[Diet Coke],$1.25
684,282,1,Canned Soft Drink,[Coke],$1.25
773,319,1,Chicken Bowl,"[Tomatillo-Green Chili Salsa (Medium), Black B...",$8.49
908,376,1,Steak Burrito,"[Roasted Chili Corn Salsa (Medium), [Rice, Faj...",$8.99
924,381,1,Chicken Soft Tacos,"[Tomatillo-Red Chili Salsa (Hot), Cheese]",$8.49


Dropping duplicated rows

In [60]:
chipotle.drop_duplicates(inplace=True)

## 4. Quantity and Item Price:

In [76]:
chipotle.quantity.unique()

array([1, 2, 3, 4], dtype=int64)

In [87]:
chipotle.item_price.unique()

array(['$3.39 ', '$16.98 ', '$10.98 ', '$11.75 ', '$9.25 ', '$8.75 ',
       '$11.25 ', '$8.49 ', '$2.18 ', '$8.99 ', '$1.09 ', '$22.50 ',
       '$11.48 ', '$17.98 ', '$17.50 ', '$1.25 ', '$23.78 ', '$6.49 ',
       '$11.08 ', '$22.16 ', '$32.94 ', '$22.20 ', '$10.58 ', '$2.50 ',
       '$23.50 ', '$7.40 ', '$18.50 ', '$6.78 ', '$11.89 ', '$9.39 ',
       '$3.75 ', '$8.69 ', '$8.19 ', '$35.00 ', '$27.75 ', '$26.25 ',
       '$21.96 ', '$4.36 ', '$22.96 ', '$26.07 ', '$12.98 ', '$35.25 ',
       '$33.75 ', '$16.38 ', '$5.00 ', '$8.50 ', '$11.49 '], dtype=object)

## 5. Choice Description:

In [88]:
chipotle[['choice_description']]

Unnamed: 0,choice_description
1,[Clementine]
2,[Apple]
4,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans..."
5,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou..."
7,"[Tomatillo Red Chili Salsa, [Fajita Vegetables..."
...,...
4617,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ..."
4618,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese..."
4619,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto..."
4620,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu..."


In [90]:
chipotle.choice_description.nunique()

1043

## 6. Handling Special Characters:

In [91]:
chipotle.item_name.unique()

array(['Izze', 'Nantucket Nectar', 'Chicken Bowl', 'Steak Burrito',
       'Steak Soft Tacos', 'Chicken Crispy Tacos', 'Chicken Soft Tacos',
       'Chicken Burrito', 'Canned Soda', 'Barbacoa Burrito',
       'Carnitas Burrito', 'Carnitas Bowl', 'Barbacoa Bowl',
       'Chicken Salad Bowl', 'Steak Bowl', 'Barbacoa Soft Tacos',
       'Veggie Burrito', 'Veggie Bowl', 'Steak Crispy Tacos',
       'Barbacoa Crispy Tacos', 'Veggie Salad Bowl',
       'Carnitas Soft Tacos', 'Chicken Salad', 'Canned Soft Drink',
       'Steak Salad Bowl', '6 Pack Soft Drink', 'Bowl', 'Burrito',
       'Crispy Tacos', 'Carnitas Crispy Tacos', 'Steak Salad',
       'Veggie Soft Tacos', 'Carnitas Salad Bowl', 'Barbacoa Salad Bowl',
       'Salad', 'Veggie Crispy Tacos', 'Veggie Salad', 'Carnitas Salad'],
      dtype=object)

Replacing square brackets with empty string

In [161]:
chipotle.choice_description = chipotle.choice_description.str.replace('[','')
chipotle.choice_description = chipotle.choice_description.str.replace(']','')
chipotle.choice_description.unique()

array(['Clementine', 'Apple',
       'Tomatillo-Red Chili Salsa (Hot), Black Beans, Rice, Cheese, Sour Cream',
       ...,
       'Roasted Chili Corn Salsa, Pinto Beans, Sour Cream, Cheese, Lettuce, Guacamole',
       'Tomatillo Green Chili Salsa, Rice, Black Beans',
       'Tomatillo Green Chili Salsa, Rice, Fajita Vegetables, Black Beans, Guacamole'],
      dtype=object)

Converting `item_price` to a float data type and it represents price in dollars

In [177]:
chipotle.item_price = chipotle.item_price.str[1:].astype('float')

## 7. Order Id Integrity:

Everything looks fine with `order_id`

In [165]:
chipotle[['order_id']]

Unnamed: 0,order_id
1,1
2,1
4,2
5,3
7,4
...,...
4617,1833
4618,1833
4619,1834
4620,1834


## 8. Item Name Standardization:

In [172]:
pd.Series(chipotle.item_name.unique()).sort_values()

25        6 Pack Soft Drink
12            Barbacoa Bowl
9          Barbacoa Burrito
19    Barbacoa Crispy Tacos
33      Barbacoa Salad Bowl
15      Barbacoa Soft Tacos
26                     Bowl
27                  Burrito
8               Canned Soda
23        Canned Soft Drink
11            Carnitas Bowl
10         Carnitas Burrito
29    Carnitas Crispy Tacos
37           Carnitas Salad
32      Carnitas Salad Bowl
21      Carnitas Soft Tacos
2              Chicken Bowl
7           Chicken Burrito
5      Chicken Crispy Tacos
22            Chicken Salad
13       Chicken Salad Bowl
6        Chicken Soft Tacos
28             Crispy Tacos
0                      Izze
1          Nantucket Nectar
34                    Salad
14               Steak Bowl
3             Steak Burrito
18       Steak Crispy Tacos
30              Steak Salad
24         Steak Salad Bowl
4          Steak Soft Tacos
17              Veggie Bowl
16           Veggie Burrito
35      Veggie Crispy Tacos
36             Veggi

## 9. Quantity and Price Relationships:

In [178]:
chipotle[['quantity', 'item_price']]

Unnamed: 0,quantity,item_price
1,1,3.39
2,1,3.39
4,2,16.98
5,1,10.98
7,1,11.75
...,...,...
4617,1,11.75
4618,1,11.75
4619,1,11.25
4620,1,8.75


In [180]:
chipotle.groupby('quantity').agg({'item_price': ['min', 'mean', 'median', 'max', 'count']})

Unnamed: 0_level_0,item_price,item_price,item_price,item_price,item_price
Unnamed: 0_level_1,min,mean,median,max,count
quantity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,1.09,8.71634,8.99,11.89,3150
2,2.18,14.137407,17.5,23.78,162
3,3.75,18.277059,26.07,35.25,17
4,4.36,14.893333,5.0,35.0,6


**From above output we can analyze**:
1. The minimum price is 1.09 dollars per quantity of a item
2. The maximum price is 35.25 dollars per 3 quantity of a item
3. The maximum no. of orders is of single quantity which is total of 3150
4. The miinimum no. of orders is lowest when together ordering 4 quantities of a single item
5. There is no significant difference in average price for the different quantities

## 10. Data Integrity Check:

In [185]:
chipotle.reset_index(drop=True, inplace=True)
chipotle

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Izze,Clementine,3.39
1,1,1,Nantucket Nectar,Apple,3.39
2,2,2,Chicken Bowl,"Tomatillo-Red Chili Salsa (Hot), Black Beans, ...",16.98
3,3,1,Chicken Bowl,"Fresh Tomato Salsa (Mild), Rice, Cheese, Sour ...",10.98
4,4,1,Steak Burrito,"Tomatillo Red Chili Salsa, Fajita Vegetables, ...",11.75
...,...,...,...,...,...
3330,1833,1,Steak Burrito,"Fresh Tomato Salsa, Rice, Black Beans, Sour Cr...",11.75
3331,1833,1,Steak Burrito,"Fresh Tomato Salsa, Rice, Sour Cream, Cheese, ...",11.75
3332,1834,1,Chicken Salad Bowl,"Fresh Tomato Salsa, Fajita Vegetables, Pinto B...",11.25
3333,1834,1,Chicken Salad Bowl,"Fresh Tomato Salsa, Fajita Vegetables, Lettuce",8.75


## 11. Converting to CSV:

In [186]:
chipotle.to_csv('finalized_chipotle_dataset.csv')

## 12. Handling Categorical Data:

In [188]:
chipotle.item_name.unique()

array(['Izze', 'Nantucket Nectar', 'Chicken Bowl', 'Steak Burrito',
       'Steak Soft Tacos', 'Chicken Crispy Tacos', 'Chicken Soft Tacos',
       'Chicken Burrito', 'Canned Soda', 'Barbacoa Burrito',
       'Carnitas Burrito', 'Carnitas Bowl', 'Barbacoa Bowl',
       'Chicken Salad Bowl', 'Steak Bowl', 'Barbacoa Soft Tacos',
       'Veggie Burrito', 'Veggie Bowl', 'Steak Crispy Tacos',
       'Barbacoa Crispy Tacos', 'Veggie Salad Bowl',
       'Carnitas Soft Tacos', 'Chicken Salad', 'Canned Soft Drink',
       'Steak Salad Bowl', '6 Pack Soft Drink', 'Bowl', 'Burrito',
       'Crispy Tacos', 'Carnitas Crispy Tacos', 'Steak Salad',
       'Veggie Soft Tacos', 'Carnitas Salad Bowl', 'Barbacoa Salad Bowl',
       'Salad', 'Veggie Crispy Tacos', 'Veggie Salad', 'Carnitas Salad'],
      dtype=object)

In [190]:
item_map = {'6 Pack Soft Drink': 'Soft Drink Pack',
       'Barbacoa Bowl': 'Barbacoa',
       'Barbacoa Burrito': 'Barbacoa',
       'Bowl': 'Bowl',
       'Burrito': 'Burrito',
       'Canned Soda': 'Canned',
       'Canned Soft Drink': 'Canned',
       'Carnitas Bowl': 'Carnitas',
       'Carnitas Burrito': 'Carnitas',
       'Carnitas Crispy Tacos': 'Carnitas',
       'Carnitas Salad': 'Carnitas',
       'Carnitas Salad Bowl': 'Carnitas',
       'Carnitas Soft Tacos': 'Carnitas',
       'Chicken Bowl': 'Chicken',
       'Chicken Burrito': 'Chicken',
       'Chicken Crispy Tacos': 'Chicken',
       'Chicken Salad': 'Chicken',
       'Chicken Salad Bowl': 'Chicken',
       'Chicken Soft Tacos': 'Chicken',
       'Izze': 'Izze',
       'Nantucket Nectar': 'Nantucket',
       'Salad': 'Salad',
       'Steak Bowl': 'Steak',
       'Steak Burrito': 'Steak',
       'Steak Crispy Tacos': 'Steak',
       'Steak Salad': 'Steak',
       'Steak Salad Bowl': 'Steak',
       'Steak Soft Tacos': 'Steak',
       'Veggie Bowl': 'Veggie',
       'Veggie Burrito': 'Veggie',
       'Veggie Crispy Tacos': 'Veggie',
       'Veggie Salad': 'Veggie',
       'Veggie Salad Bowl': 'Veggie',
       'Veggie Soft Tacos': 'Veggie'}

In [194]:
chipotle = chipotle.assign(item_type = chipotle.item_name.map(item_map))
chipotle

## 13. Consistent Quantity and Price Units:

Already `quantity` is in int64 datatype and `item_price` is in float datatype (represented in dollars)

---
# END OF PROJECT