In [22]:
import pandas as pd
import numpy as np

In [23]:
df = pd.DataFrame({"numbers": ["#23", "#24", "#18", "#14", "#12", "#10", "#35"],
                   "nums": ["23", "24", "18", "14", np.nan, "XYZ", "35"],
                   "colors": ["green", "red", "yellow", "orange", "purple", "blue", "pink"],
                   "other_column": [0, 1, 0, 2, 1, 0, 2]})

In [24]:
df["numbers_str"] = df["numbers"].str.replace("#", "")
df

Unnamed: 0,numbers,nums,colors,other_column,numbers_str
0,#23,23,green,0,23
1,#24,24,red,1,24
2,#18,18,yellow,0,18
3,#14,14,orange,2,14
4,#12,,purple,1,12
5,#10,XYZ,blue,0,10
6,#35,35,pink,2,35


## Exercise 1
Convert the string below into a number.



In [1]:
c2n = "#39"
int(c2n.replace("#", ""))

39

## Exercise 2
Make a new column called colors_upper that contains the elements of colors with all uppercase letters.

In [25]:
df['colors_upper'] = df["colors"].str.capitalize()
df

Unnamed: 0,numbers,nums,colors,other_column,numbers_str,colors_upper
0,#23,23,green,0,23,Green
1,#24,24,red,1,24,Red
2,#18,18,yellow,0,18,Yellow
3,#14,14,orange,2,14,Orange
4,#12,,purple,1,12,Purple
5,#10,XYZ,blue,0,10,Blue
6,#35,35,pink,2,35,Pink


## Exercise 3
Convert the column "nums" to a numeric type using pd.to_numeric and save it to the DataFrame as "nums_tonumeric".

Notice that there is a missing value, and a value that is not a number.

Look at the documentation for pd.to_numeric and think about how to overcome this.

Think about why this could be a bad idea of used without knowing what your data looks like. (Think about what happens when you apply it to the "numbers" column before replacing the "#".)

In [27]:
df.dtypes


numbers            object
nums               object
colors             object
other_column        int64
numbers_str        object
colors_upper       object
nums_tonumeric    float64
dtype: object

In [26]:
df['nums_tonumeric'] = pd.to_numeric(df['nums'], errors='coerce')
df

Unnamed: 0,numbers,nums,colors,other_column,numbers_str,colors_upper,nums_tonumeric
0,#23,23,green,0,23,Green,23.0
1,#24,24,red,1,24,Red,24.0
2,#18,18,yellow,0,18,Yellow,18.0
3,#14,14,orange,2,14,Orange,14.0
4,#12,,purple,1,12,Purple,
5,#10,XYZ,blue,0,10,Blue,
6,#35,35,pink,2,35,Pink,35.0


Applying pd.to_numeric to the "numbers" column without first removing the "#" symbol would result in a ValueError, because the values in the column are not numeric.

It's a bad idea to use pd.to_numeric without knowing what your data looks like, because it can result in unexpected behavior. For example, if there are invalid entries in the data, they will be replaced with NaN values, which can affect the results of any operations performed on the DataFrame. Additionally, if the data has a different data type than expected, the conversion may not work as desired, leading to unexpected results.

## Exercise 4


In [7]:
url = "https://datascience.quantecon.org/assets/data/chipotle_raw.csv.zip"
chipotle = pd.read_csv(url)
chipotle.head()

Unnamed: 0.1,Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,1,Izze,[Clementine],$3.39
2,2,1,1,Nantucket Nectar,[Apple],$3.39
3,3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


'[Diet Coke]'

We’d like you to use this data to answer the following questions.

What is the average price of an item with chicken?

What is the average price of an item with steak?

Did chicken or steak produce more revenue (total)?

How many missing items are there in this dataset? How many missing items in each column?

* *Hint*: Before you will be able to do any of these things you will need to make sure the item_price column has a numeric dtype (probably float).

To answer these questions, we need to do a few preprocessing steps:


In [8]:
# 1. Clean the "item_price" column to make sure it has a numeric data type:
chipotle['item_price'] = pd.to_numeric(chipotle['item_price'].str.replace('$', ''), errors='coerce')



  chipotle['item_price'] = pd.to_numeric(chipotle['item_price'].str.replace('$', ''), errors='coerce')


In [16]:
# 2. Create a new DataFrame that only contains rows with "chicken" or "steak" in the "item_name" column:
chipotle_meat = chipotle[(chipotle['item_name'] == 'Chicken Bowl') | (chipotle['item_name'] == 'Steak Bowl')]
chipotle_meat

Unnamed: 0.1,Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
4,4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98
5,5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98
13,13,7,1,Chicken Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...",11.25
19,19,10,1,Chicken Bowl,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",8.75
26,26,13,1,Chicken Bowl,"[Roasted Chili Corn Salsa (Medium), [Pinto Bea...",8.49
...,...,...,...,...,...,...
4590,4590,1825,1,Chicken Bowl,"[Roasted Chili Corn Salsa, [Rice, Black Beans,...",11.25
4591,4591,1825,1,Chicken Bowl,"[Tomatillo Red Chili Salsa, [Rice, Black Beans...",8.75
4595,4595,1826,1,Chicken Bowl,"[Tomatillo Green Chili Salsa, [Rice, Black Bea...",8.75
4599,4599,1827,1,Chicken Bowl,"[Roasted Chili Corn Salsa, [Cheese, Lettuce]]",8.75


In [17]:
# 3. Calculate the average price of an item with chicken:
avg_price_chicken = chipotle_meat[chipotle_meat['item_name'] == 'Chicken Bowl']['item_price'].mean()
avg_price_chicken

10.113953168044079

In [18]:
# 4. Calculate the average price of an item with steak:
avg_price_steak = chipotle_meat[chipotle_meat['item_name'] == 'Steak Bowl']['item_price'].mean()
avg_price_steak

10.711800947867296

In [62]:
# 5. Calculate the total revenue generated by chicken and steak:
total_revenue_chicken = (chipotle_meat[chipotle_meat['item_name'] == 'Chicken Bowl']['item_price']).sum()
total_revenue_steak = (chipotle_meat[chipotle_meat['item_name'] == 'Steak Bowl']['item_price']).sum()
print('steak revenue: ',total_revenue_steak)
print('Chicken revenue: ', total_revenue_chicken)
print('Chicken made more revenue!')

steak revenue:  2260.1899999999996
Chicken revenue:  7342.7300000000005
Chicken made more revenue!


In [20]:
# 6. Find the number of missing values in the dataset:
missing_values = chipotle.isna().sum().sum()
missing_values

1246

In [21]:
# 7. Find the number of missing values in each column:
missing_values_column = chipotle.isna().sum()
missing_values_column

Unnamed: 0               0
order_id                 0
quantity                 0
item_name                0
choice_description    1246
item_price               0
dtype: int64