# Pandas DataFrames Exercises

### For several of the following exercises, you'll need to load several datasets using the pydataset library. 
- (If you get an error when trying to run the import below, use pip to install the pydataset package.)

In [1]:
from pydataset import data

initiated datasets repo at: /Users/rachel/.pydataset/


### - When the instructions say to load a dataset, you can pass the name of the dataset as a string to the data function to load the dataset. 
### - You can also view the documentation for the data set by passing the show_doc keyword argument.

In [3]:
# data('mpg', show_doc=True) # view the documentation for the dataset
mpg = data('mpg') # load the dataset and store it in a variable

### -All the datasets loaded from the pydataset library will be pandas dataframes.

## 1. Copy the code from the lesson to create a dataframe full of student grades.

In [178]:
import pandas as pd
import numpy as np

np.random.seed(123)

students = ['Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John', 'Thomas',
            'Marie', 'Albert', 'Richard', 'Isaac', 'Alan']

# randomly generate scores for each student for each subject
# note that all the values need to have the same length here
math_grades = np.random.randint(low=60, high=100, size=len(students))
english_grades = np.random.randint(low=60, high=100, size=len(students))
reading_grades = np.random.randint(low=60, high=100, size=len(students))

df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades})

type(df)

pandas.core.frame.DataFrame

In [179]:
df.head()

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98


In [180]:
df.shape

(12, 4)

In [181]:
df.info

<bound method DataFrame.info of        name  math  english  reading
0     Sally    62       85       80
1      Jane    88       79       67
2     Suzie    94       74       95
3     Billy    98       96       88
4       Ada    77       92       98
5      John    79       76       93
6    Thomas    82       64       81
7     Marie    93       63       90
8    Albert    92       62       87
9   Richard    69       80       94
10    Isaac    92       99       93
11     Alan    92       62       72>

In [182]:
df.describe

<bound method NDFrame.describe of        name  math  english  reading
0     Sally    62       85       80
1      Jane    88       79       67
2     Suzie    94       74       95
3     Billy    98       96       88
4       Ada    77       92       98
5      John    79       76       93
6    Thomas    82       64       81
7     Marie    93       63       90
8    Albert    92       62       87
9   Richard    69       80       94
10    Isaac    92       99       93
11     Alan    92       62       72>

## 1a. Create a column named passing_english that indicates whether each student has a passing grade in english.

In [183]:
df.english >= 70

0      True
1      True
2      True
3      True
4      True
5      True
6     False
7     False
8     False
9      True
10     True
11    False
Name: english, dtype: bool

In [184]:
df['passing_english'] = df.english >= 70

In [185]:
df.head()

Unnamed: 0,name,math,english,reading,passing_english
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True


In [186]:
df['passing_english'].sum()

8

In [187]:
(df['passing_english'] == False).sum()

4

In [188]:
df.passing_english.mean()

0.6666666666666666

## 1b. Sort the english grades by the passing_english column. How are duplicates handled?

In [189]:
df.sort_values(by = 'passing_english')

# Duplicates (62 in english) are sorted by ascending index order (8 forst, then 11)

Unnamed: 0,name,math,english,reading,passing_english
6,Thomas,82,64,81,False
7,Marie,93,63,90,False
8,Albert,92,62,87,False
11,Alan,92,62,72,False
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True


## 1c. Sort the english grades first by passing_english and then by student name. 
- All the students that are failing english should be first, 
- and within the students that are failing english they should be ordered alphabetically. 
- The same should be true for the students passing english. 
- (Hint: you can pass a list to the .sort_values method)

In [190]:
df.sort_values(by = ['passing_english', 'name'])

Unnamed: 0,name,math,english,reading,passing_english
11,Alan,92,62,72,False
8,Albert,92,62,87,False
7,Marie,93,63,90,False
6,Thomas,82,64,81,False
4,Ada,77,92,98,True
3,Billy,98,96,88,True
10,Isaac,92,99,93,True
1,Jane,88,79,67,True
5,John,79,76,93,True
9,Richard,69,80,94,True


In [191]:
df.sort_values(by = ['passing_english', 'name'], ascending = [False, True])

Unnamed: 0,name,math,english,reading,passing_english
4,Ada,77,92,98,True
3,Billy,98,96,88,True
10,Isaac,92,99,93,True
1,Jane,88,79,67,True
5,John,79,76,93,True
9,Richard,69,80,94,True
0,Sally,62,85,80,True
2,Suzie,94,74,95,True
11,Alan,92,62,72,False
8,Albert,92,62,87,False


## 1d. Sort the english grades first by passing_english, and then by the actual english grade, similar to how we did in the last step.

In [32]:
df.sort_values(by = ['passing_english', 'english'], ascending = [False, False])

Unnamed: 0,name,math,english,reading,passing_english,overall_grade
10,Isaac,92,99,93,True,95
3,Billy,98,96,88,True,94
4,Ada,77,92,98,True,89
0,Sally,62,85,80,True,76
9,Richard,69,80,94,True,81
1,Jane,88,79,67,True,78
5,John,79,76,93,True,83
2,Suzie,94,74,95,True,88
6,Thomas,82,64,81,False,76
7,Marie,93,63,90,False,82


## 1e. Calculate each students overall grade 
- and add it as a column on the dataframe. 
- The overall grade is the average of the math, english, and reading grades.

In [42]:
df['overall_grade'] = round((df.math + df.english + df.reading) / 3, 0).astype(int)

df.sort_values(by = 'overall_grade', ascending = [False])

Unnamed: 0,name,math,english,reading,passing_english,overall_grade
10,Isaac,92,99,93,True,95
3,Billy,98,96,88,True,94
4,Ada,77,92,98,True,89
2,Suzie,94,74,95,True,88
5,John,79,76,93,True,83
7,Marie,93,63,90,False,82
9,Richard,69,80,94,True,81
8,Albert,92,62,87,False,80
1,Jane,88,79,67,True,78
0,Sally,62,85,80,True,76


## Load the mpg dataset. 
- Read the documentation for the dataset and use it for the following questions:

In [45]:
from pydataset import data

In [192]:
data('mpg', show_doc=True)

mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




In [193]:
mpg = data('mpg')
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


## How many rows and columns are there?

In [48]:
mpg.shape

#234 rows, 11 columns

(234, 11)

In [203]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,highway,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


## What are the data types of each column?

In [204]:
mpg.dtypes

manufacturer     object
model            object
displ           float64
year              int64
cyl               int64
trans            object
drv              object
cty               int64
highway           int64
fl               object
class            object
dtype: object

## Summarize the dataframe with .info and .describe

In [54]:
mpg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 1 to 234
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  234 non-null    object 
 1   model         234 non-null    object 
 2   displ         234 non-null    float64
 3   year          234 non-null    int64  
 4   cyl           234 non-null    int64  
 5   trans         234 non-null    object 
 6   drv           234 non-null    object 
 7   cty           234 non-null    int64  
 8   hwy           234 non-null    int64  
 9   fl            234 non-null    object 
 10  class         234 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 21.9+ KB


In [55]:
mpg.describe()

Unnamed: 0,displ,year,cyl,cty,hwy
count,234.0,234.0,234.0,234.0,234.0
mean,3.471795,2003.5,5.888889,16.858974,23.440171
std,1.291959,4.509646,1.611534,4.255946,5.954643
min,1.6,1999.0,4.0,9.0,12.0
25%,2.4,1999.0,4.0,14.0,18.0
50%,3.3,2003.5,6.0,17.0,24.0
75%,4.6,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


## Rename the cty column to city.

In [205]:
mpg.rename(columns = {'cty': 'city'}, inplace = True)

In [206]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


## Rename the hwy column to highway.

In [207]:
mpg.rename(columns = {'hwy': 'highway'}, inplace = True)

In [208]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


## Do any cars have better city mileage than highway mileage?

In [209]:
mpg.city > mpg.highway

1      False
2      False
3      False
4      False
5      False
       ...  
230    False
231    False
232    False
233    False
234    False
Length: 234, dtype: bool

In [210]:
mpg_city_higher_than_highway = mpg[mpg.city > mpg.highway]

In [211]:
mpg_city_higher_than_highway 

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class


## Create a column named mileage_difference this column should contain the difference between highway and city mileage for each car.

In [212]:
mpg['mileage_difference'] = mpg.highway - mpg.city

In [213]:
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10
...,...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize,9
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize,8
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize,10
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize,8


## Which car (or cars) has the highest mileage difference?

In [214]:
mpg.sort_values (by = 'mileage_difference', ascending = False).head(2)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
107,honda,civic,1.8,2008,4,auto(l5),f,24,36,c,subcompact,12
223,volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact,12


## Which compact class car has the lowest highway mileage? The best?

In [215]:
mpg['class'] == 'compact'

# Class is a reserved word, so we had to use the bracket quote syntax

1       True
2       True
3       True
4       True
5       True
       ...  
230    False
231    False
232    False
233    False
234    False
Name: class, Length: 234, dtype: bool

In [216]:
bool_series = mpg['class'] == 'compact'
bool_series.head()

1    True
2    True
3    True
4    True
5    True
Name: class, dtype: bool

In [217]:
compacts = mpg[bool_series]
compacts.shape

(47, 12)

In [218]:
compacts.nsmallest(1, 'highway', keep='all')

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
220,volkswagen,jetta,2.8,1999,6,auto(l4),f,16,23,r,compact,7


## Create a column named average_mileage that is the mean of the city and highway mileage.

In [156]:
mpg['average_mileage'] = (mpg.highway + mpg.city) / 2

In [157]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference,average_mileage
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11,23.5
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8,25.0
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11,25.5
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9,25.5
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10,21.0


## Which dodge car has the best average mileage? The worst?

In [159]:
mpg_dodge = mpg[mpg.manufacturer == 'dodge']
mpg_dodge.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference,average_mileage
38,dodge,caravan 2wd,2.4,1999,4,auto(l3),f,18,24,r,minivan,6,21.0
39,dodge,caravan 2wd,3.0,1999,6,auto(l4),f,17,24,r,minivan,7,20.5
40,dodge,caravan 2wd,3.3,1999,6,auto(l4),f,16,22,r,minivan,6,19.0
41,dodge,caravan 2wd,3.3,1999,6,auto(l4),f,16,22,r,minivan,6,19.0
42,dodge,caravan 2wd,3.3,2008,6,auto(l4),f,17,24,r,minivan,7,20.5


In [219]:
mpg_dodge[mpg_dodge.average_mileage == mpg_dodge.average_mileage.max()]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference,average_mileage
38,dodge,caravan 2wd,2.4,1999,4,auto(l3),f,18,24,r,minivan,6,21.0


In [220]:
mpg_dodge[mpg_dodge.average_mileage == mpg_dodge.average_mileage.min()]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference,average_mileage
55,dodge,dakota pickup 4wd,4.7,2008,8,auto(l5),4,9,12,e,pickup,3,10.5
60,dodge,durango 4wd,4.7,2008,8,auto(l5),4,9,12,e,suv,3,10.5
66,dodge,ram 1500 pickup 4wd,4.7,2008,8,auto(l5),4,9,12,e,pickup,3,10.5
70,dodge,ram 1500 pickup 4wd,4.7,2008,8,manual(m6),4,9,12,e,pickup,3,10.5


## Load the Mammals dataset. Read the documentation for it, and use the data to answer these questions:

In [115]:
from pydataset import data

In [116]:
mammals = data('Mammals')

In [117]:
data('Mammals', show_doc=True)

Mammals

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Garland(1983) Data on Running Speed of Mammals

### Description

Observations on the maximal running speed of mammal species and their body
mass.

### Usage

    data(Mammals)

### Format

A data frame with 107 observations on the following 4 variables.

weight

Body mass in Kg for "typical adult sizes"

speed

Maximal running speed (fastest sprint velocity on record)

hoppers

logical variable indicating animals that ambulate by hopping, e.g. kangaroos

specials

logical variable indicating special animals with "lifestyles in which speed
does not figure as an important factor": Hippopotamus, raccoon (Procyon),
badger (Meles), coati (Nasua), skunk (Mephitis), man (Homo), porcupine
(Erithizon), oppossum (didelphis), and sloth (Bradypus)

### Details

Used by Chappell (1989) and Koenker, Ng and Portnoy (1994) to illustrate the
fitting of piecewise linear curves.

### Source

Garland, T. (

## How many rows and columns are there?

In [118]:
mammals.shape

#107 rows, 4 columns

(107, 4)

## What are the data types?

In [119]:
mammals.dtypes

weight      float64
speed       float64
hoppers        bool
specials       bool
dtype: object

## Summarize the dataframe with .info and .describe

In [120]:
mammals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107 entries, 1 to 107
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   weight    107 non-null    float64
 1   speed     107 non-null    float64
 2   hoppers   107 non-null    bool   
 3   specials  107 non-null    bool   
dtypes: bool(2), float64(2)
memory usage: 2.7 KB


In [121]:
mammals.describe()

Unnamed: 0,weight,speed
count,107.0,107.0
mean,278.688178,46.208411
std,839.608269,26.716778
min,0.016,1.6
25%,1.7,22.5
50%,34.0,48.0
75%,142.5,65.0
max,6000.0,110.0


In [125]:
mammals.head()

Unnamed: 0,weight,speed,hoppers,specials
1,6000.0,35.0,False,False
2,4000.0,26.0,False,False
3,3000.0,25.0,False,False
4,1400.0,45.0,False,False
5,400.0,70.0,False,False


## What is the the weight of the fastest animal?

In [128]:
[mammals.speed == mammals.speed.max()]

[1      False
 2      False
 3      False
 4      False
 5      False
 6      False
 7      False
 8      False
 9      False
 10     False
 11     False
 12     False
 13     False
 14     False
 15     False
 16     False
 17     False
 18     False
 19     False
 20     False
 21     False
 22     False
 23     False
 24     False
 25     False
 26     False
 27     False
 28     False
 29     False
 30     False
 31     False
 32     False
 33     False
 34     False
 35     False
 36     False
 37     False
 38     False
 39     False
 40     False
 41     False
 42     False
 43     False
 44     False
 45     False
 46     False
 47     False
 48     False
 49     False
 50     False
 51     False
 52     False
 53      True
 54     False
 55     False
 56     False
 57     False
 58     False
 59     False
 60     False
 61     False
 62     False
 63     False
 64     False
 65     False
 66     False
 67     False
 68     False
 69     False
 70     False
 71     False
 72   

In [129]:
mammals[mammals.speed == mammals.speed.max()]

Unnamed: 0,weight,speed,hoppers,specials
53,55.0,110.0,False,False


In [130]:
mammals[mammals.speed == mammals.speed.max()].weight

53    55.0
Name: weight, dtype: float64

## What is the overal percentage of specials?

In [132]:
mammals.head()

Unnamed: 0,weight,speed,hoppers,specials
1,6000.0,35.0,False,False
2,4000.0,26.0,False,False
3,3000.0,25.0,False,False
4,1400.0,45.0,False,False
5,400.0,70.0,False,False


In [134]:
mammals.specials.sum()

10

In [136]:
len(mammals)

107

In [153]:
round(mammals.specials.sum() / len(mammals) * 100,2)

9.35

## How many animals are hoppers that are above the median speed? What percentage is this?

In [221]:
mammals.head()

Unnamed: 0,weight,speed,hoppers,specials
1,6000.0,35.0,False,False
2,4000.0,26.0,False,False
3,3000.0,25.0,False,False
4,1400.0,45.0,False,False
5,400.0,70.0,False,False


In [223]:
median_speed = mammals.speed.median()
median_speed

48.0

In [225]:
[(mammals.hoppers == True) & (mammals.speed > median_speed)]

[1      False
 2      False
 3      False
 4      False
 5      False
 6      False
 7      False
 8      False
 9      False
 10     False
 11     False
 12     False
 13     False
 14     False
 15     False
 16     False
 17     False
 18     False
 19     False
 20     False
 21     False
 22     False
 23     False
 24     False
 25     False
 26     False
 27     False
 28     False
 29     False
 30     False
 31     False
 32     False
 33     False
 34     False
 35     False
 36     False
 37     False
 38     False
 39     False
 40     False
 41     False
 42     False
 43     False
 44     False
 45     False
 46     False
 47     False
 48     False
 49     False
 50     False
 51     False
 52     False
 53     False
 54     False
 55     False
 56     False
 57     False
 58     False
 59     False
 60     False
 61     False
 62     False
 63     False
 64     False
 65     False
 66     False
 67     False
 68     False
 69     False
 70     False
 71     False
 72   

In [227]:
mammals[(mammals.hoppers == True) & (mammals.speed > median_speed)]

Unnamed: 0,weight,speed,hoppers,specials
96,4.6,64.0,True,False
97,4.4,72.0,True,False
98,4.0,72.0,True,False
99,3.5,56.0,True,False
100,2.0,64.0,True,False
101,1.9,56.0,True,False
102,1.5,50.0,True,False


In [154]:
number_above = len(mammals[(mammals.hoppers == True) & (mammals.speed > mammals.speed.median())]) / len(mammals)

round(number_above * 100, 2)


6.54