# Exploring a DataFrame

In [2]:
import pandas as pd

In [3]:
# Column-oriented sample data: each keyword is a column label and each list
# holds that column's ten values in row order.
dogs_dict = dict(
    breed=[
        'Labrador', 'Poodle', 'Chow Chow', 'Schnauzer', 'Labrador',
        'Chihuahua', 'Poodle', 'Chihuahua', 'Labrador', 'Labrador',
    ],
    color=[
        'Chocolate', 'White', 'Brown', 'Gray', 'Black',
        'Brown', 'White', 'Black', 'Yellow', 'Black',
    ],
    name=[
        'Buddy', 'Lucy', 'Cooper', 'Riley', 'Bear',
        'Bella', 'Daisy', 'Lola', 'Max', 'Stella',
    ],
    height_cm=[56, 43, 46, 49, 56, 18, 43, 18, 59, 56],
    weight_kg=[25, 6, 23, 17, 29, 2, 7, 2, 29, 29],
    # Birth dates are kept as ISO-8601 strings rather than datetime values.
    date_of_birth=[
        '2013-07-01', '2016-10-11', '2011-10-12', '2014-09-01', '2017-01-20',
        '2015-04-20', '2017-01-20', '2015-08-25', '2017-01-20', '2017-10-05',
    ],
)

# One row per dog; the columns follow the insertion order of the dict keys.
dogs = pd.DataFrame(data=dogs_dict)

In [4]:
dogs.head(n=5)  # preview the first five rows

Unnamed: 0,breed,color,name,height_cm,weight_kg,date_of_birth
0,Labrador,Chocolate,Buddy,56,25,2013-07-01
1,Poodle,White,Lucy,43,6,2016-10-11
2,Chow Chow,Brown,Cooper,46,23,2011-10-12
3,Schnauzer,Gray,Riley,49,17,2014-09-01
4,Labrador,Black,Bear,56,29,2017-01-20


In [5]:
# Prints a summary of the frame: the index range, each column's non-null
# count and dtype, and the memory usage.
dogs.info() # shows the data types of each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   breed          10 non-null     object
 1   color          10 non-null     object
 2   name           10 non-null     object
 3   height_cm      10 non-null     int64 
 4   weight_kg      10 non-null     int64 
 5   date_of_birth  10 non-null     object
dtypes: int64(2), object(4)
memory usage: 608.0+ bytes


In [6]:
dogs.shape # tuple of (number of rows, number of columns); an attribute, so no parentheses

(10, 6)

In [7]:
dogs.describe() # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,height_cm,weight_kg
count,10.0,10.0
mean,44.4,16.9
std,15.049548,11.56095
min,18.0,2.0
25%,43.0,6.25
50%,47.5,20.0
75%,56.0,28.0
max,59.0,29.0


In [8]:
# to_numpy() is the accessor pandas recommends over the older `.values`
# attribute. The columns mix strings and integers, so the resulting 2-D
# NumPy array has dtype=object.
dogs.to_numpy()

array([['Labrador', 'Chocolate', 'Buddy', 56, 25, '2013-07-01'],
       ['Poodle', 'White', 'Lucy', 43, 6, '2016-10-11'],
       ['Chow Chow', 'Brown', 'Cooper', 46, 23, '2011-10-12'],
       ['Schnauzer', 'Gray', 'Riley', 49, 17, '2014-09-01'],
       ['Labrador', 'Black', 'Bear', 56, 29, '2017-01-20'],
       ['Chihuahua', 'Brown', 'Bella', 18, 2, '2015-04-20'],
       ['Poodle', 'White', 'Daisy', 43, 7, '2017-01-20'],
       ['Chihuahua', 'Black', 'Lola', 18, 2, '2015-08-25'],
       ['Labrador', 'Yellow', 'Max', 59, 29, '2017-01-20'],
       ['Labrador', 'Black', 'Stella', 56, 29, '2017-10-05']],
      dtype=object)

In [9]:
dogs.columns # returns a pandas Index holding the column labels (not a plain Python list)

Index(['breed', 'color', 'name', 'height_cm', 'weight_kg', 'date_of_birth'], dtype='object')

In [10]:
dogs.index # returns the row labels as a pandas index object (here a RangeIndex), not a plain Python list

RangeIndex(start=0, stop=10, step=1)

# Sorting

In [11]:
# Order the rows from lightest to heaviest; ascending order is the default.
dogs.sort_values(by="weight_kg")

Unnamed: 0,breed,color,name,height_cm,weight_kg,date_of_birth
5,Chihuahua,Brown,Bella,18,2,2015-04-20
7,Chihuahua,Black,Lola,18,2,2015-08-25
1,Poodle,White,Lucy,43,6,2016-10-11
6,Poodle,White,Daisy,43,7,2017-01-20
3,Schnauzer,Gray,Riley,49,17,2014-09-01
2,Chow Chow,Brown,Cooper,46,23,2011-10-12
0,Labrador,Chocolate,Buddy,56,25,2013-07-01
4,Labrador,Black,Bear,56,29,2017-01-20
8,Labrador,Yellow,Max,59,29,2017-01-20
9,Labrador,Black,Stella,56,29,2017-10-05


In [12]:
# Two sort keys: weight ascending first, then height descending to order
# dogs that share the same weight.
dogs.sort_values(
    by=["weight_kg", "height_cm"],
    ascending=[True, False],
)

Unnamed: 0,breed,color,name,height_cm,weight_kg,date_of_birth
5,Chihuahua,Brown,Bella,18,2,2015-04-20
7,Chihuahua,Black,Lola,18,2,2015-08-25
1,Poodle,White,Lucy,43,6,2016-10-11
6,Poodle,White,Daisy,43,7,2017-01-20
3,Schnauzer,Gray,Riley,49,17,2014-09-01
2,Chow Chow,Brown,Cooper,46,23,2011-10-12
0,Labrador,Chocolate,Buddy,56,25,2013-07-01
8,Labrador,Yellow,Max,59,29,2017-01-20
4,Labrador,Black,Bear,56,29,2017-01-20
9,Labrador,Black,Stella,56,29,2017-10-05


# Subsetting

In [13]:
dogs["name"] # select a single column by its label; the result is a Series

0     Buddy
1      Lucy
2    Cooper
3     Riley
4      Bear
5     Bella
6     Daisy
7      Lola
8       Max
9    Stella
Name: name, dtype: object

In [15]:
# Outer brackets do the subsetting; the inner brackets are a list of the
# column labels to keep, in the order they should appear.
dogs[["breed", "height_cm"]] # subset by columns

Unnamed: 0,breed,height_cm
0,Labrador,56
1,Poodle,43
2,Chow Chow,46
3,Schnauzer,49
4,Labrador,56
5,Chihuahua,18
6,Poodle,43
7,Chihuahua,18
8,Labrador,59
9,Labrador,56


In [16]:
# Subset by rows: the comparison yields a boolean mask, and only the rows
# where the mask is True are kept.
dogs.loc[dogs["height_cm"] > 50]

Unnamed: 0,breed,color,name,height_cm,weight_kg,date_of_birth
0,Labrador,Chocolate,Buddy,56,25,2013-07-01
4,Labrador,Black,Bear,56,29,2017-01-20
8,Labrador,Yellow,Max,59,29,2017-01-20
9,Labrador,Black,Stella,56,29,2017-10-05


In [17]:
# Subset by text: keep the rows whose breed is exactly "Labrador".
dogs.loc[dogs["breed"] == "Labrador"]

Unnamed: 0,breed,color,name,height_cm,weight_kg,date_of_birth
0,Labrador,Chocolate,Buddy,56,25,2013-07-01
4,Labrador,Black,Bear,56,29,2017-01-20
8,Labrador,Yellow,Max,59,29,2017-01-20
9,Labrador,Black,Stella,56,29,2017-10-05


In [18]:
# Subset by dates. date_of_birth holds strings, so comparing the column
# directly against "2015-01-01" is an alphabetical comparison that is only
# correct while every value is a zero-padded ISO-8601 date. Parsing to
# datetimes first makes this a true chronological comparison; the column
# stored in `dogs` is left unchanged.
dogs[pd.to_datetime(dogs["date_of_birth"]) > pd.Timestamp("2015-01-01")]

Unnamed: 0,breed,color,name,height_cm,weight_kg,date_of_birth
1,Poodle,White,Lucy,43,6,2016-10-11
4,Labrador,Black,Bear,56,29,2017-01-20
5,Chihuahua,Brown,Bella,18,2,2015-04-20
6,Poodle,White,Daisy,43,7,2017-01-20
7,Chihuahua,Black,Lola,18,2,2015-08-25
8,Labrador,Yellow,Max,59,29,2017-01-20
9,Labrador,Black,Stella,56,29,2017-10-05


In [19]:
# Subset by multiple conditions: build one boolean mask per condition and
# combine them element-wise with `&` (logical AND).
is_chow_chow = dogs["breed"] == "Chow Chow"
is_brown = dogs["color"] == "Brown"
dogs[is_chow_chow & is_brown]

Unnamed: 0,breed,color,name,height_cm,weight_kg,date_of_birth
2,Chow Chow,Brown,Cooper,46,23,2011-10-12


In [20]:
# isin() marks the rows whose color equals any of the listed values, which
# avoids chaining several `==` comparisons with `|`.
is_black_or_brown = dogs["color"].isin(["Black", "Brown"])
dogs.loc[is_black_or_brown]

Unnamed: 0,breed,color,name,height_cm,weight_kg,date_of_birth
2,Chow Chow,Brown,Cooper,46,23,2011-10-12
4,Labrador,Black,Bear,56,29,2017-01-20
5,Chihuahua,Brown,Bella,18,2,2015-04-20
7,Chihuahua,Black,Lola,18,2,2015-08-25
9,Labrador,Black,Stella,56,29,2017-10-05


# New Columns

In [22]:
# Derive the height in metres from the centimetre column (100 cm per metre)
# and store it on `dogs` as a new column.
dogs["height_m"] = dogs["height_cm"].div(100)
print(dogs)

       breed      color    name  height_cm  weight_kg date_of_birth  height_m
0   Labrador  Chocolate   Buddy         56         25    2013-07-01      0.56
1     Poodle      White    Lucy         43          6    2016-10-11      0.43
2  Chow Chow      Brown  Cooper         46         23    2011-10-12      0.46
3  Schnauzer       Gray   Riley         49         17    2014-09-01      0.49
4   Labrador      Black    Bear         56         29    2017-01-20      0.56
5  Chihuahua      Brown   Bella         18          2    2015-04-20      0.18
6     Poodle      White   Daisy         43          7    2017-01-20      0.43
7  Chihuahua      Black    Lola         18          2    2015-08-25      0.18
8   Labrador     Yellow     Max         59         29    2017-01-20      0.59
9   Labrador      Black  Stella         56         29    2017-10-05      0.56


In [23]:
# Body-mass index: weight in kilograms divided by the square of the height
# in metres. Relies on the "height_m" column already existing on `dogs`.
dogs["bmi"] = dogs["weight_kg"].div(dogs["height_m"].pow(2))
print(dogs.head())

       breed      color    name  height_cm  weight_kg date_of_birth  height_m  \
0   Labrador  Chocolate   Buddy         56         25    2013-07-01      0.56   
1     Poodle      White    Lucy         43          6    2016-10-11      0.43   
2  Chow Chow      Brown  Cooper         46         23    2011-10-12      0.46   
3  Schnauzer       Gray   Riley         49         17    2014-09-01      0.49   
4   Labrador      Black    Bear         56         29    2017-01-20      0.56   

          bmi  
0   79.719388  
1   32.449973  
2  108.695652  
3   70.803832  
4   92.474490  


In [24]:
# Keep only the dogs whose BMI is below 100 ...
bmi_lt_100 = dogs.loc[dogs["bmi"] < 100]
# ... order them from tallest to shortest ...
bmi_lt_100_height = bmi_lt_100.sort_values(by="height_cm", ascending=False)
# ... and display just the columns relevant to the comparison.
bmi_lt_100_height.loc[:, ["name", "height_cm", "bmi"]]

Unnamed: 0,name,height_cm,bmi
8,Max,59,83.309394
0,Buddy,56,79.719388
4,Bear,56,92.47449
9,Stella,56,92.47449
3,Riley,49,70.803832
1,Lucy,43,32.449973
6,Daisy,43,37.858302
5,Bella,18,61.728395
7,Lola,18,61.728395
