# Pandas DataFrame

In [1]:
import pandas as pd

In [2]:
homelessness = pd.read_csv('datasets/homelessness.csv')

In [3]:
# Print the head of the homelessness data
homelessness.head()

Unnamed: 0.1,Unnamed: 0,region,state,individuals,family_members,state_pop
0,0,East South Central,Alabama,2570.0,864.0,4887681
1,1,Pacific,Alaska,1434.0,582.0,735139
2,2,Mountain,Arizona,7259.0,2606.0,7158024
3,3,West South Central,Arkansas,2280.0,432.0,3009733
4,4,Pacific,California,109008.0,20964.0,39461588


In [4]:
# Print information about homelessness
homelessness.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 6 columns):
Unnamed: 0        51 non-null int64
region            51 non-null object
state             51 non-null object
individuals       51 non-null float64
family_members    51 non-null float64
state_pop         51 non-null int64
dtypes: float64(2), int64(2), object(2)
memory usage: 2.5+ KB


In [5]:
# Print the shape of homelessness
homelessness.shape

(51, 6)

In [6]:
# Print a description of homelessness
homelessness.describe()

Unnamed: 0.1,Unnamed: 0,individuals,family_members,state_pop
count,51.0,51.0,51.0,51.0
mean,25.0,7225.784314,3504.882353,6405637.0
std,14.866069,15991.025083,7805.411811,7327258.0
min,0.0,434.0,75.0,577601.0
25%,12.5,1446.5,592.0,1777414.0
50%,25.0,3082.0,1482.0,4461153.0
75%,37.5,6781.5,3196.0,7340946.0
max,50.0,109008.0,52070.0,39461590.0


In [7]:
# Import pandas using the alias pd
import pandas as pd

# Print the values of homelessness
homelessness.values

array([[0, 'East South Central', 'Alabama', 2570.0, 864.0, 4887681],
       [1, 'Pacific', 'Alaska', 1434.0, 582.0, 735139],
       [2, 'Mountain', 'Arizona', 7259.0, 2606.0, 7158024],
       [3, 'West South Central', 'Arkansas', 2280.0, 432.0, 3009733],
       [4, 'Pacific', 'California', 109008.0, 20964.0, 39461588],
       [5, 'Mountain', 'Colorado', 7607.0, 3250.0, 5691287],
       [6, 'New England', 'Connecticut', 2280.0, 1696.0, 3571520],
       [7, 'South Atlantic', 'Delaware', 708.0, 374.0, 965479],
       [8, 'South Atlantic', 'District of Columbia', 3770.0, 3134.0,
        701547],
       [9, 'South Atlantic', 'Florida', 21443.0, 9587.0, 21244317],
       [10, 'South Atlantic', 'Georgia', 6943.0, 2556.0, 10511131],
       [11, 'Pacific', 'Hawaii', 4131.0, 2399.0, 1420593],
       [12, 'Mountain', 'Idaho', 1297.0, 715.0, 1750536],
       [13, 'East North Central', 'Illinois', 6752.0, 3891.0, 12723071],
       [14, 'East North Central', 'Indiana', 3776.0, 1482.0, 6695497],
    

In [8]:
# Print the column index of homelessness
homelessness.columns

Index(['Unnamed: 0', 'region', 'state', 'individuals', 'family_members',
       'state_pop'],
      dtype='object')

In [9]:
# Print the row index of homelessness
homelessness.index

RangeIndex(start=0, stop=51, step=1)

In [10]:
fam_lt_1k_pac = homelessness[(homelessness['family_members'] < 1000) & (homelessness['region']== "Pacific")]

# See the result
print(fam_lt_1k_pac)

   Unnamed: 0   region   state  individuals  family_members  state_pop
1           1  Pacific  Alaska       1434.0           582.0     735139


### or alternative `|`

In [11]:
# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]

# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness['state'].isin(canu)]

# See the result
print(mojave_homelessness)

    Unnamed: 0    region       state  individuals  family_members  state_pop
2            2  Mountain     Arizona       7259.0          2606.0    7158024
4            4   Pacific  California     109008.0         20964.0   39461588
28          28  Mountain      Nevada       7058.0           486.0    3027341
44          44  Mountain        Utah       1904.0           972.0    3153550


## Add column

In [12]:
# Add total col as sum of individuals and family_members
homelessness['total'] = homelessness['individuals'] + homelessness['family_members']

# Add p_individuals col as proportion of total that are individuals
homelessness['p_individuals'] = homelessness['individuals']/homelessness['total']

# See the result
print(homelessness)

    Unnamed: 0              region                 state  individuals  \
0            0  East South Central               Alabama       2570.0   
1            1             Pacific                Alaska       1434.0   
2            2            Mountain               Arizona       7259.0   
3            3  West South Central              Arkansas       2280.0   
4            4             Pacific            California     109008.0   
5            5            Mountain              Colorado       7607.0   
6            6         New England           Connecticut       2280.0   
7            7      South Atlantic              Delaware        708.0   
8            8      South Atlantic  District of Columbia       3770.0   
9            9      South Atlantic               Florida      21443.0   
10          10      South Atlantic               Georgia       6943.0   
11          11             Pacific                Hawaii       4131.0   
12          12            Mountain                 

In [13]:
# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness['individuals'] / homelessness['state_pop'] 

# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness['indiv_per_10k'] > 20]

# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = high_homelessness.sort_values("indiv_per_10k",ascending=False)

# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[['state','indiv_per_10k']]

# See the result
print(result)

                   state  indiv_per_10k
8   District of Columbia      53.738381
11                Hawaii      29.079406
4             California      27.623825
37                Oregon      26.636307
28                Nevada      23.314189
47            Washington      21.829195
32              New York      20.392363


# Summary Statistics

In [14]:
sales = pd.read_csv('datasets/sales_subset.csv')

In [15]:
# Print the head of the sales DataFrame
print(sales.head())

# Print the info about the sales DataFrame
print(sales.info())

# Print the mean of weekly_sales
print(sales['weekly_sales'].mean())

# Print the median of weekly_sales
print(sales['weekly_sales'].median())

   Unnamed: 0  store type  department        date  weekly_sales  is_holiday  \
0           0      1    A           1  2010-02-05      24924.50       False   
1           1      1    A           1  2010-03-05      21827.90       False   
2           2      1    A           1  2010-04-02      57258.43       False   
3           3      1    A           1  2010-05-07      17413.94       False   
4           4      1    A           1  2010-06-04      17558.09       False   

   temperature_c  fuel_price_usd_per_l  unemployment  
0       5.727778              0.679451         8.106  
1       8.055556              0.693452         8.106  
2      16.816667              0.718284         7.808  
3      22.527778              0.748928         7.808  
4      27.050000              0.714586         7.808  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10774 entries, 0 to 10773
Data columns (total 10 columns):
Unnamed: 0              10774 non-null int64
store                   10774 non-null in

In [16]:
max(sales['date'])

'2012-10-26'

In [17]:
min(sales['date'])

'2010-02-05'

In [18]:
# Import NumPy and create custom IQR function
import numpy as np
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

# Update to print IQR and median of temperature_c, fuel_price_usd_per_l, & unemployment
print(sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median]))

        temperature_c  fuel_price_usd_per_l  unemployment
iqr         16.583333              0.073176         0.565
median      16.966667              0.743381         8.099


In [19]:
# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=['store','type'])
store_types.head()

Unnamed: 0.1,Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
901,901,2,A,1,2010-02-05,35034.06,False,4.55,0.679451,8.324
1798,1798,4,A,1,2010-02-05,38724.42,False,6.533333,0.686319,8.623
2699,2699,6,A,1,2010-02-05,25619.0,False,4.683333,0.679451,7.259
3593,3593,10,B,1,2010-02-05,40212.84,False,12.411111,0.782478,9.765


In [20]:
# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=['store','department'])
store_depts.head()

Unnamed: 0.1,Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
12,12,1,A,2,2010-02-05,50605.27,False,5.727778,0.679451,8.106
24,24,1,A,3,2010-02-05,13740.12,False,5.727778,0.679451,8.106
36,36,1,A,4,2010-02-05,39954.04,False,5.727778,0.679451,8.106
48,48,1,A,5,2010-02-05,32229.38,False,5.727778,0.679451,8.106


In [21]:
# Subset the rows where is_holiday is True and drop duplicate dates
holiday_dates = sales[sales["is_holiday"]==True].drop_duplicates(subset='date')

In [22]:
holiday_dates['date']

498     2010-09-10
691     2011-11-25
2315    2010-02-12
6735    2012-09-07
6810    2010-12-31
6815    2012-02-10
6820    2011-09-09
Name: date, dtype: object

### Sorting within `value_counts()`

In [23]:
# Count the number of stores of each type
store_counts = store_types['type'].value_counts()
store_counts

A    11
B     1
Name: type, dtype: int64

In [24]:
# Get the proportion of stores of each type
store_props = store_types['type'].value_counts(normalize=True)
store_props


A    0.916667
B    0.083333
Name: type, dtype: float64

In [25]:
# Count the number of each department number and sort
dept_counts_sorted = store_depts['department'].value_counts(sort=True)
dept_counts_sorted


41    12
30    12
23    12
24    12
25    12
26    12
27    12
28    12
29    12
31    12
21    12
32    12
33    12
34    12
35    12
36    12
38    12
40    12
22    12
20    12
42    12
9     12
2     12
3     12
4     12
5     12
6     12
7     12
8     12
10    12
      ..
82    12
83    12
85    12
87    12
90    12
91    12
67    12
60    12
59    12
58    12
56    12
55    12
54    12
52    12
51    12
94    12
49    12
95    12
47    12
46    12
96    12
45    12
97    12
92    12
99    11
37    10
48     8
50     6
39     4
43     2
Name: department, Length: 80, dtype: int64

In [26]:
# Calc total weekly sales
sales_all = sales["weekly_sales"].sum()

# Subset for type A stores, calc total weekly sales
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()

# Subset for type B stores, calc total weekly sales
sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()

# Subset for type C stores, calc total weekly sales
sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()

# Get proportion for each type
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all
print(sales_propn_by_type)

[0.9097747 0.0902253 0.       ]


In [27]:
# Import numpy with the alias np
import numpy as np

# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")['weekly_sales'].agg([min, max, np.mean, np.median])

# Print sales_stats
print(sales_stats)

# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[['unemployment','fuel_price_usd_per_l']].agg([min,max,np.mean,np.median])

# Print unemp_fuel_stats
print(unemp_fuel_stats)

         min        max          mean    median
type                                           
A    -1098.0  293966.05  23674.667242  11943.92
B     -798.0  232558.51  25696.678370  13336.08
     unemployment                         fuel_price_usd_per_l            \
              min    max      mean median                  min       max   
type                                                                       
A           3.879  8.992  7.972611  8.067             0.664129  1.107410   
B           7.170  9.765  9.279323  9.199             0.760023  1.107674   

                          
          mean    median  
type                      
A     0.744619  0.735455  
B     0.805858  0.803348  


In [28]:
# Pivot for mean weekly_sales by store type and holiday 
mean_sales_by_type_holiday = sales.pivot_table(values = 'weekly_sales',index='type',columns='is_holiday')

# Print mean_sales_by_type_holiday
print(mean_sales_by_type_holiday)

is_holiday         False      True 
type                               
A           23768.583523  590.04525
B           25751.980533  810.70500


In [29]:
# Print mean weekly_sales by department and type; fill missing values with 0
print(sales.pivot_table(values = 'weekly_sales', index='department', columns='type', fill_value=0))

type                    A              B
department                              
1            30961.725379   44050.626667
2            67600.158788  112958.526667
3            17160.002955   30580.655000
4            44285.399091   51219.654167
5            34821.011364   63236.875000
6             7136.292652   10717.297500
7            38454.336818   52909.653333
8            48583.475303   90733.753333
9            30120.449924   66679.301667
10           30930.456364   48595.126667
11           23028.312727   35488.429167
12            6786.840606    9656.520000
13           51398.168561   67213.587500
14           22457.695303   40400.020000
16           25202.751894   29558.182500
17           16167.586136   27675.351667
18           12201.771212   17361.347500
19            1560.951719    3365.895000
20            8312.070227   16191.810000
21            9324.387197   10368.968333
22           14225.324167   26044.797500
23           29350.745076   63960.273333
24            82

In [30]:
sales.pivot_table(values="weekly_sales", index="department", columns="type",  fill_value=0, margins=True)

type,A,B,All
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,30961.725379,44050.626667,32052.467153
2,67600.158788,112958.526667,71380.022778
3,17160.002955,30580.655000,18278.390625
4,44285.399091,51219.654167,44863.253681
5,34821.011364,63236.875000,37189.000000
6,7136.292652,10717.297500,7434.709722
7,38454.336818,52909.653333,39658.946528
8,48583.475303,90733.753333,52095.998472
9,30120.449924,66679.301667,33167.020903
10,30930.456364,48595.126667,32402.512222


In [31]:
temperatures= pd.read_csv('datasets/temperatures.csv')

In [32]:
# Look at temperatures
temperatures

Unnamed: 0.1,Unnamed: 0,date,city,country,avg_temp_c
0,0,2000-01-01,Abidjan,Côte D'Ivoire,27.293
1,1,2000-02-01,Abidjan,Côte D'Ivoire,27.685
2,2,2000-03-01,Abidjan,Côte D'Ivoire,29.061
3,3,2000-04-01,Abidjan,Côte D'Ivoire,28.162
4,4,2000-05-01,Abidjan,Côte D'Ivoire,27.547
5,5,2000-06-01,Abidjan,Côte D'Ivoire,25.812
6,6,2000-07-01,Abidjan,Côte D'Ivoire,24.870
7,7,2000-08-01,Abidjan,Côte D'Ivoire,24.884
8,8,2000-09-01,Abidjan,Côte D'Ivoire,25.405
9,9,2000-10-01,Abidjan,Côte D'Ivoire,26.074


In [33]:
# Set the index of temperatures to city
temperatures_ind = temperatures.set_index('city')

In [34]:
# Look at temperatures_ind
temperatures_ind

Unnamed: 0_level_0,Unnamed: 0,date,country,avg_temp_c
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abidjan,0,2000-01-01,Côte D'Ivoire,27.293
Abidjan,1,2000-02-01,Côte D'Ivoire,27.685
Abidjan,2,2000-03-01,Côte D'Ivoire,29.061
Abidjan,3,2000-04-01,Côte D'Ivoire,28.162
Abidjan,4,2000-05-01,Côte D'Ivoire,27.547
Abidjan,5,2000-06-01,Côte D'Ivoire,25.812
Abidjan,6,2000-07-01,Côte D'Ivoire,24.870
Abidjan,7,2000-08-01,Côte D'Ivoire,24.884
Abidjan,8,2000-09-01,Côte D'Ivoire,25.405
Abidjan,9,2000-10-01,Côte D'Ivoire,26.074


In [35]:
# Reset the temperatures_ind index, keeping its contents
temperatures_ind.reset_index()

Unnamed: 0.1,city,Unnamed: 0,date,country,avg_temp_c
0,Abidjan,0,2000-01-01,Côte D'Ivoire,27.293
1,Abidjan,1,2000-02-01,Côte D'Ivoire,27.685
2,Abidjan,2,2000-03-01,Côte D'Ivoire,29.061
3,Abidjan,3,2000-04-01,Côte D'Ivoire,28.162
4,Abidjan,4,2000-05-01,Côte D'Ivoire,27.547
5,Abidjan,5,2000-06-01,Côte D'Ivoire,25.812
6,Abidjan,6,2000-07-01,Côte D'Ivoire,24.870
7,Abidjan,7,2000-08-01,Côte D'Ivoire,24.884
8,Abidjan,8,2000-09-01,Côte D'Ivoire,25.405
9,Abidjan,9,2000-10-01,Côte D'Ivoire,26.074


In [36]:
# Reset the temperatures_ind index, dropping its contents
temperatures_ind.reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,date,country,avg_temp_c
0,0,2000-01-01,Côte D'Ivoire,27.293
1,1,2000-02-01,Côte D'Ivoire,27.685
2,2,2000-03-01,Côte D'Ivoire,29.061
3,3,2000-04-01,Côte D'Ivoire,28.162
4,4,2000-05-01,Côte D'Ivoire,27.547
5,5,2000-06-01,Côte D'Ivoire,25.812
6,6,2000-07-01,Côte D'Ivoire,24.870
7,7,2000-08-01,Côte D'Ivoire,24.884
8,8,2000-09-01,Côte D'Ivoire,25.405
9,9,2000-10-01,Côte D'Ivoire,26.074


In [37]:
# Make a list of cities to subset on
cities = ["Moscow", "Saint Petersburg"]

In [38]:
# Subset temperatures using square brackets
temperatures[temperatures['city'].isin(cities)]

Unnamed: 0.1,Unnamed: 0,date,city,country,avg_temp_c
10725,10725,2000-01-01,Moscow,Russia,-7.313
10726,10726,2000-02-01,Moscow,Russia,-3.551
10727,10727,2000-03-01,Moscow,Russia,-1.661
10728,10728,2000-04-01,Moscow,Russia,10.096
10729,10729,2000-05-01,Moscow,Russia,10.357
10730,10730,2000-06-01,Moscow,Russia,15.243
10731,10731,2000-07-01,Moscow,Russia,18.676
10732,10732,2000-08-01,Moscow,Russia,16.420
10733,10733,2000-09-01,Moscow,Russia,9.775
10734,10734,2000-10-01,Moscow,Russia,6.611


In [39]:
temperatures_ind.loc[cities]

Unnamed: 0_level_0,Unnamed: 0,date,country,avg_temp_c
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Moscow,10725,2000-01-01,Russia,-7.313
Moscow,10726,2000-02-01,Russia,-3.551
Moscow,10727,2000-03-01,Russia,-1.661
Moscow,10728,2000-04-01,Russia,10.096
Moscow,10729,2000-05-01,Russia,10.357
Moscow,10730,2000-06-01,Russia,15.243
Moscow,10731,2000-07-01,Russia,18.676
Moscow,10732,2000-08-01,Russia,16.420
Moscow,10733,2000-09-01,Russia,9.775
Moscow,10734,2000-10-01,Russia,6.611


In [40]:
# Index temperatures by country & city
temperatures_ind = temperatures.set_index(['country', 'city'])

In [41]:
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [("Brazil","Rio De Janeiro"),("Pakistan", "Lahore")]


In [42]:
# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])

                         Unnamed: 0        date  avg_temp_c
country  city                                              
Brazil   Rio De Janeiro       12540  2000-01-01      25.974
         Rio De Janeiro       12541  2000-02-01      26.699
         Rio De Janeiro       12542  2000-03-01      26.270
         Rio De Janeiro       12543  2000-04-01      25.750
         Rio De Janeiro       12544  2000-05-01      24.356
         Rio De Janeiro       12545  2000-06-01      24.059
         Rio De Janeiro       12546  2000-07-01      22.049
         Rio De Janeiro       12547  2000-08-01      22.447
         Rio De Janeiro       12548  2000-09-01      22.099
         Rio De Janeiro       12549  2000-10-01      23.844
         Rio De Janeiro       12550  2000-11-01      23.601
         Rio De Janeiro       12551  2000-12-01      25.209
         Rio De Janeiro       12552  2001-01-01      26.852
         Rio De Janeiro       12553  2001-02-01      27.992
         Rio De Janeiro       12554  200

In [43]:
# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())

# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level = 'city'))

# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["country","city"], ascending = [True, False]))

                    Unnamed: 0        date  avg_temp_c
country     city                                      
Afghanistan Kabul         7260  2000-01-01       3.326
            Kabul         7261  2000-02-01       3.454
            Kabul         7262  2000-03-01       9.612
            Kabul         7263  2000-04-01      17.925
            Kabul         7264  2000-05-01      24.658
            Kabul         7265  2000-06-01      25.582
            Kabul         7266  2000-07-01      26.107
            Kabul         7267  2000-08-01      25.459
            Kabul         7268  2000-09-01      22.116
            Kabul         7269  2000-10-01      16.806
            Kabul         7270  2000-11-01       9.720
            Kabul         7271  2000-12-01       5.107
            Kabul         7272  2001-01-01       2.208
            Kabul         7273  2001-02-01       5.567
            Kabul         7274  2001-03-01      10.807
            Kabul         7275  2001-04-01      16.587
          

In [44]:
# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()


In [45]:

# Subset rows from Pakistan to Russia
print(temperatures_srt.loc['Pakistan':'Russia'])


                           Unnamed: 0        date  avg_temp_c
country  city                                                
Pakistan Faisalabad              4785  2000-01-01      12.792
         Faisalabad              4786  2000-02-01      14.339
         Faisalabad              4787  2000-03-01      20.309
         Faisalabad              4788  2000-04-01      29.072
         Faisalabad              4789  2000-05-01      34.845
         Faisalabad              4790  2000-06-01      34.299
         Faisalabad              4791  2000-07-01      32.302
         Faisalabad              4792  2000-08-01      32.255
         Faisalabad              4793  2000-09-01      30.438
         Faisalabad              4794  2000-10-01      27.395
         Faisalabad              4795  2000-11-01      20.640
         Faisalabad              4796  2000-12-01      15.195
         Faisalabad              4797  2001-01-01      11.853
         Faisalabad              4798  2001-02-01      16.701
        

In [46]:

# Try to subset rows from Lahore to Moscow
print(temperatures_srt.loc['Lahore':"Moscow"])



                    Unnamed: 0        date  avg_temp_c
country city                                          
Mexico  Mexico           10230  2000-01-01      12.694
        Mexico           10231  2000-02-01      14.677
        Mexico           10232  2000-03-01      17.376
        Mexico           10233  2000-04-01      18.294
        Mexico           10234  2000-05-01      18.562
        Mexico           10235  2000-06-01      17.270
        Mexico           10236  2000-07-01      17.723
        Mexico           10237  2000-08-01      16.967
        Mexico           10238  2000-09-01      16.996
        Mexico           10239  2000-10-01      15.605
        Mexico           10240  2000-11-01      15.237
        Mexico           10241  2000-12-01      11.966
        Mexico           10242  2001-01-01      12.733
        Mexico           10243  2001-02-01      14.461
        Mexico           10244  2001-03-01      15.926
        Mexico           10245  2001-04-01      18.156
        Me

In [47]:
# Subset rows from Pakistan, Lahore to Russia, Moscow
print(temperatures_srt.loc[('Pakistan','Lahore'):('Russia','Moscow')])


                 Unnamed: 0        date  avg_temp_c
country  city                                      
Pakistan Lahore        8415  2000-01-01      12.792
         Lahore        8416  2000-02-01      14.339
         Lahore        8417  2000-03-01      20.309
         Lahore        8418  2000-04-01      29.072
         Lahore        8419  2000-05-01      34.845
         Lahore        8420  2000-06-01      34.299
         Lahore        8421  2000-07-01      32.302
         Lahore        8422  2000-08-01      32.255
         Lahore        8423  2000-09-01      30.438
         Lahore        8424  2000-10-01      27.395
         Lahore        8425  2000-11-01      20.640
         Lahore        8426  2000-12-01      15.195
         Lahore        8427  2001-01-01      11.853
         Lahore        8428  2001-02-01      16.701
         Lahore        8429  2001-03-01      21.885
         Lahore        8430  2001-04-01      26.814
         Lahore        8431  2001-05-01      33.924
         Lah

In [48]:
# Subset rows from India, Hyderabad to Iraq, Baghdad
print(temperatures_srt.loc[('India', 'Hyderabad'):('Iraq','Baghdad')])


                   Unnamed: 0        date  avg_temp_c
country city                                         
India   Hyderabad        5940  2000-01-01      23.779
        Hyderabad        5941  2000-02-01      25.826
        Hyderabad        5942  2000-03-01      28.821
        Hyderabad        5943  2000-04-01      32.698
        Hyderabad        5944  2000-05-01      32.438
        Hyderabad        5945  2000-06-01      28.422
        Hyderabad        5946  2000-07-01      27.137
        Hyderabad        5947  2000-08-01      26.576
        Hyderabad        5948  2000-09-01      27.433
        Hyderabad        5949  2000-10-01      26.927
        Hyderabad        5950  2000-11-01      24.776
        Hyderabad        5951  2000-12-01      21.949
        Hyderabad        5952  2001-01-01      23.406
        Hyderabad        5953  2001-02-01      26.677
        Hyderabad        5954  2001-03-01      29.393
        Hyderabad        5955  2001-04-01      31.289
        Hyderabad        595

In [49]:

# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:,'date':'avg_temp_c'])


                          date  avg_temp_c
country     city                          
Afghanistan Kabul   2000-01-01       3.326
            Kabul   2000-02-01       3.454
            Kabul   2000-03-01       9.612
            Kabul   2000-04-01      17.925
            Kabul   2000-05-01      24.658
            Kabul   2000-06-01      25.582
            Kabul   2000-07-01      26.107
            Kabul   2000-08-01      25.459
            Kabul   2000-09-01      22.116
            Kabul   2000-10-01      16.806
            Kabul   2000-11-01       9.720
            Kabul   2000-12-01       5.107
            Kabul   2001-01-01       2.208
            Kabul   2001-02-01       5.567
            Kabul   2001-03-01      10.807
            Kabul   2001-04-01      16.587
            Kabul   2001-05-01      23.782
            Kabul   2001-06-01      25.905
            Kabul   2001-07-01      26.065
            Kabul   2001-08-01      25.149
            Kabul   2001-09-01      21.178
           

In [50]:

# Subset in both directions at once
print(temperatures_srt.loc[('India', 'Hyderabad'):('Iraq','Baghdad'),'date':'avg_temp_c'])

                         date  avg_temp_c
country city                             
India   Hyderabad  2000-01-01      23.779
        Hyderabad  2000-02-01      25.826
        Hyderabad  2000-03-01      28.821
        Hyderabad  2000-04-01      32.698
        Hyderabad  2000-05-01      32.438
        Hyderabad  2000-06-01      28.422
        Hyderabad  2000-07-01      27.137
        Hyderabad  2000-08-01      26.576
        Hyderabad  2000-09-01      27.433
        Hyderabad  2000-10-01      26.927
        Hyderabad  2000-11-01      24.776
        Hyderabad  2000-12-01      21.949
        Hyderabad  2001-01-01      23.406
        Hyderabad  2001-02-01      26.677
        Hyderabad  2001-03-01      29.393
        Hyderabad  2001-04-01      31.289
        Hyderabad  2001-05-01      34.030
        Hyderabad  2001-06-01      29.432
        Hyderabad  2001-07-01      28.309
        Hyderabad  2001-08-01      26.670
        Hyderabad  2001-09-01      27.698
        Hyderabad  2001-10-01     

In [51]:
# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures['date'] >= '2010-01-01') & (temperatures['date'] <= '2011-12-31')]
print(temperatures_bool)


       Unnamed: 0        date         city        country  avg_temp_c
120           120  2010-01-01      Abidjan  Côte D'Ivoire      28.270
121           121  2010-02-01      Abidjan  Côte D'Ivoire      29.262
122           122  2010-03-01      Abidjan  Côte D'Ivoire      29.596
123           123  2010-04-01      Abidjan  Côte D'Ivoire      29.068
124           124  2010-05-01      Abidjan  Côte D'Ivoire      28.258
125           125  2010-06-01      Abidjan  Côte D'Ivoire      26.683
126           126  2010-07-01      Abidjan  Côte D'Ivoire      25.589
127           127  2010-08-01      Abidjan  Côte D'Ivoire      25.400
128           128  2010-09-01      Abidjan  Côte D'Ivoire      25.710
129           129  2010-10-01      Abidjan  Côte D'Ivoire      26.397
130           130  2010-11-01      Abidjan  Côte D'Ivoire      27.446
131           131  2010-12-01      Abidjan  Côte D'Ivoire      27.666
132           132  2011-01-01      Abidjan  Côte D'Ivoire      27.360
133           133  2

In [52]:

# Set date as the index and sort the index
temperatures_ind = temperatures.set_index('date').sort_index()


In [53]:

# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011
print(temperatures_ind.loc['2010':'2011'])

            Unnamed: 0            city       country  avg_temp_c
date                                                            
2010-01-01        4905      Faisalabad      Pakistan      11.810
2010-01-01       10185       Melbourne     Australia      20.016
2010-01-01        3750       Chongqing         China       7.921
2010-01-01       13155       São Paulo        Brazil      23.738
2010-01-01        5400       Guangzhou         China      14.136
2010-01-01        8370           Lagos       Nigeria      27.830
2010-01-01        4410           Delhi         India      14.014
2010-01-01        8865            Lima          Peru      20.450
2010-01-01       10020         Mashhad          Iran       4.746
2010-01-01        3915           Dakar       Senegal      23.188
2010-01-01        4575           Dhaka    Bangladesh      17.354
2010-01-01        1605  Belo Horizonte        Brazil      24.349
2010-01-01        2265        Brasília        Brazil      23.417
2010-01-01       10515   

In [54]:
# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc['08-2010':"02-2011"])

Empty DataFrame
Columns: [Unnamed: 0, city, country, avg_temp_c]
Index: []


In [55]:
# Get 23rd row, 2nd column (index 22, 1)
print(temperatures.iloc[22,1])
# Use slicing to get the first 5 rows
print(temperatures.iloc[0:5])

# Use slicing to get columns 3 to 4
print(temperatures.iloc[:,2:4])

# Use slicing in both directions at once
print(temperatures.iloc[0:5,2:4])

2001-11-01
   Unnamed: 0        date     city        country  avg_temp_c
0           0  2000-01-01  Abidjan  Côte D'Ivoire      27.293
1           1  2000-02-01  Abidjan  Côte D'Ivoire      27.685
2           2  2000-03-01  Abidjan  Côte D'Ivoire      29.061
3           3  2000-04-01  Abidjan  Côte D'Ivoire      28.162
4           4  2000-05-01  Abidjan  Côte D'Ivoire      27.547
          city        country
0      Abidjan  Côte D'Ivoire
1      Abidjan  Côte D'Ivoire
2      Abidjan  Côte D'Ivoire
3      Abidjan  Côte D'Ivoire
4      Abidjan  Côte D'Ivoire
5      Abidjan  Côte D'Ivoire
6      Abidjan  Côte D'Ivoire
7      Abidjan  Côte D'Ivoire
8      Abidjan  Côte D'Ivoire
9      Abidjan  Côte D'Ivoire
10     Abidjan  Côte D'Ivoire
11     Abidjan  Côte D'Ivoire
12     Abidjan  Côte D'Ivoire
13     Abidjan  Côte D'Ivoire
14     Abidjan  Côte D'Ivoire
15     Abidjan  Côte D'Ivoire
16     Abidjan  Côte D'Ivoire
17     Abidjan  Côte D'Ivoire
18     Abidjan  Côte D'Ivoire
19     Abidjan  C

## Panda viz

In [56]:
avocados = pd.read_pickle('datasets/avoplotto.pkl')

ModuleNotFoundError: No module named 'pandas.core.internals.managers'; 'pandas.core.internals' is not a package