In [1]:
import datetime

import numpy as np
import pandas as pd
import pandas_datareader as pdr  # IF NECESSARY, from terminal: pip install pandas_datareader
from numpy.random import default_rng

pd.set_option("display.max_rows", 10)  # display option for pandas
# more here: https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html

## A quick hit of Numpy

In [2]:
# Create a random vector of 5 draws from the standard normal distribution.
# Every run of this cell gives different numbers, because default_rng()
# is called without a seed; see 3.2.2.2 in the textbook for why,
# and how to prevent it.

rg = default_rng()
myray = rg.standard_normal(5)
print("myray:", myray) 

myray: [-1.06129318 -2.15400282 -1.21367737  0.11830619 -0.32750195]


In [3]:
# q1 - indexing: pick the elements at the odd positions (index 1, 3, ...),
# i.e. start at index 1 and step by 2
myray[1::2]

array([-2.15400282,  0.11830619])

Boolean arrays: Applying a logical test to an array returns a new array of the same shape, in which each element has been tested against that condition and replaced by the boolean answer.

In [4]:
# q2 - (a) boolean array + (b) "masking"

# (a)
myray > 0

array([False, False, False,  True, False])

In [5]:
# (b) use the boolean array as a "mask": keep only the elements where it is True
mask = myray > 0 
myray[mask] 
# or, in one step: myray[myray>0]

array([0.11830619])

## The main event - Pandas

Vocab
- series
- index 
- dataframe
- columns and names
- rows and index 
- multiindex 

In [6]:
# download three series from FRED into one dataframe (one column per series)
start = datetime.datetime(2017, 1, 1) # you can specify start and end dates this way
end = datetime.datetime(2021, 1, 27)
macro_df = pdr.data.DataReader(['GDP','CPIAUCSL','UNRATE'], 'fred', start, end)


In [7]:
# preview the dataframe
macro_df

Unnamed: 0_level_0,GDP,CPIAUCSL,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,19153.912,243.620,4.7
2017-02-01,,243.872,4.6
2017-03-01,,243.766,4.4
2017-04-01,19322.920,244.274,4.4
2017-05-01,,244.069,4.4
...,...,...,...
2020-09-01,,260.149,7.9
2020-10-01,21477.597,260.462,6.9
2020-11-01,,260.927,6.7
2020-12-01,,261.560,6.7


In [8]:
# shape
macro_df.shape # no paren, bc shape is attribute, not method

(49, 3)

In [9]:
# variable types
macro_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 49 entries, 2017-01-01 to 2021-01-01
Freq: MS
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   GDP       17 non-null     float64
 1   CPIAUCSL  49 non-null     float64
 2   UNRATE    49 non-null     float64
dtypes: float64(3)
memory usage: 1.5 KB


In [10]:
# look at top X rows
macro_df.head(3)

Unnamed: 0_level_0,GDP,CPIAUCSL,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,19153.912,243.62,4.7
2017-02-01,,243.872,4.6
2017-03-01,,243.766,4.4


In [11]:
# look at bottom X rows
macro_df.tail(7)

Unnamed: 0_level_0,GDP,CPIAUCSL,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-07-01,21138.574,258.604,10.2
2020-08-01,,259.511,8.4
2020-09-01,,260.149,7.9
2020-10-01,21477.597,260.462,6.9
2020-11-01,,260.927,6.7
2020-12-01,,261.56,6.7
2021-01-01,22038.226,262.231,6.4


In [12]:
# grab one variable: bracket + string var name returns a series
# (here two series are added element-wise; the sum is NaN wherever GDP is missing)
macro_df['GDP']+macro_df['UNRATE'] # bracket, string var name

DATE
2017-01-01    19158.612
2017-02-01          NaN
2017-03-01          NaN
2017-04-01    19327.320
2017-05-01          NaN
                ...    
2020-09-01          NaN
2020-10-01    21484.497
2020-11-01          NaN
2020-12-01          NaN
2021-01-01    22044.626
Freq: MS, Length: 49, dtype: float64

In [13]:
# grab by position with .iloc[rows, cols]: every second row starting at
# position 1, and the column at position 2 (UNRATE)
macro_df.iloc[1::2,2]

DATE
2017-02-01     4.6
2017-04-01     4.4
2017-06-01     4.3
2017-08-01     4.4
2017-10-01     4.2
              ... 
2020-04-01    14.7
2020-06-01    11.0
2020-08-01     8.4
2020-10-01     6.9
2020-12-01     6.7
Freq: 2MS, Name: UNRATE, Length: 24, dtype: float64

In [14]:
# grab two (or more) variables
macro_df[  ['GDP','UNRATE']     ] # TWO bracket (list of var names)

Unnamed: 0_level_0,GDP,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01,19153.912,4.7
2017-02-01,,4.6
2017-03-01,,4.4
2017-04-01,19322.920,4.4
2017-05-01,,4.4
...,...,...
2020-09-01,,7.9
2020-10-01,21477.597,6.9
2020-11-01,,6.7
2020-12-01,,6.7


In [15]:
# see column names
macro_df.columns

Index(['GDP', 'CPIAUCSL', 'UNRATE'], dtype='object')

In [16]:
# change column names
macro_df.columns = ['G','C','U']
macro_df.columns

Index(['G', 'C', 'U'], dtype='object')

In [17]:
# lets change the column names back
macro_df.columns = ['GDP', 'CPIAUCSL', 'UNRATE']

In [18]:
# see index
macro_df.index

DatetimeIndex(['2017-01-01', '2017-02-01', '2017-03-01', '2017-04-01',
               '2017-05-01', '2017-06-01', '2017-07-01', '2017-08-01',
               '2017-09-01', '2017-10-01', '2017-11-01', '2017-12-01',
               '2018-01-01', '2018-02-01', '2018-03-01', '2018-04-01',
               '2018-05-01', '2018-06-01', '2018-07-01', '2018-08-01',
               '2018-09-01', '2018-10-01', '2018-11-01', '2018-12-01',
               '2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01',
               '2019-05-01', '2019-06-01', '2019-07-01', '2019-08-01',
               '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
               '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01',
               '2020-05-01', '2020-06-01', '2020-07-01', '2020-08-01',
               '2020-09-01', '2020-10-01', '2020-11-01', '2020-12-01',
               '2021-01-01'],
              dtype='datetime64[ns]', name='DATE', freq='MS')

In [19]:
# reset_index() and set_index()
macro_df.reset_index()  # turns index into var

Unnamed: 0,DATE,GDP,CPIAUCSL,UNRATE
0,2017-01-01,19153.912,243.620,4.7
1,2017-02-01,,243.872,4.6
2,2017-03-01,,243.766,4.4
3,2017-04-01,19322.920,244.274,4.4
4,2017-05-01,,244.069,4.4
...,...,...,...,...
44,2020-09-01,,260.149,7.9
45,2020-10-01,21477.597,260.462,6.9
46,2020-11-01,,260.927,6.7
47,2020-12-01,,261.560,6.7


In [20]:
macro_df.reset_index().set_index('DATE') # turn var into index

Unnamed: 0_level_0,GDP,CPIAUCSL,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,19153.912,243.620,4.7
2017-02-01,,243.872,4.6
2017-03-01,,243.766,4.4
2017-04-01,19322.920,244.274,4.4
2017-05-01,,244.069,4.4
...,...,...,...
2020-09-01,,260.149,7.9
2020-10-01,21477.597,260.462,6.9
2020-11-01,,260.927,6.7
2020-12-01,,261.560,6.7


In [21]:
# grab some rows (by position): positions 4 through 7 (the end of a slice is excluded)
# macro_df.iloc[4:8] is the explicit way to write the same selection
macro_df[4:8]

Unnamed: 0_level_0,GDP,CPIAUCSL,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-05-01,,244.069,4.4
2017-06-01,,244.218,4.3
2017-07-01,19558.693,244.28,4.3
2017-08-01,,245.205,4.4


In [22]:
# grab some rows (by value): build a boolean series, then use it as a mask
# so that only the rows where the test is True are kept
mask = macro_df['UNRATE'] > 6
macro_df[mask]

Unnamed: 0_level_0,GDP,CPIAUCSL,UNRATE
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-04-01,19477.444,256.192,14.7
2020-05-01,,255.942,13.2
2020-06-01,,257.282,11.0
2020-07-01,21138.574,258.604,10.2
2020-08-01,,259.511,8.4
2020-09-01,,260.149,7.9
2020-10-01,21477.597,260.462,6.9
2020-11-01,,260.927,6.7
2020-12-01,,261.56,6.7
2021-01-01,22038.226,262.231,6.4


In [24]:
# create a variable (2 ways)
# way 1: bracket assignment creates the var AND NAMES IT, changing macro_df itself
# way 2: .assign() returns a NEW dataframe with the extra column; on its own
#        line (as below) that result is not stored anywhere
macro_df['HIGH'] = macro_df['UNRATE'] > 6
macro_df.assign(low = macro_df['UNRATE'] < 4)
macro_df 

# notice that "low" isn't in the df!
# (only the last expression of a cell is displayed, so run the lines above
# on their own to see this)
# see 3.2.4 for why
# to permanently save it, add 'macro_df = ' to the start of the line:
macro_df = macro_df.assign(low = macro_df['UNRATE'] < 4)
macro_df

Unnamed: 0_level_0,GDP,CPIAUCSL,UNRATE,HIGH,low
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01,19153.912,243.620,4.7,False,False
2017-02-01,,243.872,4.6,False,False
2017-03-01,,243.766,4.4,False,False
2017-04-01,19322.920,244.274,4.4,False,False
2017-05-01,,244.069,4.4,False,False
...,...,...,...,...,...
2020-09-01,,260.149,7.9,True,False
2020-10-01,21477.597,260.462,6.9,True,False
2020-11-01,,260.927,6.7,True,False
2020-12-01,,261.560,6.7,True,False


## EDA

Stop here. Back to the lecture. 

### Part 1

Q0: Do each of the [EDA golden rules for initial data exploration](https://ledatascifi.github.io/ledatascifi-2022/content/03/02e_eda_golden.html) and write down your observations.
- **Important: What is the "key" or "unit level" that this database describes?** 
    - [This is discussed in 3.2.2.2 with examples](https://ledatascifi.github.io/ledatascifi-2022/content/03/02b_pandasVocab.html#the-shape-of-data)
    - The "key" levels in databases we will look at are often increments of time in the data, the type of entity described (e.g. firm, person, state, country, industry), and combinations of entity and time. 
    - E.g. "firm" level, "firm-year" level

Q1: What is the second series above?

Q2: What is the frequency of the series?

Q3: What is the average ANNUAL GDP, based on the data?

### Some answers
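- Q0: What have we learned about the data? Anything to keep track of? (GDP is quarterly, the others are monthly, so GDP is missing in two out of every three rows)
- Q1: The consumer price index (CPI), the price level used to measure inflation
- Q2: Monthly, but GDP is only quarterly
- Q3: About 20,630 (in billions of dollars, i.e. roughly $20.6 trillion)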

In [None]:
# do your work below this function

def insufficient_but_starting_eda(df,cat_vars_list=None):
    '''
    Print a MINIMUM amount of info about a new dataframe.
    
    Parameters
    ----------
    df : DATAFRAME
    cat_vars_list : LIST, optional
        A list of strings containing variable names in the dataframe
        for variables where you want to see the number of unique values
        and the 10 most common values. Likely used for categorical values.
        A single string is treated as a list holding that one name.

    Returns
    -------
    None. It simply prints.
    
    Description
    -------    
    Prints, in order: the first rows, the last rows, the column names,
    the shape, the info() report (dtypes and non-null counts), and the
    describe() summary stats. Then, for each variable in cat_vars_list,
    the number of unique values and its 10 most common values.
    
    You should ****look**** at all this output below and consider the data
    exploration and cleaning questions from 
    https://ledatascifi.github.io/ledatascifi-2021/content/03/02e_eda_golden.html#member
    
    Also LOOK at more of the data manually. 
    
    Then write up anything notable you observe.
    
    TIP: put this function in your codebook to reuse easily.
    
    PROTIP: Improve this function (better outputs, better formatting).
    
    FEATURE REQUEST: optionally print the nunique and top 10 values under the describe matrix
    
    FEATURE REQUEST: optionally print more stats (percentiles)
    
    '''
    print(df.head(),  '\n---')
    print(df.tail(),  '\n---')
    print(df.columns, '\n---')
    print("The shape is: ",df.shape, '\n---')
    # df.info() prints its own report and returns None, so it must not be
    # nested inside print(): that put the report BEFORE the "Info:" label
    # and then printed a stray "Info: None".
    print("Info:")
    df.info() # memory usage, name, dtype, and # of non-null obs (--> # of missing obs) per variable
    print('---')
    print(df.describe(), '\n---') # summary stats, and you can customize the list!
    if cat_vars_list is not None:
        # a bare string would otherwise be looped over one character at a time
        if isinstance(cat_vars_list, str):
            cat_vars_list = [cat_vars_list]
        for var in cat_vars_list:
            print(var,"has",df[var].nunique(),"values and its top 10 most common are:")
            print(df[var].value_counts().head(10), '\n---')
        
insufficient_but_starting_eda(macro_df,['UNRATE'])  

In [None]:
insufficient_but_starting_eda(macro_df,['UNRATE'])  

## Part 2

Q4: Download the annual *real* GDP from 1960 to 2018 from FRED and compute the average annual percent change.