In [41]:
import pandas as pd
import numpy as np

### 1. Appending new rows to DataFrames

In [42]:
names = pd.read_csv('data/names.csv')
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2


In [43]:
new_data_list = ['Aria', 1]
new_data_list

['Aria', 1]

In [44]:
names.loc[4] = new_data_list
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


In [45]:
names.loc['five'] = ['Zach', 3]
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3


In [46]:
names.loc[len(names)] = {'Name':'Zayd', 'Age':2}
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2


In [47]:
names.loc[len(names)] = pd.Series({'Age':32,
 'Name':'Dean'})
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2
7,Dean,32


In [48]:
names = pd.read_csv('data/names.csv')
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2


In [49]:
# names.append({'Name':'Aria', 'Age':1}) throws error
names.append({'Name':'Aria', 'Age':1}, ignore_index = True)

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


In [50]:
names.append({'Name':'Aria', 'Age':100}, ignore_index = True)

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,100


In [51]:
s = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))

In [52]:
s

Name    Zach
Age        3
Name: 4, dtype: object

In [53]:
pd.DataFrame(s)

Unnamed: 0,4
Name,Zach
Age,3


In [54]:
s = pd.Series({'Name': 'Zach', 'Age': 3}, name='Apple')

In [55]:
s

Name    Zach
Age        3
Name: Apple, dtype: object

In [56]:
pd.DataFrame(s)

Unnamed: 0,Apple
Name,Zach
Age,3


* Conclusion:\
* Append is better than inserting with .iloc an d.loc methods \
* .loc will replace an existing index if the new one is same \
* .append will never modify the existing dataframe

In [57]:
bball_16 = pd.read_csv('data/baseball16.csv')
bball_16.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,altuvjo01,2016,1,HOU,AL,161,640,108,216,42,...,96.0,30.0,10.0,60,70.0,11.0,7.0,3.0,7.0,15.0
1,bregmal01,2016,1,HOU,AL,49,201,31,53,13,...,34.0,2.0,0.0,15,52.0,0.0,0.0,0.0,1.0,1.0
2,castrja01,2016,1,HOU,AL,113,329,41,69,16,...,32.0,2.0,1.0,45,123.0,0.0,1.0,1.0,0.0,9.0
3,correca01,2016,1,HOU,AL,153,577,76,158,36,...,96.0,13.0,3.0,75,139.0,5.0,5.0,0.0,3.0,12.0
4,gattiev01,2016,1,HOU,AL,128,447,58,112,19,...,72.0,2.0,1.0,43,127.0,6.0,4.0,0.0,5.0,12.0


In [58]:
import pprint
data_dict = bball_16.iloc[0].to_dict()
pprint.pprint(data_dict)

{'2B': 42,
 '3B': 5,
 'AB': 640,
 'BB': 60,
 'CS': 10.0,
 'G': 161,
 'GIDP': 15.0,
 'H': 216,
 'HBP': 7.0,
 'HR': 24,
 'IBB': 11.0,
 'R': 108,
 'RBI': 96.0,
 'SB': 30.0,
 'SF': 7.0,
 'SH': 3.0,
 'SO': 70.0,
 'lgID': 'AL',
 'playerID': 'altuvjo01',
 'stint': 1,
 'teamID': 'HOU',
 'yearID': 2016}


In [59]:
new_data_dict = {k: '' if isinstance(v, str) else
 np.nan for k, v in data_dict.items()}
pprint.pprint(new_data_dict)

{'2B': nan,
 '3B': nan,
 'AB': nan,
 'BB': nan,
 'CS': nan,
 'G': nan,
 'GIDP': nan,
 'H': nan,
 'HBP': nan,
 'HR': nan,
 'IBB': nan,
 'R': nan,
 'RBI': nan,
 'SB': nan,
 'SF': nan,
 'SH': nan,
 'SO': nan,
 'lgID': '',
 'playerID': '',
 'stint': nan,
 'teamID': '',
 'yearID': nan}


In [60]:
# Append is fatser than adding data using a for loop

In [61]:
random_data = []
for i in range(1000):
    d = dict()
    for k, v in data_dict.items():
        if isinstance(v, str):
            d[k] = np.random.choice(list('abcde'))
        else:
            d[k] = np.random.randint(10)
    random_data.append(pd.Series(d, name=i + len(bball_16)))

In [62]:
random_data[0].head()

playerID    c
yearID      9
stint       7
teamID      c
lgID        d
Name: 16, dtype: object

In [63]:
%%timeit
bball_16_copy = bball_16.copy()
for row in random_data:
    bball_16_copy = bball_16_copy.append(row)

4.72 s ± 47.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [64]:
%%timeit
bball_16_copy = bball_16.copy()
bball_16_copy = bball_16_copy.append(random_data)

49.6 ms ± 801 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [65]:
# Pandas internally creates a dataframe for all the series items 
# and then does only one append

### 2. Concatenating multiple DataFrames together

In [66]:
stocks_2016 = pd.read_csv('data/stocks_2016.csv',
 index_col='Symbol')


In [67]:
stocks_2017 = pd.read_csv('data/stocks_2017.csv',
 index_col='Symbol')



In [68]:
s_list = [stocks_2016, stocks_2017]

In [69]:
pd.concat(s_list)

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


In [70]:
pd.concat(s_list, keys=['2016', '2017'],
 names=['Year', 'Symbol'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Shares,Low,High
Year,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,AAPL,80,95,110
2016,TSLA,50,80,130
2016,WMT,40,55,70
2017,AAPL,50,120,140
2017,GE,100,30,40
2017,IBM,87,75,95
2017,SLB,20,55,85
2017,TXN,500,15,23
2017,TSLA,100,100,300


In [71]:
pd.concat(s_list, keys=['2016', '2017'],
 axis='columns', names=['Year', None])

Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
WMT,40.0,55.0,70.0,,,
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TXN,,,,500.0,15.0,23.0


In [72]:
pd.concat(s_list, join='inner', keys=['2016', '2017'],
 axis='columns', names=['Year', None])

Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AAPL,80,95,110,50,120,140
TSLA,50,80,130,100,100,300


In [73]:
stocks_2016.append(stocks_2017)

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


# KEYS and NAMES 

In [74]:
pd.concat(s_list, keys=['2016', '2017'],
 names=['Year', 'Symbol'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Shares,Low,High
Year,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,AAPL,80,95,110
2016,TSLA,50,80,130
2016,WMT,40,55,70
2017,AAPL,50,120,140
2017,GE,100,30,40
2017,IBM,87,75,95
2017,SLB,20,55,85
2017,TXN,500,15,23
2017,TSLA,100,100,300


# default cocnat is outer join

In [75]:
pd.concat(s_list, keys=['2016', '2017'],
 axis='columns', names=['Year', None])

Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
WMT,40.0,55.0,70.0,,,
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TXN,,,,500.0,15.0,23.0


In [76]:
pd.concat(s_list, join='inner', keys=['2016', '2017'],
axis='columns', names=['Year', None])

Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AAPL,80,95,110,50,120,140
TSLA,50,80,130,100,100,300


# append internally calls concat

In [77]:
stocks_2016.append(stocks_2017)

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


### 3. Comparing President Trump's and Obama's approval ratings

In [79]:
# base_url = 'http://www.presidency.ucsb.edu/data/popularity.php?pres={}'
# trump_url = base_url.format(45)
# df_list = pd.read_html(base_url)
# len(df_list)

In [80]:
base_url = 'https://docs.google.com/spreadsheets/d/1iEl565M1mICTubTtoxXMdxzaHzAcPTnb3kpRndsrfyY/edit?ts=5bd7f609#gid=671375968'

In [81]:
df_list = pd.read_html(base_url, header=1)

In [82]:
len(df_list)

1

In [83]:
type(df_list)

list

In [84]:
type(df_list[0])

pandas.core.frame.DataFrame

In [85]:
df_list[0].columns

Index(['1', 'Start Date', 'End Date', 'Approving', 'Disapproving',
       'Unsure/NoData', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17'],
      dtype='object')

In [86]:
df_list[0].index

RangeIndex(start=0, stop=99, step=1)

In [87]:
df_list[0].shape

(99, 18)

In [88]:
 df0 = df_list[0]

In [89]:
df0.shape

(99, 18)

In [90]:
df0.head(7)

Unnamed: 0,1,Start Date,End Date,Approving,Disapproving,Unsure/NoData,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,2,1/16/17,1/19/2017,59,37,4,,,,,,,,,,,,
1,3,1/9/17,1/15/2017,57,39,4,,,,,,,,,,,,
2,4,1/2/17,1/8/2017,55,42,3,,,,,,,,,,,,
3,5,12/26/16,1/1/2017,55,40,5,,,,,,,,,,,,
4,6,12/19/16,12/25/2016,56,40,4,,,,,,,,,,,,
5,7,12/12/16,12/18/2016,56,40,4,,,,,,,,,,,,
6,8,12/5/16,12/11/2016,57,40,3,,,,,,,,,,,,


In [91]:
# df_list = pd.read_html(trump_url, match='Start Date')
# #dataset issue

### 4. Understanding the differences between concat, join, and merge

In [92]:
from IPython.display import display_html
years = 2016, 2017, 2018
stock_tables = [pd.read_csv('data/stocks_{}.csv'.format(year), index_col='Symbol') for year in years]
def display_frames(frames, num_spaces=0):
    t_style = '<table style="display: inline;"'
    tables_html = [df.to_html().replace('<table', t_style) for df in frames]
    space = '&nbsp;' * num_spaces
    display_html(space.join(tables_html), raw=True)
display_frames(stock_tables, 30)

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,40,135,170
AMZN,8,900,1125
TSLA,50,220,400


In [93]:
stocks_2016, stocks_2017, stocks_2018 = stock_tables

In [94]:
stock_tables

[        Shares  Low  High
 Symbol                   
 AAPL        80   95   110
 TSLA        50   80   130
 WMT         40   55    70,
         Shares  Low  High
 Symbol                   
 AAPL        50  120   140
 GE         100   30    40
 IBM         87   75    95
 SLB         20   55    85
 TXN        500   15    23
 TSLA       100  100   300,
         Shares  Low  High
 Symbol                   
 AAPL        40  135   170
 AMZN         8  900  1125
 TSLA        50  220   400]

In [95]:
stocks_2016

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70


In [96]:
stocks_2017

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


In [97]:
stocks_2018

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,40,135,170
AMZN,8,900,1125
TSLA,50,220,400


In [98]:
pd.concat(stock_tables, keys=[2016, 2017, 2018])

Unnamed: 0_level_0,Unnamed: 1_level_0,Shares,Low,High
Unnamed: 0_level_1,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,AAPL,80,95,110
2016,TSLA,50,80,130
2016,WMT,40,55,70
2017,AAPL,50,120,140
2017,GE,100,30,40
2017,IBM,87,75,95
2017,SLB,20,55,85
2017,TXN,500,15,23
2017,TSLA,100,100,300
2018,AAPL,40,135,170


In [99]:
pd.concat(dict(zip(years,stock_tables)), axis='columns')

Unnamed: 0_level_0,2016,2016,2016,2017,2017,2017,2018,2018,2018
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High,Shares,Low,High
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
WMT,40.0,55.0,70.0,,,,,,
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TXN,,,,500.0,15.0,23.0,,,
AMZN,,,,,,,8.0,900.0,1125.0


In [100]:
stocks_2016.join(stocks_2017, lsuffix='_2016',
rsuffix='_2017', how='outer')

Unnamed: 0_level_0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
TXN,,,,500.0,15.0,23.0
WMT,40.0,55.0,70.0,,,


In [101]:
other = [stocks_2017.add_suffix('_2017'),
 stocks_2018.add_suffix('_2018')]

In [102]:
stocks_2016.add_suffix('_2016').join(other, how='outer')

Unnamed: 0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017,Shares_2018,Low_2018,High_2018
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
WMT,40.0,55.0,70.0,,,,,,
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TXN,,,,500.0,15.0,23.0,,,
AMZN,,,,,,,8.0,900.0,1125.0


In [142]:
stock_join = stocks_2016.add_suffix('_2016').join(other,
 how='outer')

In [143]:
stock_join

Unnamed: 0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017,Shares_2018,Low_2018,High_2018
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
WMT,40.0,55.0,70.0,,,,,,
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TXN,,,,500.0,15.0,23.0,,,
AMZN,,,,,,,8.0,900.0,1125.0


In [104]:
stock_concat = pd.concat(dict(zip(years,stock_tables)),
 axis='columns')

In [106]:
stock_concat

Unnamed: 0_level_0,2016,2016,2016,2017,2017,2017,2018,2018,2018
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High,Shares,Low,High
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
WMT,40.0,55.0,70.0,,,,,,
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TXN,,,,500.0,15.0,23.0,,,
AMZN,,,,,,,8.0,900.0,1125.0


In [107]:
level_1 = stock_concat.columns.get_level_values(1)

In [108]:
level_1

Index(['Shares', 'Low', 'High', 'Shares', 'Low', 'High', 'Shares', 'Low',
       'High'],
      dtype='object')

In [109]:
level_0 = stock_concat.columns.get_level_values(0).astype(str)
level_0

Index(['2016', '2016', '2016', '2017', '2017', '2017', '2018', '2018', '2018'], dtype='object')

In [110]:
stock_concat.columns = level_1 + '_' + level_0

In [112]:
stock_join.equals(stock_concat)

True

In [113]:
stocks_2016.merge(stocks_2017, left_index=True,
 right_index=True)

Unnamed: 0_level_0,Shares_x,Low_x,High_x,Shares_y,Low_y,High_y
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,80,95,110,50,120,140
TSLA,50,80,130,100,100,300


In [115]:
step1 = stocks_2016.merge(stocks_2017, left_index=True,
 right_index=True, how='outer',
 suffixes=('_2016', '_2017'))
step1

Unnamed: 0_level_0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
TXN,,,,500.0,15.0,23.0
WMT,40.0,55.0,70.0,,,


In [116]:
stock_merge = step1.merge(stocks_2018.add_suffix('_2018'),
 left_index=True, right_index=True,
 how='outer')
stock_merge

Unnamed: 0_level_0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017,Shares_2018,Low_2018,High_2018
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
AMZN,,,,,,,8.0,900.0,1125.0
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
TXN,,,,500.0,15.0,23.0,,,
WMT,40.0,55.0,70.0,,,,,,


In [117]:
stock_concat.equals(stock_merge)

False

In [119]:
names = ['prices', 'transactions']
food_tables = [pd.read_csv('data/food_{}.csv'.format(name)) for name in names]
food_prices, food_transactions = food_tables

In [123]:
food_prices

Unnamed: 0,item,store,price,Date
0,pear,A,0.99,2017
1,pear,B,1.99,2017
2,peach,A,2.99,2017
3,peach,B,3.49,2017
4,banana,A,0.39,2017
5,banana,B,0.49,2017
6,steak,A,5.99,2017
7,steak,B,6.99,2017
8,steak,B,4.99,2015


In [124]:
food_transactions

Unnamed: 0,custid,item,store,quantity
0,1,pear,A,5
1,1,banana,A,10
2,2,steak,B,3
3,2,pear,B,1
4,2,peach,B,2
5,2,steak,B,1
6,2,coconut,B,4


In [125]:
display_frames(food_tables, 30)

Unnamed: 0,item,store,price,Date
0,pear,A,0.99,2017
1,pear,B,1.99,2017
2,peach,A,2.99,2017
3,peach,B,3.49,2017
4,banana,A,0.39,2017
5,banana,B,0.49,2017
6,steak,A,5.99,2017
7,steak,B,6.99,2017
8,steak,B,4.99,2015

Unnamed: 0,custid,item,store,quantity
0,1,pear,A,5
1,1,banana,A,10
2,2,steak,B,3
3,2,pear,B,1
4,2,peach,B,2
5,2,steak,B,1
6,2,coconut,B,4


In [126]:
food_transactions.merge(food_prices, on=['item', 'store'])

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017
1,1,banana,A,10,0.39,2017
2,2,steak,B,3,6.99,2017
3,2,steak,B,3,4.99,2015
4,2,steak,B,1,6.99,2017
5,2,steak,B,1,4.99,2015
6,2,pear,B,1,1.99,2017
7,2,peach,B,2,3.49,2017


In [127]:
food_transactions.merge(food_prices.query('Date == 2017'),
 how='left')

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017.0
1,1,banana,A,10,0.39,2017.0
2,2,steak,B,3,6.99,2017.0
3,2,pear,B,1,1.99,2017.0
4,2,peach,B,2,3.49,2017.0
5,2,steak,B,1,6.99,2017.0
6,2,coconut,B,4,,


In [130]:
food_prices_join = food_prices.query('Date == 2017') \
 .set_index(['item', 'store'])

In [131]:
food_prices_join

Unnamed: 0_level_0,Unnamed: 1_level_0,price,Date
item,store,Unnamed: 2_level_1,Unnamed: 3_level_1
pear,A,0.99,2017
pear,B,1.99,2017
peach,A,2.99,2017
peach,B,3.49,2017
banana,A,0.39,2017
banana,B,0.49,2017
steak,A,5.99,2017
steak,B,6.99,2017


In [132]:
food_prices_join

Unnamed: 0_level_0,Unnamed: 1_level_0,price,Date
item,store,Unnamed: 2_level_1,Unnamed: 3_level_1
pear,A,0.99,2017
pear,B,1.99,2017
peach,A,2.99,2017
peach,B,3.49,2017
banana,A,0.39,2017
banana,B,0.49,2017
steak,A,5.99,2017
steak,B,6.99,2017


In [133]:
food_transactions.join(food_prices_join, on=['item', 'store'])

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017.0
1,1,banana,A,10,0.39,2017.0
2,2,steak,B,3,6.99,2017.0
3,2,pear,B,1,1.99,2017.0
4,2,peach,B,2,3.49,2017.0
5,2,steak,B,1,6.99,2017.0
6,2,coconut,B,4,,


In [135]:
# pd.concat([food_transactions.set_index(['item', 'store']),
# food_prices.set_index(['item', 'store'])],
# axis='columns')

In [136]:
import glob
df_list = []
for filename in glob.glob('data/gas prices/*.csv'):
    df_list.append(pd.read_csv(filename, index_col='Week',
    parse_dates=['Week']))

In [138]:
gas = pd.concat(df_list, axis='columns')

In [141]:
gas.head()

Unnamed: 0_level_0,All Grades,Diesel,Midgrade,Premium,Regular
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-25,2.701,2.788,2.859,3.105,2.583
2017-09-18,2.75,2.791,2.906,3.151,2.634
2017-09-11,2.8,2.802,2.953,3.197,2.685
2017-09-04,2.794,2.758,2.946,3.191,2.679
2017-08-28,2.513,2.605,2.668,2.901,2.399
