# Pandas: the Python structured data library

Pandas (allegedly) stands for **Pan**el **da**ta (**s**?) and lets you manipulate 'spreadsheet-like' data in Python easily

In [1]:
!pip install pandas

Looking in links: /home/rick446/src/wheelhouse
You should consider upgrading via the '/home/rick446/.virtualenvs/classes/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd

## Series: kind of like a `list` and `dict` put together

In [3]:
s = pd.Series([1,2,3])
s

0    1
1    2
2    3
dtype: int64

In [4]:
import numpy as np

s = pd.Series([1,2,3], dtype=np.int8)
s

0    1
1    2
2    3
dtype: int8

In [5]:
s[1] = 3.14
s

0    1
1    3
2    3
dtype: int8

In [6]:
'a b c'.split()

['a', 'b', 'c']

In [7]:
s = pd.Series([1,2,3], index='a b c'.split())
s

a    1
b    2
c    3
dtype: int64

In [8]:
s[0]

1

In [9]:
s['a']

1

## DataFrame -- the main data type



In [10]:
df = pd.DataFrame(
    [
        [1,2,3],
        [4,5,6],
        [7,8,9],
        [7,8,9],
    ],
    columns='a b c'.split(),
    index='x y z w'.split()
)
df

Unnamed: 0,a,b,c
x,1,2,3
y,4,5,6
z,7,8,9
w,7,8,9


In [11]:
df.columns

Index(['a', 'b', 'c'], dtype='object')

In [12]:
df.index

Index(['x', 'y', 'z', 'w'], dtype='object')

In [13]:
df['a']

x    1
y    4
z    7
w    7
Name: a, dtype: int64

In [14]:
df.a

x    1
y    4
z    7
w    7
Name: a, dtype: int64

Multiple columns

In [15]:
cola = pd.Series([1, 4, 7], name='ca')
colb = pd.Series([2, 5, 8], name='cb')
colc = pd.Series([3, 6, 9], name='cc', dtype=np.float32)
pd.DataFrame({'a': cola, 'b': colb, 'c': colc})

Unnamed: 0,a,b,c
0,1,2,3.0
1,4,5,6.0
2,7,8,9.0


In [16]:
df

Unnamed: 0,a,b,c
x,1,2,3
y,4,5,6
z,7,8,9
w,7,8,9


In [17]:
cols = ['a', 'b']
df_test = df[cols]
df_test

Unnamed: 0,a,b
x,1,2
y,4,5
z,7,8
w,7,8


In [18]:
cols = ['a']
df_test = df[cols]
df_test

Unnamed: 0,a
x,1
y,4
z,7
w,7


In [19]:
df_test = df[['a']]
df_test

Unnamed: 0,a
x,1
y,4
z,7
w,7


In [20]:
df_test = df['a']
df_test

x    1
y    4
z    7
w    7
Name: a, dtype: int64

Indexing ambiguity

In [21]:
s = pd.Series([1,2,3], index=[1,2,3])
s

1    1
2    2
3    3
dtype: int64

In [22]:
s[1]  # label/index value

1

In [23]:
s[1:3]  # position/offset

2    2
3    3
dtype: int64

# Indexing using .loc, .iloc

In [24]:
s.loc[1]

1

In [25]:
s.iloc[1]

2

In [26]:
df

Unnamed: 0,a,b,c
x,1,2,3
y,4,5,6
z,7,8,9
w,7,8,9


In [27]:
df['a']

x    1
y    4
z    7
w    7
Name: a, dtype: int64

In [28]:
df.loc['x']

a    1
b    2
c    3
Name: x, dtype: int64

In [29]:
df.iloc[0]

a    1
b    2
c    3
Name: x, dtype: int64

In [30]:
df.loc['x', 'a']

1

In [31]:
df.loc['x', :]  # retrieve all columns

a    1
b    2
c    3
Name: x, dtype: int64

In [32]:
df.loc[:, 'a']  # retrieve all rows

x    1
y    4
z    7
w    7
Name: a, dtype: int64

In [33]:
df.loc['x':'y']   # includes both endpoints (df.loc[x] and df.loc[y])

Unnamed: 0,a,b,c
x,1,2,3
y,4,5,6


In [34]:
df.iloc[0:2]     # excludes the right endpoint (df.iloc[2])

Unnamed: 0,a,b,c
x,1,2,3
y,4,5,6


In [39]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, x to w
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       4 non-null      int64
 1   b       4 non-null      int64
 2   c       4 non-null      int64
dtypes: int64(3)
memory usage: 488.0 bytes


In [40]:
import sys
sys.getsizeof(5)

28

In [41]:
df.loc['x', 'b'] = 200

In [42]:
df

Unnamed: 0,a,b,c
x,1,200,3
y,4,5,6
z,7,8,9
w,7,8,9


In [43]:
df['a']

x    1
y    4
z    7
w    7
Name: a, dtype: int64

In [44]:
df2 = df[['a']]
df2

Unnamed: 0,a
x,1
y,4
z,7
w,7


In [45]:
df2.shape

(4, 1)

In [46]:
df['a']

x    1
y    4
z    7
w    7
Name: a, dtype: int64

In [47]:
df['a'].shape

(4,)

# Reading CSV data

Most of the time, we *won't* be building `DataFrame`s out of the basic constructor, but rather using one of the readers built in to Pandas. One of these is `read_csv`:

In [51]:
df = pd.read_csv('./data/closing-prices.csv')
df.head() # Only show the first few rows  aka df.iloc[:5]

Unnamed: 0.1,Unnamed: 0,F,TSLA,GOOG,IBM,AAPL
0,2014-01-02,12.089,150.1,,157.6001,72.7741
1,2014-01-03,12.1438,149.56,,158.543,71.1756
2,2014-01-06,12.1986,147.0,,157.9993,71.5637
3,2014-01-07,12.042,149.36,,161.1508,71.0516
4,2014-01-08,12.1673,151.28,,159.6728,71.5019


In [52]:
!head data/closing-prices.csv

,F,TSLA,GOOG,IBM,AAPL
2014-01-02,12.089,150.1,,157.6001,72.7741
2014-01-03,12.1438,149.56,,158.543,71.1756
2014-01-06,12.1986,147.0,,157.9993,71.5637
2014-01-07,12.042,149.36,,161.1508,71.0516
2014-01-08,12.1673,151.28,,159.6728,71.5019
2014-01-09,12.4022,147.53,,159.1716,70.5887
2014-01-10,12.5822,145.7199,,159.0696,70.1178
2014-01-13,12.6136,139.34,,156.4363,70.4849
2014-01-14,12.8406,161.27,,157.9314,71.8874


The CSV reader is pretty good about inferring types, but not perfect. We can check lots of things about the structure of a `DataFrame` with the `.info()` method:

In [53]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1007 non-null   object 
 1   F           1007 non-null   float64
 2   TSLA        1007 non-null   float64
 3   GOOG        949 non-null    float64
 4   IBM         1007 non-null   float64
 5   AAPL        1007 non-null   float64
dtypes: float64(5), object(1)
memory usage: 105.3 KB


In [54]:
ls -l data/closing-prices.csv

-rw-r--r-- 1 rick446 rick446 50156 Sep 11  2020 data/closing-prices.csv


In [55]:
float('nan')

nan

In [59]:
np.nan == np.nan

False

In [57]:
np.nan is np.nan

True

In [60]:
!ls -lh ./data/closing-prices.csv

-rw-r--r-- 1 rick446 rick446 49K Sep 11  2020 ./data/closing-prices.csv


In [61]:
import csv
with open('./data/closing-prices.csv') as f:
    rows = list(csv.reader(f))

In [62]:
len(rows)

1008

In [63]:
rows[:5]

[['', 'F', 'TSLA', 'GOOG', 'IBM', 'AAPL'],
 ['2014-01-02', '12.089', '150.1', '', '157.6001', '72.7741'],
 ['2014-01-03', '12.1438', '149.56', '', '158.543', '71.1756'],
 ['2014-01-06', '12.1986', '147.0', '', '157.9993', '71.5637'],
 ['2014-01-07', '12.042', '149.36', '', '161.1508', '71.0516']]

In [64]:
!pip install pympler

Looking in links: /home/rick446/src/wheelhouse
You should consider upgrading via the '/home/rick446/.virtualenvs/classes/bin/python -m pip install --upgrade pip' command.[0m


In [65]:
import pympler

In [66]:
import pympler.asizeof

In [67]:
pympler.asizeof.asizeof(rows)

485176

The first column was read in as an `object` (meaning Pandas couldn't be more specific about its type, usually what happens with string data). Let's tell Pandas that column is a date:

In [68]:
pd.to_datetime(df['Unnamed: 0'])

0      2014-01-02
1      2014-01-03
2      2014-01-06
3      2014-01-07
4      2014-01-08
          ...    
1002   2017-12-22
1003   2017-12-26
1004   2017-12-27
1005   2017-12-28
1006   2017-12-29
Name: Unnamed: 0, Length: 1007, dtype: datetime64[ns]

In [69]:
df['Unnamed: 0'] = pd.to_datetime(df['Unnamed: 0'])
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  1007 non-null   datetime64[ns]
 1   F           1007 non-null   float64       
 2   TSLA        1007 non-null   float64       
 3   GOOG        949 non-null    float64       
 4   IBM         1007 non-null   float64       
 5   AAPL        1007 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 47.3 KB


We can also parse datetimes during the import:

In [70]:
df = pd.read_csv('./data/closing-prices.csv', parse_dates=[0])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  1007 non-null   datetime64[ns]
 1   F           1007 non-null   float64       
 2   TSLA        1007 non-null   float64       
 3   GOOG        949 non-null    float64       
 4   IBM         1007 non-null   float64       
 5   AAPL        1007 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 47.3 KB


In [71]:
ls -lh ./data/closing-prices.csv

-rw-r--r-- 1 rick446 rick446 49K Sep 11  2020 ./data/closing-prices.csv


In [72]:
df.iloc[:5]

Unnamed: 0.1,Unnamed: 0,F,TSLA,GOOG,IBM,AAPL
0,2014-01-02,12.089,150.1,,157.6001,72.7741
1,2014-01-03,12.1438,149.56,,158.543,71.1756
2,2014-01-06,12.1986,147.0,,157.9993,71.5637
3,2014-01-07,12.042,149.36,,161.1508,71.0516
4,2014-01-08,12.1673,151.28,,159.6728,71.5019


In [73]:
df.tail()

Unnamed: 0.1,Unnamed: 0,F,TSLA,GOOG,IBM,AAPL
1002,2017-12-22,11.9489,325.2,1060.12,147.7588,173.023
1003,2017-12-26,11.9679,317.29,1056.74,148.0786,168.6334
1004,2017-12-27,11.8729,311.64,1049.37,148.3693,168.663
1005,2017-12-28,11.9489,315.36,1048.14,149.251,169.1376
1006,2017-12-29,11.8634,311.35,1046.4,148.6502,167.3086


We can set the index of the dataframe as well:

In [78]:
df = df.set_index('Unnamed: 0')  # also df.set_index('Unnamed: 0', inplace=True)
df.head()

Unnamed: 0_level_0,F,TSLA,GOOG,IBM,AAPL
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-02,12.089,150.1,,157.6001,72.7741
2014-01-03,12.1438,149.56,,158.543,71.1756
2014-01-06,12.1986,147.0,,157.9993,71.5637
2014-01-07,12.042,149.36,,161.1508,71.0516
2014-01-08,12.1673,151.28,,159.6728,71.5019


In [79]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1007 entries, 2014-01-02 to 2017-12-29
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   F       1007 non-null   float64
 1   TSLA    1007 non-null   float64
 2   GOOG    949 non-null    float64
 3   IBM     1007 non-null   float64
 4   AAPL    1007 non-null   float64
dtypes: float64(5)
memory usage: 47.2 KB


Its even better if we do it when we read in the frame:

In [80]:
df = pd.read_csv('./data/closing-prices.csv', index_col=0, parse_dates=[0])
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1007 entries, 2014-01-02 to 2017-12-29
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   F       1007 non-null   float64
 1   TSLA    1007 non-null   float64
 2   GOOG    949 non-null    float64
 3   IBM     1007 non-null   float64
 4   AAPL    1007 non-null   float64
dtypes: float64(5)
memory usage: 47.2 KB


In [81]:
df.head()

Unnamed: 0,F,TSLA,GOOG,IBM,AAPL
2014-01-02,12.089,150.1,,157.6001,72.7741
2014-01-03,12.1438,149.56,,158.543,71.1756
2014-01-06,12.1986,147.0,,157.9993,71.5637
2014-01-07,12.042,149.36,,161.1508,71.0516
2014-01-08,12.1673,151.28,,159.6728,71.5019


In [82]:
df.loc['1/3/14']

F        12.1438
TSLA    149.5600
GOOG         NaN
IBM     158.5430
AAPL     71.1756
Name: 2014-01-03 00:00:00, dtype: float64

In [83]:
df.loc['Jan 3 2014']

F        12.1438
TSLA    149.5600
GOOG         NaN
IBM     158.5430
AAPL     71.1756
Name: 2014-01-03 00:00:00, dtype: float64

In [84]:
df.iloc[1]

F        12.1438
TSLA    149.5600
GOOG         NaN
IBM     158.5430
AAPL     71.1756
Name: 2014-01-03 00:00:00, dtype: float64

In [85]:
!cp ./data/closing-prices.csv ./data/closing-prices-2.csv
!gzip -f ./data/closing-prices-2.csv

In [86]:
!ls -lh ./data/closing-prices-2.csv.gz

-rw-r--r-- 1 rick446 rick446 20K Nov  3 10:26 ./data/closing-prices-2.csv.gz


In [87]:
df = pd.read_csv(
    './data/closing-prices-2.csv.gz', 
    index_col=0, 
    parse_dates=[0], 
    dtype=np.float16,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1007 entries, 2014-01-02 to 2017-12-29
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   F       1007 non-null   float16
 1   TSLA    1007 non-null   float16
 2   GOOG    949 non-null    float16
 3   IBM     1007 non-null   float16
 4   AAPL    1007 non-null   float16
dtypes: float16(5)
memory usage: 17.7 KB


(If you install s3fs, you can even read CSVs from s3://BUCKET/KEY/...csv.gz urls!)

## Reading from external APIs

There are some data sources for market data available in the pandas_datareader package:

In [88]:
!pip install -U pandas_datareader

Looking in links: /home/rick446/src/wheelhouse
Requirement already up-to-date: pandas_datareader in /home/rick446/.virtualenvs/classes/lib/python3.8/site-packages (0.10.0)
You should consider upgrading via the '/home/rick446/.virtualenvs/classes/bin/python -m pip install --upgrade pip' command.[0m


In [89]:
from datetime import datetime

import pandas_datareader.data as web

start, end = datetime(2016, 1, 1), datetime(2020, 1, 1)
data = web.DataReader(
    ['F', 'TSLA', 'GOOG', 'IBM', 'AAPL', 'CRM'], 
    'yahoo', start, end,
)
data.head()

Attributes,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Close,Close,Close,Close,...,Open,Open,Open,Open,Volume,Volume,Volume,Volume,Volume,Volume
Symbols,F,TSLA,GOOG,IBM,AAPL,CRM,F,TSLA,GOOG,IBM,...,GOOG,IBM,AAPL,CRM,F,TSLA,GOOG,IBM,AAPL,CRM
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2016-01-04,10.638138,44.681999,741.840027,105.509605,24.286827,76.709999,13.97,44.681999,741.840027,135.949997,...,743.0,135.600006,25.6525,77.139999,38618500.0,34135500.0,3272800.0,5229400.0,270597600.0,4919200.0
2016-01-05,10.447762,44.686001,742.580017,105.431984,23.678223,77.050003,13.72,44.686001,742.580017,135.850006,...,746.450012,136.759995,26.4375,77.07,50267500.0,15934000.0,1950700.0,3924800.0,223164000.0,2656800.0
2016-01-06,9.983247,43.807999,743.619995,104.904243,23.214844,76.290001,13.11,43.807999,743.619995,135.169998,...,730.0,134.380005,25.139999,75.720001,61285500.0,18895500.0,1947000.0,4310900.0,273829600.0,3484400.0
2016-01-07,9.671034,43.130001,726.390015,103.111473,22.235073,74.300003,12.7,43.130001,726.390015,132.860001,...,730.309998,133.699997,24.67,75.129997,57846700.0,17771500.0,2963700.0,7025800.0,324377600.0,6972200.0
2016-01-08,9.549195,42.200001,714.469971,102.156891,22.35265,73.230003,12.54,42.200001,714.469971,131.630005,...,731.450012,133.179993,24.637501,74.779999,46199400.0,18140500.0,2450900.0,4762700.0,283192000.0,3673800.0


In [90]:
data['Close'].head()

Symbols,F,TSLA,GOOG,IBM,AAPL,CRM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04,13.97,44.681999,741.840027,135.949997,26.3375,76.709999
2016-01-05,13.72,44.686001,742.580017,135.850006,25.6775,77.050003
2016-01-06,13.11,43.807999,743.619995,135.169998,25.174999,76.290001
2016-01-07,12.7,43.130001,726.390015,132.860001,24.112499,74.300003
2016-01-08,12.54,42.200001,714.469971,131.630005,24.24,73.230003


In [91]:
data.loc[:, ('Close', "CRM")]

Date
2016-01-04     76.709999
2016-01-05     77.050003
2016-01-06     76.290001
2016-01-07     74.300003
2016-01-08     73.230003
                 ...    
2019-12-24    163.250000
2019-12-26    164.509995
2019-12-27    164.979996
2019-12-30    162.440002
2019-12-31    162.639999
Name: (Close, CRM), Length: 1006, dtype: float64

In [92]:
data.columns

MultiIndex([('Adj Close',    'F'),
            ('Adj Close', 'TSLA'),
            ('Adj Close', 'GOOG'),
            ('Adj Close',  'IBM'),
            ('Adj Close', 'AAPL'),
            ('Adj Close',  'CRM'),
            (    'Close',    'F'),
            (    'Close', 'TSLA'),
            (    'Close', 'GOOG'),
            (    'Close',  'IBM'),
            (    'Close', 'AAPL'),
            (    'Close',  'CRM'),
            (     'High',    'F'),
            (     'High', 'TSLA'),
            (     'High', 'GOOG'),
            (     'High',  'IBM'),
            (     'High', 'AAPL'),
            (     'High',  'CRM'),
            (      'Low',    'F'),
            (      'Low', 'TSLA'),
            (      'Low', 'GOOG'),
            (      'Low',  'IBM'),
            (      'Low', 'AAPL'),
            (      'Low',  'CRM'),
            (     'Open',    'F'),
            (     'Open', 'TSLA'),
            (     'Open', 'GOOG'),
            (     'Open',  'IBM'),
            (     'O

In [93]:
data.columns.levels

FrozenList([['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'], ['F', 'TSLA', 'GOOG', 'IBM', 'AAPL', 'CRM']])

In [94]:
dfs = {
    attr: data[attr]
    for attr in data.columns.levels[0]
}

In [95]:
dfs['Volume'].head()

Symbols,F,TSLA,GOOG,IBM,AAPL,CRM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04,38618500.0,34135500.0,3272800.0,5229400.0,270597600.0,4919200.0
2016-01-05,50267500.0,15934000.0,1950700.0,3924800.0,223164000.0,2656800.0
2016-01-06,61285500.0,18895500.0,1947000.0,4310900.0,273829600.0,3484400.0
2016-01-07,57846700.0,17771500.0,2963700.0,7025800.0,324377600.0,6972200.0
2016-01-08,46199400.0,18140500.0,2450900.0,4762700.0,283192000.0,3673800.0


In [96]:
data.columns = data.columns.swaplevel()
data.head()

Symbols,F,TSLA,GOOG,IBM,AAPL,CRM,F,TSLA,GOOG,IBM,...,GOOG,IBM,AAPL,CRM,F,TSLA,GOOG,IBM,AAPL,CRM
Attributes,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Close,Close,Close,Close,...,Open,Open,Open,Open,Volume,Volume,Volume,Volume,Volume,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2016-01-04,10.638138,44.681999,741.840027,105.509605,24.286827,76.709999,13.97,44.681999,741.840027,135.949997,...,743.0,135.600006,25.6525,77.139999,38618500.0,34135500.0,3272800.0,5229400.0,270597600.0,4919200.0
2016-01-05,10.447762,44.686001,742.580017,105.431984,23.678223,77.050003,13.72,44.686001,742.580017,135.850006,...,746.450012,136.759995,26.4375,77.07,50267500.0,15934000.0,1950700.0,3924800.0,223164000.0,2656800.0
2016-01-06,9.983247,43.807999,743.619995,104.904243,23.214844,76.290001,13.11,43.807999,743.619995,135.169998,...,730.0,134.380005,25.139999,75.720001,61285500.0,18895500.0,1947000.0,4310900.0,273829600.0,3484400.0
2016-01-07,9.671034,43.130001,726.390015,103.111473,22.235073,74.300003,12.7,43.130001,726.390015,132.860001,...,730.309998,133.699997,24.67,75.129997,57846700.0,17771500.0,2963700.0,7025800.0,324377600.0,6972200.0
2016-01-08,9.549195,42.200001,714.469971,102.156891,22.35265,73.230003,12.54,42.200001,714.469971,131.630005,...,731.450012,133.179993,24.637501,74.779999,46199400.0,18140500.0,2450900.0,4762700.0,283192000.0,3673800.0


In [97]:
data['CRM'].head()

Attributes,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04,76.709999,76.709999,77.209999,75.639999,77.139999,4919200.0
2016-01-05,77.050003,77.050003,77.970001,76.669998,77.07,2656800.0
2016-01-06,76.290001,76.290001,77.330002,75.629997,75.720001,3484400.0
2016-01-07,74.300003,74.300003,75.599998,73.470001,75.129997,6972200.0
2016-01-08,73.230003,73.230003,75.260002,72.900002,74.779999,3673800.0


(If Yahoo finance won't work for us)

In [None]:
dfs = {}
for attr in ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']:
    dfs[attr] = pd.read_excel('./data/stocks.xlsx', attr, index_col='Date')

In [None]:
dfs['Close']

## Writing Excel data

We can write a multi-page Excel file using an ExcelWriter:

In [98]:
!pip install xlrd xlwt openpyxl

Looking in links: /home/rick446/src/wheelhouse
You should consider upgrading via the '/home/rick446/.virtualenvs/classes/bin/python -m pip install --upgrade pip' command.[0m


In normal python to write a file you might say:

```python
with open(filename, 'w') as fp:
    fp.write(some_data)
```

In [99]:
with pd.ExcelWriter('./data/stocks.xlsx') as writer:
    for name, sheet in dfs.items():
        sheet.to_excel(writer, name)

In [100]:
!file data/stocks.xlsx

data/stocks.xlsx: Microsoft Excel 2007+


## Reading Excel data

We can also read a sheet from an Excel workbook:

In [101]:
closing = pd.read_excel('./data/stocks.xlsx', 'Close', index_col='Date')
closing.head()

Unnamed: 0_level_0,F,TSLA,GOOG,IBM,AAPL,CRM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04,13.97,44.681999,741.840027,135.949997,26.3375,76.709999
2016-01-05,13.72,44.686001,742.580017,135.850006,25.6775,77.050003
2016-01-06,13.11,43.807999,743.619995,135.169998,25.174999,76.290001
2016-01-07,12.7,43.130001,726.390015,132.860001,24.112499,74.300003
2016-01-08,12.54,42.200001,714.469971,131.630005,24.24,73.230003


In [102]:
closing.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1006 entries, 2016-01-04 to 2019-12-31
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   F       1006 non-null   float64
 1   TSLA    1006 non-null   float64
 2   GOOG    1006 non-null   float64
 3   IBM     1006 non-null   float64
 4   AAPL    1006 non-null   float64
 5   CRM     1006 non-null   float64
dtypes: float64(6)
memory usage: 55.0 KB


## Data from SQL

In [103]:
import pandas as pd
import sqlite3
con = sqlite3.connect('./data/real-estate.db')

In [104]:
transactions = pd.read_sql(
    'SELECT * FROM transactions', con, 
    index_col='index', 
    parse_dates=['sale_date'],
)
transactions.head()



Unnamed: 0_level_0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,2008-05-21,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,2008-05-21,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,2008-05-21,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,2008-05-21,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,2008-05-21,81900,38.51947,-121.435768


We can even build a quick little bulk load function in a couple of lines of pandas:

In [107]:
stock = pd.read_csv('./data/closing-prices.csv', index_col=[0], parse_dates=True)
stock.to_sql('stock', con, if_exists='replace')

In [108]:
for row in con.execute('select * from stock limit 5'):
    print(row)

('2014-01-02 00:00:00', 12.089, 150.1, None, 157.6001, 72.7741)
('2014-01-03 00:00:00', 12.1438, 149.56, None, 158.543, 71.1756)
('2014-01-06 00:00:00', 12.1986, 147.0, None, 157.9993, 71.5637)
('2014-01-07 00:00:00', 12.042, 149.36, None, 161.1508, 71.0516)
('2014-01-08 00:00:00', 12.1673, 151.28, None, 159.6728, 71.5019)


In [109]:
con.execute('select count(*) from stock').fetchall()

[(1007,)]

(for non-sqlite3 databases, you must use a sqlalchemy engine object and the `sqlalchemy.create_engine` function)

## Data from HTML

In [110]:
!pip install html5lib

Looking in links: /home/rick446/src/wheelhouse
You should consider upgrading via the '/home/rick446/.virtualenvs/classes/bin/python -m pip install --upgrade pip' command.[0m


In [111]:
tables = pd.read_html(
    'https://en.wikipedia.org/wiki/Python_(genus)',
)

In [112]:
len(tables)

8

In [113]:
tables[0]

Unnamed: 0,PythonTemporal range: Miocene–Present PreꞒ Ꞓ O S D C P T J K Pg N,PythonTemporal range: Miocene–Present PreꞒ Ꞓ O S D C P T J K Pg N.1
0,,
1,Burmese python (Python bivittatus),Burmese python (Python bivittatus)
2,Scientific classification,Scientific classification
3,Kingdom:,Animalia
4,Phylum:,Chordata
5,Class:,Reptilia
6,Order:,Squamata
7,Suborder:,Serpentes
8,Family:,Pythonidae
9,Genus:,"PythonDaudin, 1803"


In [114]:
tables = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population',
    match='New York'
)
len(tables)

4

In [115]:
tables[0]

Unnamed: 0,2020rank,City,State[c],2020census,2010census,Change,2020 land area,2020 land area.1,2020 population density,2020 population density.1,Location
0,1,New York[d],New York,8804190,8175133,+7.69%,300.5 sq mi,778.3 km2,"29,298/sq mi","11,312/km2",".mw-parser-output .geo-default,.mw-parser-outp..."
1,2,Los Angeles,California,3898747,3792621,+2.80%,469.5 sq mi,"1,216.0 km2","8,304/sq mi","3,206/km2",34°01′N 118°25′W﻿ / ﻿34.01°N 118.41°W
2,3,Chicago,Illinois,2746388,2695598,+1.88%,227.7 sq mi,589.7 km2,"12,061/sq mi","4,657/km2",41°50′N 87°41′W﻿ / ﻿41.83°N 87.68°W
3,4,Houston,Texas,2304580,2099451,+9.77%,640.4 sq mi,"1,658.6 km2","3,599/sq mi","1,390/km2",29°47′N 95°23′W﻿ / ﻿29.78°N 95.39°W
4,5,Phoenix,Arizona,1608139,1445632,+11.24%,518.0 sq mi,"1,341.6 km2","3,105/sq mi","1,199/km2",33°34′N 112°05′W﻿ / ﻿33.57°N 112.09°W
...,...,...,...,...,...,...,...,...,...,...,...
321,322,Federal Way,Washington,101030,89306,+13.13%,22.3 sq mi,57.8 km2,"4,530/sq mi","1,750/km2",47°19′N 122°21′W﻿ / ﻿47.32°N 122.35°W
322,323,Clinton,Michigan,100513,96796,+3.84%,28.1 sq mi,72.8 km2,"3,577/sq mi","1,381/km2",42°35′N 82°55′W﻿ / ﻿42.59°N 82.92°W
323,324,Edinburg,Texas,100243,77100,+30.02%,44.7 sq mi,115.8 km2,"2,243/sq mi",866/km2,26°18′N 98°10′W﻿ / ﻿26.30°N 98.16°W
324,325,Nampa,Idaho,100200,81557,+22.86%,33.5 sq mi,86.8 km2,"2,991/sq mi","1,155/km2",43°34′N 116°34′W﻿ / ﻿43.57°N 116.56°W


In [117]:
tables[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326 entries, 0 to 325
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   2020rank                   326 non-null    int64 
 1   City                       326 non-null    object
 2   State[c]                   326 non-null    object
 3   2020census                 326 non-null    int64 
 4   2010census                 326 non-null    int64 
 5   Change                     326 non-null    object
 6   2020 land area             326 non-null    object
 7   2020 land area.1           326 non-null    object
 8   2020 population density    326 non-null    object
 9   2020 population density.1  326 non-null    object
 10  Location                   326 non-null    object
dtypes: int64(3), object(8)
memory usage: 28.1+ KB


In [116]:
tables[1].head()

Unnamed: 0,City,State,2020 Census,Peak population,Percent decline from peak population,Notes
0,Albany,New York,99224.0,134995,−26.50%,"Peak in 1950, +1.40% since 2010."
1,Allegheny,Pennsylvania,,129896,,"Peak as an independent city, annexed by Pittsb..."
2,Brooklyn,New York,,806343,,"Peak as an independent city, consolidated with..."
3,Camden,New Jersey,71791.0,124555,−42.36%,"Peak in 1950, −7.18% since 2010."
4,Canton,Ohio,70872.0,116912,−39.38%,"Peak in 1950, −2.92% since 2010."


In [118]:
tables[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 6 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   City                                  24 non-null     object 
 1   State                                 24 non-null     object 
 2   2020 Census                           22 non-null     float64
 3   Peak population                       24 non-null     int64  
 4   Percent decline from peak population  22 non-null     object 
 5   Notes                                 24 non-null     object 
dtypes: float64(1), int64(1), object(4)
memory usage: 1.2+ KB


## Data from JSON APIs

In [None]:
!pip install requests

In [119]:
import requests

# I don't have any idea who's API key this is, but they're free, so....
APPID = '10d4440bbaa8581bb8da9bd1fbea5617'   
UNITS = 'imperial'
city = 'San Francisco'
resp = requests.get(
    'http://api.openweathermap.org/data/2.5/forecast', 
    params={
        'q': city,
        'units': UNITS,
        'appid': APPID,
    }
)
data = resp.json()

In [120]:
data

{'cod': '200',
 'message': 0,
 'cnt': 40,
 'list': [{'dt': 1635962400,
   'main': {'temp': 62.73,
    'feels_like': 62.65,
    'temp_min': 62.04,
    'temp_max': 62.73,
    'pressure': 1021,
    'sea_level': 1021,
    'grnd_level': 1019,
    'humidity': 84,
    'temp_kf': 0.38},
   'weather': [{'id': 802,
     'main': 'Clouds',
     'description': 'scattered clouds',
     'icon': '03d'}],
   'clouds': {'all': 40},
   'wind': {'speed': 1.72, 'deg': 94, 'gust': 2.35},
   'visibility': 10000,
   'pop': 0,
   'sys': {'pod': 'd'},
   'dt_txt': '2021-11-03 18:00:00'},
  {'dt': 1635973200,
   'main': {'temp': 63.64,
    'feels_like': 63.37,
    'temp_min': 63.64,
    'temp_max': 65.5,
    'pressure': 1020,
    'sea_level': 1020,
    'grnd_level': 1017,
    'humidity': 78,
    'temp_kf': -1.03},
   'weather': [{'id': 802,
     'main': 'Clouds',
     'description': 'scattered clouds',
     'icon': '03d'}],
   'clouds': {'all': 27},
   'wind': {'speed': 1.3, 'deg': 249, 'gust': 2.06},
   'visibi

In [121]:
data['list'][0]

{'dt': 1635962400,
 'main': {'temp': 62.73,
  'feels_like': 62.65,
  'temp_min': 62.04,
  'temp_max': 62.73,
  'pressure': 1021,
  'sea_level': 1021,
  'grnd_level': 1019,
  'humidity': 84,
  'temp_kf': 0.38},
 'weather': [{'id': 802,
   'main': 'Clouds',
   'description': 'scattered clouds',
   'icon': '03d'}],
 'clouds': {'all': 40},
 'wind': {'speed': 1.72, 'deg': 94, 'gust': 2.35},
 'visibility': 10000,
 'pop': 0,
 'sys': {'pod': 'd'},
 'dt_txt': '2021-11-03 18:00:00'}

In [122]:
row = data['list'][0]
{
    'date': row['dt_txt'], 
    **row['main'], 
    **row['weather'][0]
} 

{'date': '2021-11-03 18:00:00',
 'temp': 62.73,
 'feels_like': 62.65,
 'temp_min': 62.04,
 'temp_max': 62.73,
 'pressure': 1021,
 'sea_level': 1021,
 'grnd_level': 1019,
 'humidity': 84,
 'temp_kf': 0.38,
 'id': 802,
 'main': 'Clouds',
 'description': 'scattered clouds',
 'icon': '03d'}

In [123]:
# Python magic to build a list of dicts

raw_data = [
    {
        'date': row['dt_txt'], 
        **row['main'], 
        **row['weather'][0]
    } 
    for row in data['list']
]

In [124]:
raw_data[0]

{'date': '2021-11-03 18:00:00',
 'temp': 62.73,
 'feels_like': 62.65,
 'temp_min': 62.04,
 'temp_max': 62.73,
 'pressure': 1021,
 'sea_level': 1021,
 'grnd_level': 1019,
 'humidity': 84,
 'temp_kf': 0.38,
 'id': 802,
 'main': 'Clouds',
 'description': 'scattered clouds',
 'icon': '03d'}

In [125]:
weather = pd.DataFrame.from_dict(raw_data)
weather.head()

Unnamed: 0,date,temp,feels_like,temp_min,temp_max,pressure,sea_level,grnd_level,humidity,temp_kf,id,main,description,icon
0,2021-11-03 18:00:00,62.73,62.65,62.04,62.73,1021,1021,1019,84,0.38,802,Clouds,scattered clouds,03d
1,2021-11-03 21:00:00,63.64,63.37,63.64,65.5,1020,1020,1017,78,-1.03,802,Clouds,scattered clouds,03d
2,2021-11-04 00:00:00,64.36,63.88,64.36,65.19,1018,1018,1015,72,-0.46,801,Clouds,few clouds,02d
3,2021-11-04 03:00:00,62.17,61.43,62.17,62.17,1018,1018,1016,71,0.0,800,Clear,clear sky,01n
4,2021-11-04 06:00:00,62.6,61.95,62.6,62.6,1018,1018,1016,72,0.0,802,Clouds,scattered clouds,03n


In [126]:
weather['date'] = pd.to_datetime(weather['date'])
weather.set_index('date', inplace=True)
weather.head()

Unnamed: 0_level_0,temp,feels_like,temp_min,temp_max,pressure,sea_level,grnd_level,humidity,temp_kf,id,main,description,icon
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-11-03 18:00:00,62.73,62.65,62.04,62.73,1021,1021,1019,84,0.38,802,Clouds,scattered clouds,03d
2021-11-03 21:00:00,63.64,63.37,63.64,65.5,1020,1020,1017,78,-1.03,802,Clouds,scattered clouds,03d
2021-11-04 00:00:00,64.36,63.88,64.36,65.19,1018,1018,1015,72,-0.46,801,Clouds,few clouds,02d
2021-11-04 03:00:00,62.17,61.43,62.17,62.17,1018,1018,1016,71,0.0,800,Clear,clear sky,01n
2021-11-04 06:00:00,62.6,61.95,62.6,62.6,1018,1018,1016,72,0.0,802,Clouds,scattered clouds,03n


In [127]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 40 entries, 2021-11-03 18:00:00 to 2021-11-08 15:00:00
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   temp         40 non-null     float64
 1   feels_like   40 non-null     float64
 2   temp_min     40 non-null     float64
 3   temp_max     40 non-null     float64
 4   pressure     40 non-null     int64  
 5   sea_level    40 non-null     int64  
 6   grnd_level   40 non-null     int64  
 7   humidity     40 non-null     int64  
 8   temp_kf      40 non-null     float64
 9   id           40 non-null     int64  
 10  main         40 non-null     object 
 11  description  40 non-null     object 
 12  icon         40 non-null     object 
dtypes: float64(5), int64(5), object(3)
memory usage: 4.4+ KB


In [128]:
pd.json_normalize()

Looking in links: /home/rick446/src/wheelhouse
[31mERROR: Could not find a version that satisfies the requirement pd_json (from versions: none)[0m
[31mERROR: No matching distribution found for pd_json[0m
You should consider upgrading via the '/home/rick446/.virtualenvs/classes/bin/python -m pip install --upgrade pip' command.[0m


Much easier...

In [133]:
pd.json_normalize(raw_data).head()

Unnamed: 0,date,temp,feels_like,temp_min,temp_max,pressure,sea_level,grnd_level,humidity,temp_kf,id,main,description,icon
0,2021-11-03 18:00:00,62.73,62.65,62.04,62.73,1021,1021,1019,84,0.38,802,Clouds,scattered clouds,03d
1,2021-11-03 21:00:00,63.64,63.37,63.64,65.5,1020,1020,1017,78,-1.03,802,Clouds,scattered clouds,03d
2,2021-11-04 00:00:00,64.36,63.88,64.36,65.19,1018,1018,1015,72,-0.46,801,Clouds,few clouds,02d
3,2021-11-04 03:00:00,62.17,61.43,62.17,62.17,1018,1018,1016,71,0.0,800,Clear,clear sky,01n
4,2021-11-04 06:00:00,62.6,61.95,62.6,62.6,1018,1018,1016,72,0.0,802,Clouds,scattered clouds,03n


## Writing csv data

In [134]:
weather.to_csv('./data/weather.csv')

In [135]:
!head data/weather.csv

date,temp,feels_like,temp_min,temp_max,pressure,sea_level,grnd_level,humidity,temp_kf,id,main,description,icon
2021-11-03 18:00:00,62.73,62.65,62.04,62.73,1021,1021,1019,84,0.38,802,Clouds,scattered clouds,03d
2021-11-03 21:00:00,63.64,63.37,63.64,65.5,1020,1020,1017,78,-1.03,802,Clouds,scattered clouds,03d
2021-11-04 00:00:00,64.36,63.88,64.36,65.19,1018,1018,1015,72,-0.46,801,Clouds,few clouds,02d
2021-11-04 03:00:00,62.17,61.43,62.17,62.17,1018,1018,1016,71,0.0,800,Clear,clear sky,01n
2021-11-04 06:00:00,62.6,61.95,62.6,62.6,1018,1018,1016,72,0.0,802,Clouds,scattered clouds,03n
2021-11-04 09:00:00,60.67,60.71,60.67,60.67,1018,1018,1016,91,0.0,500,Rain,light rain,10n
2021-11-04 12:00:00,60.87,61.07,60.87,60.87,1018,1018,1017,94,0.0,500,Rain,light rain,10n
2021-11-04 15:00:00,58.96,58.89,58.96,58.96,1020,1020,1018,92,0.0,500,Rain,light rain,10d
2021-11-04 18:00:00,60.1,59.52,60.1,60.1,1022,1022,1020,79,0.0,500,Rain,light rain,10d


JSON lines?

In [136]:
weather.head()

Unnamed: 0_level_0,temp,feels_like,temp_min,temp_max,pressure,sea_level,grnd_level,humidity,temp_kf,id,main,description,icon
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-11-03 18:00:00,62.73,62.65,62.04,62.73,1021,1021,1019,84,0.38,802,Clouds,scattered clouds,03d
2021-11-03 21:00:00,63.64,63.37,63.64,65.5,1020,1020,1017,78,-1.03,802,Clouds,scattered clouds,03d
2021-11-04 00:00:00,64.36,63.88,64.36,65.19,1018,1018,1015,72,-0.46,801,Clouds,few clouds,02d
2021-11-04 03:00:00,62.17,61.43,62.17,62.17,1018,1018,1016,71,0.0,800,Clear,clear sky,01n
2021-11-04 06:00:00,62.6,61.95,62.6,62.6,1018,1018,1016,72,0.0,802,Clouds,scattered clouds,03n


In [137]:
weather.reset_index().to_json('./data/weather.jsonlines', orient='records', lines=True)

In [138]:
!cat data/weather.jsonlines

{"date":1635962400000,"temp":62.73,"feels_like":62.65,"temp_min":62.04,"temp_max":62.73,"pressure":1021,"sea_level":1021,"grnd_level":1019,"humidity":84,"temp_kf":0.38,"id":802,"main":"Clouds","description":"scattered clouds","icon":"03d"}
{"date":1635973200000,"temp":63.64,"feels_like":63.37,"temp_min":63.64,"temp_max":65.5,"pressure":1020,"sea_level":1020,"grnd_level":1017,"humidity":78,"temp_kf":-1.03,"id":802,"main":"Clouds","description":"scattered clouds","icon":"03d"}
{"date":1635984000000,"temp":64.36,"feels_like":63.88,"temp_min":64.36,"temp_max":65.19,"pressure":1018,"sea_level":1018,"grnd_level":1015,"humidity":72,"temp_kf":-0.46,"id":801,"main":"Clouds","description":"few clouds","icon":"02d"}
{"date":1635994800000,"temp":62.17,"feels_like":61.43,"temp_min":62.17,"temp_max":62.17,"pressure":1018,"sea_level":1018,"grnd_level":1016,"humidity":71,"temp_kf":0.0,"id":800,"main":"Clear","description":"clear sky","icon":"01n"}
{"date":1636005600000,"temp":62.6,"feels_like":61.

In [139]:
df = pd.read_json('./data/weather.jsonlines', lines=True)

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         40 non-null     datetime64[ns]
 1   temp         40 non-null     float64       
 2   feels_like   40 non-null     float64       
 3   temp_min     40 non-null     float64       
 4   temp_max     40 non-null     float64       
 5   pressure     40 non-null     int64         
 6   sea_level    40 non-null     int64         
 7   grnd_level   40 non-null     int64         
 8   humidity     40 non-null     int64         
 9   temp_kf      40 non-null     float64       
 10  id           40 non-null     int64         
 11  main         40 non-null     object        
 12  description  40 non-null     object        
 13  icon         40 non-null     object        
dtypes: datetime64[ns](1), float64(5), int64(5), object(3)
memory usage: 4.5+ KB


In [141]:
weather.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 40 entries, 2021-11-03 18:00:00 to 2021-11-08 15:00:00
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   temp         40 non-null     float64
 1   feels_like   40 non-null     float64
 2   temp_min     40 non-null     float64
 3   temp_max     40 non-null     float64
 4   pressure     40 non-null     int64  
 5   sea_level    40 non-null     int64  
 6   grnd_level   40 non-null     int64  
 7   humidity     40 non-null     int64  
 8   temp_kf      40 non-null     float64
 9   id           40 non-null     int64  
 10  main         40 non-null     object 
 11  description  40 non-null     object 
 12  icon         40 non-null     object 
dtypes: float64(5), int64(5), object(3)
memory usage: 11.0 KB


In [142]:
%timeit weather.temp * 5

117 µs ± 4.38 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [143]:
weather.temp.values

array([62.73, 63.64, 64.36, 62.17, 62.6 , 60.67, 60.87, 58.96, 60.1 ,
       61.75, 61.07, 58.15, 56.91, 55.65, 54.39, 53.92, 57.4 , 60.21,
       60.35, 58.24, 56.82, 56.57, 56.35, 56.66, 58.86, 56.26, 57.45,
       55.33, 53.8 , 52.99, 52.32, 49.91, 53.31, 57.85, 58.75, 55.72,
       54.48, 52.99, 51.58, 50.85])

In [144]:
%timeit weather.temp.values * 5

8.24 µs ± 596 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [145]:
pd.Series(weather.temp.values * 5, index=weather.temp)

temp
62.73    313.65
63.64    318.20
64.36    321.80
62.17    310.85
62.60    313.00
60.67    303.35
60.87    304.35
58.96    294.80
60.10    300.50
61.75    308.75
61.07    305.35
58.15    290.75
56.91    284.55
55.65    278.25
54.39    271.95
53.92    269.60
57.40    287.00
60.21    301.05
60.35    301.75
58.24    291.20
56.82    284.10
56.57    282.85
56.35    281.75
56.66    283.30
58.86    294.30
56.26    281.30
57.45    287.25
55.33    276.65
53.80    269.00
52.99    264.95
52.32    261.60
49.91    249.55
53.31    266.55
57.85    289.25
58.75    293.75
55.72    278.60
54.48    272.40
52.99    264.95
51.58    257.90
50.85    254.25
dtype: float64

Open the [Pandas IO Lab][pandas-io-lab]

[pandas-io-lab]: ./pandas-io-lab.ipynb