# Workshop - pandas Data Structures Intro

Work from the pandas documentation:
- [`https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html`](https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html)

You might enjoy modifying some of the examples to use the diamonds DataFrame loaded below:

In [1]:
import numpy as np 
import pandas as pd
# Download the diamonds sample CSV and discard the unnamed leading column
# that came from the file's saved row index, then summarise the frame.
diamonds_pdf = (
    pd.read_csv('https://raw.githubusercontent.com/datalab-datasets/file-samples/master/diamonds.csv')
    .drop('Unnamed: 0', axis='columns')
)
diamonds_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
carat      53940 non-null float64
cut        53940 non-null object
color      53940 non-null object
clarity    53940 non-null object
depth      53940 non-null float64
table      53940 non-null float64
price      53940 non-null int64
x          53940 non-null float64
y          53940 non-null float64
z          53940 non-null float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [2]:
# Inspect the row labels (axis 0) of the frame.
diamonds_pdf.index

RangeIndex(start=0, stop=53940, step=1)

In [3]:
# Inspect the column labels (axis 1) of the frame.
diamonds_pdf.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [4]:
# Dict-style column deletion: removes 'x' from diamonds_pdf in place.
# Not idempotent - running this cell a second time raises KeyError because
# the column is already gone.
del diamonds_pdf['x']

In [5]:
# pop() removes the 'color' column from diamonds_pdf and returns it as a
# Series, so the frame no longer has that column after this cell.
color = diamonds_pdf.pop("color")
color

0        E
1        E
2        E
3        I
4        J
5        J
6        I
7        H
8        E
9        H
10       J
11       J
12       F
13       J
14       E
15       E
16       I
17       J
18       J
19       J
20       I
21       E
22       H
23       J
24       J
25       G
26       I
27       J
28       D
29       F
        ..
53910    E
53911    E
53912    F
53913    G
53914    I
53915    E
53916    D
53917    J
53918    I
53919    I
53920    E
53921    E
53922    D
53923    I
53924    I
53925    I
53926    E
53927    F
53928    E
53929    G
53930    E
53931    F
53932    E
53933    E
53934    D
53935    D
53936    D
53937    D
53938    H
53939    D
Name: color, Length: 53940, dtype: object

In [6]:
# Assigning a scalar broadcasts it to every row; this overwrites the
# existing 'y' column with the string 'bar'.
diamonds_pdf['y'] = 'bar'
diamonds_pdf

Unnamed: 0,carat,cut,clarity,depth,table,price,y,z
0,0.23,Ideal,SI2,61.5,55.0,326,bar,2.43
1,0.21,Premium,SI1,59.8,61.0,326,bar,2.31
2,0.23,Good,VS1,56.9,65.0,327,bar,2.31
3,0.29,Premium,VS2,62.4,58.0,334,bar,2.63
4,0.31,Good,SI2,63.3,58.0,335,bar,2.75
5,0.24,Very Good,VVS2,62.8,57.0,336,bar,2.48
6,0.24,Very Good,VVS1,62.3,57.0,336,bar,2.47
7,0.26,Very Good,SI1,61.9,55.0,337,bar,2.53
8,0.22,Fair,VS2,65.1,61.0,337,bar,2.49
9,0.23,Very Good,VS1,59.4,61.0,338,bar,2.39


In [7]:
# Only the first three rows of 'y' are supplied. Assignment aligns on the
# index, so every other row of 'z' becomes NaN (the previous 'z' values
# are overwritten).
diamonds_pdf['z'] = diamonds_pdf['y'][:3]
diamonds_pdf

Unnamed: 0,carat,cut,clarity,depth,table,price,y,z
0,0.23,Ideal,SI2,61.5,55.0,326,bar,bar
1,0.21,Premium,SI1,59.8,61.0,326,bar,bar
2,0.23,Good,VS1,56.9,65.0,327,bar,bar
3,0.29,Premium,VS2,62.4,58.0,334,bar,
4,0.31,Good,SI2,63.3,58.0,335,bar,
5,0.24,Very Good,VVS2,62.8,57.0,336,bar,
6,0.24,Very Good,VVS1,62.3,57.0,336,bar,
7,0.26,Very Good,SI1,61.9,55.0,337,bar,
8,0.22,Fair,VS2,65.1,61.0,337,bar,
9,0.23,Very Good,VS1,59.4,61.0,338,bar,


In [8]:
# insert() adds the values of 'table' as a new column named "bar" at
# position 1, mutating diamonds_pdf in place.
# Not idempotent - re-running raises ValueError because "bar" already exists.
diamonds_pdf.insert(1, "bar", diamonds_pdf['table'])
diamonds_pdf

Unnamed: 0,carat,bar,cut,clarity,depth,table,price,y,z
0,0.23,55.0,Ideal,SI2,61.5,55.0,326,bar,bar
1,0.21,61.0,Premium,SI1,59.8,61.0,326,bar,bar
2,0.23,65.0,Good,VS1,56.9,65.0,327,bar,bar
3,0.29,58.0,Premium,VS2,62.4,58.0,334,bar,
4,0.31,58.0,Good,SI2,63.3,58.0,335,bar,
5,0.24,57.0,Very Good,VVS2,62.8,57.0,336,bar,
6,0.24,57.0,Very Good,VVS1,62.3,57.0,336,bar,
7,0.26,55.0,Very Good,SI1,61.9,55.0,337,bar,
8,0.22,61.0,Fair,VS2,65.1,61.0,337,bar,
9,0.23,61.0,Very Good,VS1,59.4,61.0,338,bar,


In [9]:
# Derive a mean-centred price column. assign() returns a new frame and
# leaves diamonds_pdf itself unchanged; the callable receives the frame
# being assigned to, so it works on its own argument.
diamonds_pdf.assign(price_fe=lambda df: df['price'] - df['price'].mean())

Unnamed: 0,carat,bar,cut,clarity,depth,table,price,y,z,price_fe
0,0.23,55.0,Ideal,SI2,61.5,55.0,326,bar,bar,-3606.799722
1,0.21,61.0,Premium,SI1,59.8,61.0,326,bar,bar,-3606.799722
2,0.23,65.0,Good,VS1,56.9,65.0,327,bar,bar,-3605.799722
3,0.29,58.0,Premium,VS2,62.4,58.0,334,bar,,-3598.799722
4,0.31,58.0,Good,SI2,63.3,58.0,335,bar,,-3597.799722
5,0.24,57.0,Very Good,VVS2,62.8,57.0,336,bar,,-3596.799722
6,0.24,57.0,Very Good,VVS1,62.3,57.0,336,bar,,-3596.799722
7,0.26,55.0,Very Good,SI1,61.9,55.0,337,bar,,-3595.799722
8,0.22,61.0,Fair,VS2,65.1,61.0,337,bar,,-3595.799722
9,0.23,61.0,Very Good,VS1,59.4,61.0,338,bar,,-3594.799722


In [10]:
from sklearn import datasets

# load_iris() returns a scikit-learn Bunch, not a DataFrame, so calling
# .query() on it directly raises AttributeError. Wrap the feature matrix in
# a DataFrame whose column names match the ones used in the chain below.
# Column order follows the dataset's feature order:
# sepal length, sepal width, petal length, petal width.
iris_bunch = datasets.load_iris()
iris = pd.DataFrame(
    iris_bunch.data,
    columns=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'],
)

# Keep flowers with sepal length above 5, derive width/length ratios for
# sepals and petals, and scatter one ratio against the other.
(iris.query('SepalLength > 5')
 .assign(SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
         PetalRatio=lambda x: x.PetalWidth / x.PetalLength)
 .plot(kind='scatter', x='SepalRatio', y='PetalRatio'))

In [11]:
# Print the last 20 rows (at most the first 12 columns) as one plain-text
# table; to_string() renders the full selection without truncation.
print(diamonds_pdf.tail(20).iloc[:, :12].to_string())

       carat   bar        cut clarity  depth  table  price    y    z
53920   0.70  60.0  Very Good     VS2   62.4   60.0   2755  bar  NaN
53921   0.70  60.0  Very Good     VS2   62.8   60.0   2755  bar  NaN
53922   0.70  59.0  Very Good     VS1   63.1   59.0   2755  bar  NaN
53923   0.73  56.0      Ideal     VS2   61.3   56.0   2756  bar  NaN
53924   0.73  55.0      Ideal     VS2   61.6   55.0   2756  bar  NaN
53925   0.79  56.0      Ideal     SI1   61.6   56.0   2756  bar  NaN
53926   0.71  56.0      Ideal     SI1   61.9   56.0   2756  bar  NaN
53927   0.79  59.0       Good     SI1   58.1   59.0   2756  bar  NaN
53928   0.79  58.0    Premium     SI2   61.4   58.0   2756  bar  NaN
53929   0.71  56.0      Ideal     VS1   61.4   56.0   2756  bar  NaN
53930   0.71  55.0    Premium     SI1   60.5   55.0   2756  bar  NaN
53931   0.71  62.0    Premium     SI1   59.8   62.0   2756  bar  NaN
53932   0.70  59.0  Very Good     VS2   60.5   59.0   2757  bar  NaN
53933   0.70  59.0  Very Good     

In [12]:
# Narrow the console display width to 40 characters (a session-wide pandas
# option), then show the frame again.
pd.options.display.width = 40
diamonds_pdf

Unnamed: 0,carat,bar,cut,clarity,depth,table,price,y,z
0,0.23,55.0,Ideal,SI2,61.5,55.0,326,bar,bar
1,0.21,61.0,Premium,SI1,59.8,61.0,326,bar,bar
2,0.23,65.0,Good,VS1,56.9,65.0,327,bar,bar
3,0.29,58.0,Premium,VS2,62.4,58.0,334,bar,
4,0.31,58.0,Good,SI2,63.3,58.0,335,bar,
5,0.24,57.0,Very Good,VVS2,62.8,57.0,336,bar,
6,0.24,57.0,Very Good,VVS1,62.3,57.0,336,bar,
7,0.26,55.0,Very Good,SI1,61.9,55.0,337,bar,
8,0.22,61.0,Fair,VS2,65.1,61.0,337,bar,
9,0.23,61.0,Very Good,VS1,59.4,61.0,338,bar,


# Time Series

In [14]:
# 100 timestamps spaced one second apart, starting at 2012-01-01 00:00:00.
# Lowercase "s" / "min" are the current offset aliases; the uppercase "S"
# and "Min" spellings are deprecated in recent pandas releases.
rng = pd.date_range("1/1/2012", periods=100, freq="s")

# One random integer in [0, 500) per second.
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

# Down-sample to 5-minute buckets; all 100 seconds fall in the first bucket,
# so the result is a single row holding the total.
ts.resample("5min").sum()

2012-01-01    24320
Freq: 5T, dtype: int64

In [15]:
# Five consecutive daily timestamps beginning at midnight on 2012-03-06.
rng = pd.date_range(start="3/6/2012 00:00", freq="D", periods=5)

# Pair each day with one standard-normal random draw.
ts = pd.Series(data=np.random.randn(rng.size), index=rng)
ts

2012-03-06    0.745063
2012-03-07   -1.487039
2012-03-08    0.416992
2012-03-09    0.668778
2012-03-10    0.520620
Freq: D, dtype: float64

In [18]:
# Attach the UTC time zone to the naive timestamps. The result is kept in
# ts_utc because the time-zone conversion later in the notebook reads that
# name; without the assignment it is undefined on a fresh kernel.
ts_utc = ts.tz_localize("UTC")
ts_utc

2012-03-06 00:00:00+00:00    0.745063
2012-03-07 00:00:00+00:00   -1.487039
2012-03-08 00:00:00+00:00    0.416992
2012-03-09 00:00:00+00:00    0.668778
2012-03-10 00:00:00+00:00    0.520620
Freq: D, dtype: float64

In [19]:
# Convert the time-zone-aware series to US/Eastern wall-clock time.
# Requires `ts_utc` to already exist in the kernel - presumably the
# UTC-localized version of `ts`; confirm it is defined before this cell runs.
ts_utc.tz_convert("US/Eastern")

2012-03-05 19:00:00-05:00    0.745063
2012-03-06 19:00:00-05:00   -1.487039
2012-03-07 19:00:00-05:00    0.416992
2012-03-08 19:00:00-05:00    0.668778
2012-03-09 19:00:00-05:00    0.520620
Freq: D, dtype: float64

In [22]:
# Convert the timestamp index to periods; with no argument the frequency is
# taken from the index itself (daily here, per the displayed "Freq: D").
ps = ts.to_period()
ps

2012-03-06    0.745063
2012-03-07   -1.487039
2012-03-08    0.416992
2012-03-09    0.668778
2012-03-10    0.520620
Freq: D, dtype: float64

In [23]:
# Convert the periods back to timestamps (by default the start of each period).
ps.to_timestamp()

2012-03-06    0.745063
2012-03-07   -1.487039
2012-03-08    0.416992
2012-03-09    0.668778
2012-03-10    0.520620
Freq: D, dtype: float64

The end. 