## Pandas Basics

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Five standard-normal draws, labelled 'a' through 'e'.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))

In [3]:
# Echo the Series: the repr lists each label with its value, then the dtype.
s

a    1.317439
b   -0.199233
c   -1.418735
d    0.302006
e   -0.553925
dtype: float64

In [4]:
# Same construction without labels: the result carries a default 0..4 integer index.
s1 = pd.Series(data=np.random.randn(5))
s1

0    0.563142
1   -0.318036
2   -2.474008
3   -0.427638
4   -0.484247
dtype: float64

In [5]:
# Re-display `s`: creating `s1` left it untouched (same labels and values as before).
s

a    1.317439
b   -0.199233
c   -1.418735
d    0.302006
e   -0.553925
dtype: float64

In [6]:
# The index holds the labels supplied when `s` was constructed.
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [7]:
# Unlabelled Series displayed directly (not bound to a name); it gets the default 0..4 index.
pd.Series(np.random.randn(5))

0   -0.480258
1   -0.600458
2   -1.445224
3    0.233223
4   -2.112935
dtype: float64

In [8]:
# Plain mapping of label -> float, used below to build Series from a dict.
d = dict(a=0.0, b=1.0, c=2.0)

In [9]:
# Series from a dict: the keys become the index labels and the values become the data.
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [10]:
# With an explicit index the result follows that label order; 'd' is not a key
# of the dict, so its entry comes out as NaN.
pd.Series(d, index=['b', 'c', 'd', 'a'])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [11]:
# A scalar is repeated once for every label in the index.
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [12]:
# First element by position. `.iloc` is explicit about positional access;
# plain `s[0]` on this string-labelled Series relies on the deprecated
# integer-key-as-position fallback of `[]`.
s.iloc[0]

1.3174389963854

In [13]:
# First three elements by position (labels 'a', 'b', 'c' here).
s.iloc[:3]

a    1.317439
b   -0.199233
c   -1.418735
dtype: float64

In [14]:
# Label-based lookup: returns the scalar stored under label 'a'.
s['a']

1.3174389963854

In [15]:
# Overwrite the value stored under the existing label 'e'. This mutates `s`
# in place, so earlier displays of `s` no longer reflect its contents.
s.loc['e'] = 12.0

In [16]:
# Display `s` again to confirm the entry at 'e' was replaced.
s

a     1.317439
b    -0.199233
c    -1.418735
d     0.302006
e    12.000000
dtype: float64

In [17]:
# Lookup via .get: 'a' is present, so its value is returned.
s.get('a')

1.3174389963854

In [18]:
# Two independent 5-element random Series (ts1 is drawn first, then ts2),
# each with the default integer index.
ts1, ts2 = (pd.Series(np.random.randn(5)) for _ in range(2))

In [19]:
# Inspect the first random Series.
ts1

0    1.131470
1   -0.605285
2   -0.478777
3    0.430174
4   -0.208089
dtype: float64

In [20]:
# Inspect the second random Series.
ts2

0    0.382959
1    1.316035
2    0.859644
3    0.835510
4   -1.344956
dtype: float64

In [21]:
# Map column names to the two Series. Note that this rebinds `d`, which
# previously held the label -> float dict.
d = dict(col1=ts1, col2=ts2)
d

{'col1': 0    1.131470
 1   -0.605285
 2   -0.478777
 3    0.430174
 4   -0.208089
 dtype: float64,
 'col2': 0    0.382959
 1    1.316035
 2    0.859644
 3    0.835510
 4   -1.344956
 dtype: float64}

In [22]:
# DataFrame from a dict of Series: each key becomes a column.
df1 = pd.DataFrame(d)
df1

Unnamed: 0,col1,col2
0,1.13147,0.382959
1,-0.605285,1.316035
2,-0.478777,0.859644
3,0.430174,0.83551
4,-0.208089,-1.344956


In [23]:
# DataFrame from a 10x5 array of standard-normal draws; with no labels given,
# rows and columns both get default integer labels.
df2 = pd.DataFrame(data=np.random.randn(10, 5))
df2

Unnamed: 0,0,1,2,3,4
0,1.812584,-0.237275,-0.51433,-2.081113,0.329803
1,-0.112303,-2.408119,-0.242485,-0.359542,-1.512469
2,-0.604121,-0.936288,1.686288,-2.600729,-0.641823
3,0.886257,-0.442723,-0.100998,1.159831,-1.38306
4,-0.91861,0.591564,-0.100787,1.672175,-2.613241
5,1.154625,1.035408,-0.266297,1.000656,-0.199245
6,-1.732302,0.194139,-0.251797,0.028681,-0.567652
7,-0.96129,0.198539,1.131522,-0.137051,1.662989
8,0.59073,-0.091854,-0.036664,1.154098,0.12527
9,-0.784683,-2.658198,0.267278,-0.730512,-0.300464


In [24]:
# Same 10x5 random frame, this time with named columns 'a' through 'e'.
df3 = pd.DataFrame(data=np.random.randn(10, 5), columns=list('abcde'))
df3

Unnamed: 0,a,b,c,d,e
0,-0.357145,-1.050904,-0.376087,-0.170492,-0.144768
1,-0.065542,-0.488609,5.3e-05,-0.026273,0.047118
2,-0.212095,0.638837,-1.847354,-0.74637,-0.268454
3,-0.61268,1.751733,-0.643813,0.053053,0.170022
4,0.621945,-0.798373,2.404042,0.042854,1.049249
5,-0.44136,-0.591615,-0.595511,0.376328,-0.152926
6,0.727277,-1.72255,0.024357,-0.556166,1.2721
7,-0.298054,0.224161,-0.176273,-0.726145,0.665063
8,-0.219971,-0.019446,-0.805675,1.520708,0.710631
9,0.743145,0.431169,-0.239167,-0.493893,-0.126671


In [25]:
# Two Series whose label sets differ: 'one' has no entry for 'd'.
# The resulting frame uses the union of the labels and holds NaN where a
# Series has no value for a label.
d = {
    'one': pd.Series([1.0, 2.0, 3.0], index=list('abc')),
    'two': pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [26]:
# Underlying data as a 2-D NumPy array (rows x columns). `to_numpy()` is the
# accessor the pandas documentation recommends in place of `.values`.
df.to_numpy()

array([[ 1.,  1.],
       [ 2.,  2.],
       [ 3.,  3.],
       [nan,  4.]])

In [27]:
# Passing index= picks and orders the rows by label: only 'd', 'b' and 'a'
# are kept, in that order ('c' is dropped).
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [28]:
# Row labels of `df`.
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [29]:
# Column labels of `df`.
df.columns

Index(['one', 'two'], dtype='object')

In [30]:
# Evaluated on the row labels, not on the data: it is False here even though
# column 'one' contains a NaN.
df.index.hasnans

False

In [31]:
# Load the sample data from a CSV path relative to the current working
# directory; the frame has the columns date, variable and value.
dfc = pd.read_csv('./data1.csv')
dfc

Unnamed: 0,date,variable,value
0,2000-01-03,A,0.469112
1,2000-01-04,A,-0.282863
2,2000-01-05,A,-1.509059
3,2000-01-03,B,-1.135632
4,2000-01-04,B,1.212112
5,2000-01-05,B,-0.173215
6,2000-01-03,C,0.119209
7,2000-01-04,C,-1.044236
8,2000-01-05,C,-0.861849
9,2000-01-03,D,-2.104569
10,2000-01-04,D,-0.494929
11,2000-01-05,D,1.071804


In [32]:
# Element-wise comparison: a boolean Series, True on the rows whose
# `variable` is 'A'.
dfc['variable'].eq('A')

0      True
1      True
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
Name: variable, dtype: bool

In [33]:
# Boolean-mask row selection: keep only the rows where `variable` is 'A'.
dfc.loc[dfc['variable'] == 'A']

Unnamed: 0,date,variable,value
0,2000-01-03,A,0.469112
1,2000-01-04,A,-0.282863
2,2000-01-05,A,-1.509059


In [34]:
# Reshape long -> wide: one row per date, one column per distinct `variable`,
# with cells filled from `value`.
dfc.pivot(index='date', columns='variable', values='value')

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,0.469112,-1.135632,0.119209,-2.104569
2000-01-04,-0.282863,1.212112,-1.044236,-0.494929
2000-01-05,-1.509059,-0.173215,-0.861849,1.071804


In [35]:
# Summary statistics (count, mean, std, min, quartiles, max); only the
# `value` column appears in the result.
dfc.describe()

Unnamed: 0,value
count,12.0
mean,-0.39451
std,1.007649
min,-2.104569
25%,-1.067085
50%,-0.388896
75%,0.206685
max,1.212112
