# Getting Started with pandas

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Integer Series 0..5 labelled 'a'..'f'; its Index is reused further down.
test = pd.Series(np.arange(6), index=list('abcdef'))
index = test.index

# Comparing an Index element-wise gives a boolean mask; membership tests
# look at the labels. Both results are computed and discarded (only the
# last expression of a cell is displayed).
(index > 'a') & (index < 'f')
'b' in test

# Building a Series from a dict with an explicit index keeps only the keys
# found in that index ('z' is dropped) and leaves the rest as NaN.
sdata = dict(c=35000, b=71000, a=16000, z=5000)
mySeries = pd.Series(sdata, index=index)

# Relabel positionally by assigning a new list of labels to .index.
mySeries.index = ['America', 'Britain', 'Columbia', 'Django', 'Egbert', 'France']
print(mySeries)

# Boolean filtering: NaN entries compare as False and are filtered out.
mySeries.loc[mySeries > 2]

America     16000.0
Britain     71000.0
Columbia    35000.0
Django          NaN
Egbert          NaN
France          NaN
dtype: float64


America     16000.0
Britain     71000.0
Columbia    35000.0
dtype: float64

#### Data returned from indexing a DataFrame is a view, not a copy

In [27]:
# Build a 2x4 integer frame holding 0..7, one column per state.
mydf = pd.DataFrame(np.arange(8).reshape(2, 4), columns=['Ohio', 'Texas', 'Oregon', 'Utah'])
print(mydf)
# Selecting a single column hands back a Series.
mydf2 = mydf['Oregon']
# Write through that Series. In the recorded run the parent frame displayed
# below shows the new value (Oregon, row 1 -> 7), i.e. the Series acted as a
# view onto mydf.
# NOTE(review): whether this write propagates depends on the pandas version
# and settings (it does not under Copy-on-Write) - confirm for the pandas in use.
mydf2.loc[1] = 7 # a change in-place
mydf

   Ohio  Texas  Oregon  Utah
0     0      1       2     3
1     4      5       6     7


Unnamed: 0,Ohio,Texas,Oregon,Utah
0,0,1,2,3
1,4,5,7,7


#### Data returned from slicing a NumPy array is a view, not a copy

In [30]:
# 3x3 integer array of ones, plus two names for the same lower-right 2x2 slice.
arr = np.ones(shape=(3, 3), dtype=int)
varr0 = arr[1:, 1:]
varr1 = arr[1:, 1:]

# Case 1: rebinding the name only points varr0 at the scalar 0 ...
varr0 = 0
print(arr)  # ... so arr still holds nothing but ones

# Case 2: assigning *into* the slice writes to the buffer it shares with arr ...
varr1[...] = 0
print(arr)  # ... and arr's lower-right block is now zero

[[1 1 1]
 [1 1 1]
 [1 1 1]]
[[1 1 1]
 [1 0 0]
 [1 0 0]]


 - If a slice of an array is assigned to a name; and
 - If that object (which is a view) is then modified in place, in whole or in part. That is, not when the name is simply rebound to a new value, as in Case 1
 
 ...then the original array is modified

#### A sample of several methods available with an Index object

In [17]:
# An Index may hold duplicate labels.
index = pd.Index(list('aba'))
addnl = pd.Index(list('abc'))

# Each method returns a new Index and leaves addnl itself untouched.
addnl1 = addnl.delete(2)                  # remove by position
addnl2 = addnl.drop(['c'])                # remove by label
addnl3 = addnl2.insert(len(addnl2), 'c')  # put 'c' back at the end (position 2)
print(addnl1, addnl2, addnl3, sep='\n')

Index(['a', 'b'], dtype='object')
Index(['a', 'b'], dtype='object')
Index(['a', 'b', 'c'], dtype='object')


In [24]:
# 2x4 frame of the integers 0..7, one column per state.
mydf = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    columns=['Ohio', 'Texas', 'Oregon', 'Utah'],
)
print(mydf)

# Extend the row index to 0..3; the new rows 2 and 3 are forward-filled
# from the last existing row.
mydf = mydf.reindex(index=[0, 1, 2, 3], method='ffill')
print(mydf)

# Reindex along the columns: the previously unseen 'West Virginia'
# comes back filled with NaN.
states = ['Ohio', 'Texas', 'Oregon', 'Utah', 'West Virginia']
mydf = mydf.reindex(states, axis='columns')
print(mydf)

   Ohio  Texas  Oregon  Utah
0     0      1       2     3
1     4      5       6     7
   Ohio  Texas  Oregon  Utah
0     0      1       2     3
1     4      5       6     7
2     4      5       6     7
3     4      5       6     7
   Ohio  Texas  Oregon  Utah  West Virginia
0     0      1       2     3            NaN
1     4      5       6     7            NaN
2     4      5       6     7            NaN
3     4      5       6     7            NaN


In [44]:
# 2x4 frame of the integers 0..7, one column per state.
mydf = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    columns=['Ohio', 'Texas', 'Oregon', 'Utah'],
)
print(mydf)

# Reindex rows and columns in one call. With no fill method the new rows
# (2, 3) and the new column are NaN, which turns the data into floats.
states = ['Ohio', 'Texas', 'Oregon', 'Utah', 'West Virginia']
mydf = mydf.reindex(index=[0, 1, 2, 3], columns=states)
print(mydf)

# drop() returns a new frame without the given row and column labels;
# mydf itself is not modified.
mydf.drop(index=[2, 3], columns=['West Virginia'])

   Ohio  Texas  Oregon  Utah
0     0      1       2     3
1     4      5       6     7
   Ohio  Texas  Oregon  Utah  West Virginia
0   0.0    1.0     2.0   3.0            NaN
1   4.0    5.0     6.0   7.0            NaN
2   NaN    NaN     NaN   NaN            NaN
3   NaN    NaN     NaN   NaN            NaN


Unnamed: 0,Ohio,Texas,Oregon,Utah
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0


In [38]:
# 2x4 frame of the integers 0..7, one column per state.
mydf = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    columns=['Ohio', 'Texas', 'Oregon', 'Utah'],
)
newIndex = pd.Index([1])
print(mydf.index, newIndex, sep='\n')

# Constructing a frame from another frame plus an index keeps only the
# rows whose labels appear in that index.
mydf2 = pd.DataFrame(data=mydf, index=newIndex)
print(mydf2)

# An in-place drop mutates mydf, so the index shown afterwards is a
# different object from the RangeIndex printed above.
mydf.drop(index=[1], inplace=True)
mydf.index

RangeIndex(start=0, stop=2, step=1)
Int64Index([1], dtype='int64')
   Ohio  Texas  Oregon  Utah
1     4      5       6     7


Int64Index([0], dtype='int64')

#### Indexing, Selection, and Filtering

In [49]:
# Float Series 0.0..3.0 labelled 'a'..'d'.
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
# Label-based slice; both endpoints ('b' and 'c') are part of the result.
obj1 = obj['b':'c']
# Assign through the slice. The recorded output shows obj itself changed at
# 'b' and 'c', i.e. obj1 behaved as a view onto obj.
# NOTE(review): this propagation depends on the pandas version and settings
# (it does not happen under Copy-on-Write) - confirm for the pandas in use.
obj1[:] = 5 # This is setting value on a view. Not on a copy!
print(obj) 

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64


In [52]:
# 4x3 frame of standard-normal draws, rows labelled by state.
frame = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
# Independent copy with identical labels and values.
frame1 = frame.copy()
print(frame)

# A binary NumPy ufunc can be applied to two DataFrames directly;
# here it doubles every value.
np.add(frame, frame1)

               b         d         e
Utah    1.583610  0.907536 -1.644846
Ohio    1.384444 -0.955096 -0.355440
Texas  -1.662108 -1.699967 -0.459282
Oregon -2.215677 -0.143443 -0.749859


Unnamed: 0,b,d,e
Utah,3.167219,1.815071,-3.289692
Ohio,2.768887,-1.910192,-0.710881
Texas,-3.324217,-3.399934,-0.918565
Oregon,-4.431354,-0.286886,-1.499719


In [73]:
# Two integer frames with the same columns; frame1 only carries the row
# labels 'Utah' and 'Ohio'.
frame = pd.DataFrame(np.arange(12).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame1 = pd.DataFrame(np.arange(6).reshape((2, 3)), columns=list('bde'), index=['Utah', 'Ohio'])
print(frame)
print(frame1)
# Kept from the original author: applying the ufunc to the two differently
# shaped frames was observed to fail.
# NOTE(review): newer pandas may align DataFrame inputs to ufuncs - confirm
# whether this still fails on the pandas in use.
# np.add(frame, frame1) # This fails
# frame + frame1
# Collect every row label and every column label that occurs in either frame
# (append concatenates the two indexes, unique() removes the repeats).
allIndex = frame.index.append(frame1.index).unique()
allCols = frame.columns.append(frame1.columns).unique()
# Expand frame1 to the full label set; the rows it did not have are missing
# values after this step ...
frame1 = frame1.reindex(allIndex, columns=allCols)
# ... and are replaced by 0 here, in place. The printed result is float-valued.
frame1.fillna(0, inplace=True)
print(frame1)
# Time the operator form against the ufunc form on the now identically
# labelled frames.
%timeit (frame + frame1)
%timeit np.add(frame, frame1)

        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11
      b  d  e
Utah  0  1  2
Ohio  3  4  5
          b    d    e
Utah    0.0  1.0  2.0
Ohio    3.0  4.0  5.0
Texas   0.0  0.0  0.0
Oregon  0.0  0.0  0.0
411 µs ± 912 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
220 µs ± 614 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
# 8x4 frame of standard-normal draws: rows 'a'..'h', columns 'A'..'D'.
df = pd.DataFrame(
    data=np.random.randn(8, 4),
    index=list('abcdefgh'),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
a,-1.29387,0.687329,-0.716578,0.40097
b,-0.281057,-0.782575,0.652666,0.934821
c,-0.096968,-0.986343,1.278637,0.438146
d,0.908033,-0.274317,-1.362896,-1.279395
e,-0.695339,0.083161,0.684865,-0.141858
f,-0.303511,0.002265,-0.853706,0.695146
g,-0.530764,-1.092332,-0.394616,1.309485
h,-0.444328,0.092078,-0.408571,-0.517995


In [12]:
# Take the row at position 3 as a Series; its name is that row's label.
s = df.iloc[3]
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. Turning the Series into a one-row
# frame (to_frame().T) keeps its original row label, which matches the old
# append(..., ignore_index=False) result.
pd.concat([df, s.to_frame().T])

Unnamed: 0,A,B,C,D
a,-1.29387,0.687329,-0.716578,0.40097
b,-0.281057,-0.782575,0.652666,0.934821
c,-0.096968,-0.986343,1.278637,0.438146
d,0.908033,-0.274317,-1.362896,-1.279395
e,-0.695339,0.083161,0.684865,-0.141858
f,-0.303511,0.002265,-0.853706,0.695146
g,-0.530764,-1.092332,-0.394616,1.309485
h,-0.444328,0.092078,-0.408571,-0.517995
d,0.908033,-0.274317,-1.362896,-1.279395


In [13]:
# Eight-row demo frame: two string key columns ('A', 'B') and two columns
# of standard-normal noise ('C', 'D').
df = pd.DataFrame({
    'A': 'foo bar foo bar foo bar foo foo'.split(),
    'B': 'one one two three two two one three'.split(),
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

In [39]:
# Sum the remaining columns within each (A, B) pair. The result is stored
# in df2 but not displayed by this cell.
df2 = df.groupby(by=['A', 'B']).sum()
print(df)

# stack() moves the column labels into an inner index level, turning the
# frame into one long Series indexed by (row label, column label).
df.stack()

     A      B         C         D
0  foo    one  2.641527 -0.446129
1  bar    one  0.242199 -0.021328
2  foo    two  0.788467 -0.812746
3  bar  three -0.444548 -0.019022
4  foo    two  1.068423  0.016479
5  bar    two -0.802127  0.195753
6  foo    one  0.449126  0.373463
7  foo  three -0.343600 -1.942340


0  A          foo
   B          one
   C      2.64153
   D    -0.446129
1  A          bar
   B          one
   C     0.242199
   D   -0.0213277
2  A          foo
   B          two
   C     0.788467
   D    -0.812746
3  A          bar
   B        three
   C    -0.444548
   D    -0.019022
4  A          foo
   B          two
   C      1.06842
   D    0.0164794
5  A          bar
   B          two
   C    -0.802127
   D     0.195753
6  A          foo
   B          one
   C     0.449126
   D     0.373463
7  A          foo
   B        three
   C      -0.3436
   D     -1.94234
dtype: object