In [9]:
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [14]:
#you dont need to specify elem labels, will index from zero automatically.
s0 = pd.Series(np.random.randn(5))
s0

#you can pad every cell with just one element (like 1 or 0)
s00 = pd.Series(1, index = [1,2,3,4,5])
s00

#an array of 5 random values passed in
s1 = pd.Series(np.random.randn(5), index = [1,2,3,4,5])
s1

#Lists are OK too.
s2 = pd.Series(['a','b','c','d','e'], index = [1,2,3,4,5])
s2

#you can use a 1 to 1 dictionary to reduce the arguments by one.
s3 = pd.Series({1:2,2:3,3:4})
s3



0    0.440010
1   -0.650282
2   -0.463790
3    0.246821
4    0.182956
dtype: float64

1    1
2    1
3    1
4    1
5    1
dtype: int64

1    1.576621
2   -0.869742
3   -0.516554
4    2.246195
5    2.224148
dtype: float64

1    a
2    b
3    c
4    d
5    e
dtype: object

1    2
2    3
3    4
dtype: int64

In [18]:
#what is the type of these things? Are they just n x 2 numpy arrays?
type(s2)

#We can access the underlying array if we want.
type(s2.array)

#a test of data alignment: 
s0.array + s1.array

#doing it within the Series datastructure: formatting and indicies are provided by a toStr() like method.
s0 + s1

pandas.core.series.Series

pandas.core.arrays.numpy_.PandasArray

<PandasArray>
[ 2.0166313537028118, -1.5200239474967334, -0.9803443836503829,
   2.493015872019933,   2.407103702152579]
Length: 5, dtype: float64

0         NaN
1    0.926339
2   -1.333532
3   -0.269733
4    2.429150
5         NaN
dtype: float64

In [27]:
#series is also dictionary like:

#direct access
s0[1]

#the index labels act like dictionary keys; we can use the in operator.
1 in s0

#if this was numpy arrays, we would have an error thrown. Pandas uses the dict properties
#of the series in order to 
dAexample = pd.Series(np.random.randn(7),index=['a',2,3,'b','c', 5, 1])
hold = s1 + dAexample
hold

newSeries = hold.rename("abc")
hold.name = "123"

#Not the same animal - a copy has been made.
id(newSeries) == id(hold)

-0.6502822583368477

True

1    1.501605
2   -0.298680
3   -1.047409
4         NaN
5    2.785154
a         NaN
b         NaN
c         NaN
dtype: float64

False

Next lets look at Data Frames:

In [41]:
#If our labelling and dimensions are not the same, pandas will
#pad with NaNs
d0 = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df0 = pd.DataFrame(d0)
df0

#what if we dont have matching indicies?
#it makes a double length DF and pads with NaNs!
d1 = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1., 2., 3.], index=['d', 'e', 'f'])}
df1 = pd.DataFrame(d1)
df1

#what if we put in crude lists:
#we get default indexing for the rows.
d2 = {'p':[1,2,3],'q':[4,5,6]}
df2 = pd.DataFrame(d2)
df2

#what if we mix structures?
#the indicies that were provided were used for all.
d3 = {'p':[1,2,3],'q':pd.Series([1., 2., 3.], index=['a', 'b', 'c']),'r':np.array([4,5,6])}
df3 = pd.DataFrame(d3)
df3

#finally: same as d3 but with different lengths with no indicies
#Error! arrays must be same length
#there is too much sloppyness to compensate for. pandas
#wont infer/impute for you.
d4 = {'p':[1,2,3,4,5],'q':pd.Series([1., 2., 3.], index=['a', 'b', 'c']),'r':[10,11,12]}
df4 = pd.DataFrame(d4)
df4





Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


Unnamed: 0,one,two
a,1.0,
b,2.0,
c,3.0,
d,,1.0
e,,2.0
f,,3.0


Unnamed: 0,p,q
0,1,4
1,2,5
2,3,6


Unnamed: 0,p,q,r
a,1,1.0,4
b,2,2.0,5
c,3,3.0,6


ValueError: arrays must all be same length

In [45]:
#if you have old labels in data that you don't want used
#you can specify indicies and columns at the DataFrame method
#directly.
#Notice that Pandas no longer takes a union if dimensional information
#can be deduced from other information. the 'd' element is no longer present


d0 = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df0 = pd.DataFrame(d0,index=['d', 'b', 'a'], columns=['two', 'three'])
df0

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [47]:
#Ndarrays, lists
#now they must all be the same length (pandas won't take unions)
#and pad.

#List of Dics: Yet another way to do it.
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(data2, index=['first','second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [63]:
#Insertions and Deletions:
d0 = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df0 = pd.DataFrame(d0)


#making a derivative column from other dfs
df0['three'] = df0['one']*df0['two']
df0

#using an inequality to filter the DF
df0['flag'] = df0['two'] > 1.0
df0 

#Using a scalar value to pad a new column, just like as a series:
df0["four"] = 55

#you can dump rows in two different ways:
del df0['flag']
hold = df0.pop('three')
df0

#Notice that we can insert subsets of series, and it will
#pad with nan:
df0['padd'] = df0['one'][:2]
df0

#columns are inserted at the end of the DF by default.

#the assign method: this allows you to insert new derived columns.
#it changes the underlying dataframe so that you can use the 
#new columns in chained calls down the line.
#returns a copy, never touches original data 

hold = df0.assign(eight=lambda x: x['one']*x['four'])
id(df0) == id(hold)
print("df0 looks like:")
df0["one"]["a"] = 1000
df0
print("hold looks like:")
hold

#a new data frame was derived! Does not reference the original

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
b,2.0,2.0,4.0
c,3.0,3.0,9.0
d,,4.0,


Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,True
c,3.0,3.0,9.0,True
d,,4.0,,True


Unnamed: 0,one,two,four
a,1.0,1.0,55
b,2.0,2.0,55
c,3.0,3.0,55
d,,4.0,55


Unnamed: 0,one,two,four,padd
a,1.0,1.0,55,1.0
b,2.0,2.0,55,2.0
c,3.0,3.0,55,
d,,4.0,55,


False

df0 looks like:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,one,two,four,padd
a,1000.0,1.0,55,1.0
b,2.0,2.0,55,2.0
c,3.0,3.0,55,
d,,4.0,55,


hold looks like:


Unnamed: 0,one,two,four,padd,eight
a,1.0,1.0,55,1.0,55.0
b,2.0,2.0,55,2.0,110.0
c,3.0,3.0,55,,165.0
d,,4.0,55,,


In [67]:
#Types of selections:
d0 = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df0 = pd.DataFrame(d0)
df0
#single entry/cell:
type(df0["one"]["a"])


Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


numpy.float64

In [72]:
#Next, lets Look in to Grouping, and the Split-Apply-Combine Paradigm
df = pd.DataFrame([('bird', 'Falconiformes', 389.0),
('bird', 'Psittaciformes', 24.0),
('mammal', 'Carnivora', 80.2),
('mammal', 'Primates', np.nan),
('mammal', 'Carnivora', 58)],
index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'],
columns=('class', 'order', 'max_speed'))
df

#a grouping consists of keys and values.
#here, class is the key, and everything else is the value
grouped = df.groupby("class")
grouped2 = df.groupby('order', axis='columns')


Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [85]:
#GroupBy Sorting:
df2 = pd.DataFrame({'X': ['B', 'B', 'A', 'A'], 'Y': [1, 2, 3, 4]})
df2
grouped = df2.groupby(['X'])
#There is no proper funciton for grouping. So create one later.
#also note that groupings will be sorted by default.

print(grouped["X"]) #no display on its own. You need to pull 
#keys and values.
#Group by returns a valid mapping. It does not change the original 
#DF.
for key, item in grouped:
    print(key)
    print(grouped.get_group(key), "\n\n")
    
#all other attributes are mapped to the keys, including col and row indicies
grouped.groups



Unnamed: 0,X,Y
0,B,1
1,B,2
2,A,3
3,A,4


<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f9ed055c780>
A
   X  Y
2  A  3
3  A  4 


B
   X  Y
0  B  1
1  B  2 




{'A': Int64Index([2, 3], dtype='int64'),
 'B': Int64Index([0, 1], dtype='int64')}

In [91]:
#Multi-Index Grouping:
#Our keys are now tuples. iterating through keys will reveal this.

arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])

df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3],'B': np.arange(8)},
index=index)
df
multigroup = df.groupby(['A','B'])


for name, group in multigroup:
    print(name)
    print(group)




Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


(1, 0)
              A  B
first second      
bar   one     1  0
(1, 1)
              A  B
first second      
bar   two     1  1
(1, 2)
              A  B
first second      
baz   one     1  2
(1, 3)
              A  B
first second      
baz   two     1  3
(2, 4)
              A  B
first second      
foo   one     2  4
(2, 5)
              A  B
first second      
foo   two     2  5
(3, 6)
              A  B
first second      
qux   one     3  6
(3, 7)
              A  B
first second      
qux   two     3  7
