In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [5]:
line = np.linspace(-10, 10, num=20)
print line

[-10.          -8.94736842  -7.89473684  -6.84210526  -5.78947368
  -4.73684211  -3.68421053  -2.63157895  -1.57894737  -0.52631579
   0.52631579   1.57894737   2.63157895   3.68421053   4.73684211
   5.78947368   6.84210526   7.89473684   8.94736842  10.        ]


In [7]:
# Series gives a numerical value to each element of the array
line_series = Series(line)
print line_series

0    -10.000000
1     -8.947368
2     -7.894737
3     -6.842105
4     -5.789474
5     -4.736842
6     -3.684211
7     -2.631579
8     -1.578947
9     -0.526316
10     0.526316
11     1.578947
12     2.631579
13     3.684211
14     4.736842
15     5.789474
16     6.842105
17     7.894737
18     8.947368
19    10.000000
dtype: float64


In [12]:
# Notice how the index numbers are different from the values.
# Index is the "indexed" number while the value is the corresponding value
# linked to the index

print line_series.values, "\n"
print line_series.index

[-10.          -8.94736842  -7.89473684  -6.84210526  -5.78947368
  -4.73684211  -3.68421053  -2.63157895  -1.57894737  -0.52631579
   0.52631579   1.57894737   2.63157895   3.68421053   4.73684211
   5.78947368   6.84210526   7.89473684   8.94736842  10.        ] 

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
            19],
           dtype='int64')


In [16]:
# Get a group of numbers
# Notice the double [[]]
line_series[[1, 3, 5]]

1   -8.947368
3   -6.842105
5   -4.736842
dtype: float64

In [17]:
# Get values > -5
line_series[line_series > -5]

# Notice how it gives the index value as well

5     -4.736842
6     -3.684211
7     -2.631579
8     -1.578947
9     -0.526316
10     0.526316
11     1.578947
12     2.631579
13     3.684211
14     4.736842
15     5.789474
16     6.842105
17     7.894737
18     8.947368
19    10.000000
dtype: float64

In [18]:
# You can also manipulate the  values
line_series * 2

0    -20.000000
1    -17.894737
2    -15.789474
3    -13.684211
4    -11.578947
5     -9.473684
6     -7.368421
7     -5.263158
8     -3.157895
9     -1.052632
10     1.052632
11     3.157895
12     5.263158
13     7.368421
14     9.473684
15    11.578947
16    13.684211
17    15.789474
18    17.894737
19    20.000000
dtype: float64

In [21]:
import numpy as np

# Create an exponential funciton
np.exp(line_series)

0    -10.000000
1     -8.947368
2     -7.894737
3     -6.842105
4     -5.789474
5     -4.736842
6     -3.684211
7     -2.631579
8     -1.578947
9     -0.526316
10     0.526316
11     1.578947
12     2.631579
13     3.684211
14     4.736842
15     5.789474
16     6.842105
17     7.894737
18     8.947368
19    10.000000
dtype: float64 
0         0.000045
1         0.000130
2         0.000373
3         0.001068
4         0.003060
5         0.008766
6         0.025117
7         0.071965
8         0.206192
9         0.590778
10        1.692685
11        4.849848
12       13.895693
13       39.813678
14      114.073401
15      326.840958
16      936.458553
17     2683.123399
18     7687.634601
19    22026.465795
dtype: float64


In [23]:
# You can also turn a dictionary into a series
dict_1 = {'a': 1, 'b': 2, 'c': 3}
dict_series = Series(dict_1)
dict_series

a    1
b    2
c    3
dtype: int64

In [24]:
pd.isnull(dict_1)

False

In [25]:
pd.isnull(dict_series)

a    False
b    False
c    False
dtype: bool

Notice how pandas handles the two differently. Once with the dictionary the other with a series.

In [30]:
# You cannot add dict, but you can add series
dict_2 = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
dict_series_2 = Series(dict_2)
dict_series + dict_series_2 

a     2
b     4
c     6
d   NaN
dtype: float64

In [31]:
dict_series.name = "TestName"

In [32]:
dict_series

a    1
b    2
c    3
Name: TestName, dtype: int64

In [33]:
# Now lets label the indexs
dict_series.index.name = "Index Test Name"

In [34]:
dict_series

Index Test Name
a    1
b    2
c    3
Name: TestName, dtype: int64

In [35]:
# Now lets rename the indexes
dict_series.index = ["first", "second", "third"]
dict_series

first     1
second    2
third     3
Name: TestName, dtype: int64

In [37]:
# Now we're going to be looking at some data frames

data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],
          'year' : [2012, 2013, 2014, 2014, 2015],
          'pop' : [5.0, 5.1, 5.2, 4.0, 4.1]}

df_1 = DataFrame(data_1)
df_1

Unnamed: 0,pop,state,year
0,5.0,VA,2012
1,5.1,VA,2013
2,5.2,VA,2014
3,4.0,MD,2014
4,4.1,MD,2015


In [38]:
# If you want to create the DF with a different order to the columns
df_2 = DataFrame(data_1, columns = ['year', 'state', 'pop'])
df_2

Unnamed: 0,year,state,pop
0,2012,VA,5.0
1,2013,VA,5.1
2,2014,VA,5.2
3,2014,MD,4.0
4,2015,MD,4.1


In [39]:
# What if you want to add another col to the DF which has no data
df_3 = DataFrame(data_1, columns = ['year', 'state', 'pop', 'unempl'])
df_3

Unnamed: 0,year,state,pop,unempl
0,2012,VA,5.0,
1,2013,VA,5.1,
2,2014,VA,5.2,
3,2014,MD,4.0,
4,2015,MD,4.1,


In [42]:
# You can return the column values with dot location or bracket location
print df_3.year, df_3['pop']

0    2012
1    2013
2    2014
3    2014
4    2015
Name: year, dtype: int64 0    5.0
1    5.1
2    5.2
3    4.0
4    4.1
Name: pop, dtype: float64


In [43]:
# You can find the row data by using the .ix operator
df_3.ix[2]

year      2014
state       VA
pop        5.2
unempl     NaN
Name: 2, dtype: object

In [44]:
# Now you can update the NaN colum
df_3['unempl'] = np.arange(5)
df_3

Unnamed: 0,year,state,pop,unempl
0,2012,VA,5.0,0
1,2013,VA,5.1,1
2,2014,VA,5.2,2
3,2014,MD,4.0,3
4,2015,MD,4.1,4


In [46]:
# Lets create a duplicate column
df_3['dup'] = df_3['state']
del df_3['dup']
df_3

Unnamed: 0,year,state,pop,unempl
0,2012,VA,5.0,0
1,2013,VA,5.1,1
2,2014,VA,5.2,2
3,2014,MD,4.0,3
4,2015,MD,4.1,4


In [47]:
# DataFrame will will fill in the blanks
# Notice how 2013 only has one entry, so does 2015
pop = {'VA' : {2013 : 5.1, 2014 : 5.2},
       'MD' : {2014 : 4.0, 2015 : 4.1}}
df_4 = DataFrame(pop)
df_4

Unnamed: 0,MD,VA
2013,,5.1
2014,4.0,5.2
2015,4.1,


In [49]:
# You can also transpose the DF
df_4.T

Unnamed: 0,2013,2014,2015
MD,,4.0,4.1
VA,5.1,5.2,


In [50]:
df_4.columns.name = "Years"
df_4.index.name = "States"
df_4

Years,MD,VA
States,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,,5.1
2014,4.0,5.2
2015,4.1,


In [51]:
df_4.values

array([[ nan,  5.1],
       [ 4. ,  5.2],
       [ 4.1,  nan]])

In [52]:
df_3.values

array([[2012, 'VA', 5.0, 0],
       [2013, 'VA', 5.1, 1],
       [2014, 'VA', 5.2, 2],
       [2014, 'MD', 4.0, 3],
       [2015, 'MD', 4.1, 4]], dtype=object)

In [53]:
df_4.reindex(list(range(2013,2016)))

Years,MD,VA
States,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,,5.1
2014,4.0,5.2
2015,4.1,


In [55]:
df_4.drop('VA', axis=1)

Years,MD
States,Unnamed: 1_level_1
2013,
2014,4.0
2015,4.1


In [57]:
# If you want to see which cell of a column meets a certain standard
df_4[df_4['MD'] == 4.0]

Years,MD,VA
States,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,4,5.2


In [58]:
# Shows a scalar comparison
df_4 >= 4.1

Years,MD,VA
States,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,False,True
2014,False,True
2015,True,False


In [59]:
# Or you can show the actual vales amd the NaN
df_4[df_4 >= 4.1]

Years,MD,VA
States,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,,5.1
2014,,5.2
2015,4.1,


In [60]:
df_4

Years,MD,VA
States,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,,5.1
2014,4.0,5.2
2015,4.1,


In [61]:
df_4.ix[df_4.VA < 5.2]

# Notice how the NaN are gone

Years,MD,VA
States,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,,5.1


In [63]:
# Remember a series is different from a dataframe

np.random.seed(0)
ser_6 = Series(np.random.randn(5), index = ['a', 'b', 'c', 'd', 'e'])
ser_6

a    1.764052
b    0.400157
c    0.978738
d    2.240893
e    1.867558
dtype: float64

In [64]:
np.random.seed(1)
ser_7 = Series(np.random.randn(5), index = ['a', 'c', 'e', 'f', 'g'])
ser_7

a    1.624345
c   -0.611756
e   -0.528172
f   -1.072969
g    0.865408
dtype: float64

In [65]:
ser_6 + ser_7

a    3.388398
b         NaN
c    0.366982
d         NaN
e    1.339386
f         NaN
g         NaN
dtype: float64

In [69]:
# Set a fill value instead of NaN for indices that do not overlap:
# The fill_value adds extra digits to the indicies that do not overlap
ser_6.add(ser_7, fill_value=0)

a    3.388398
b    0.400157
c    0.366982
d    2.240893
e    1.339386
f   -1.072969
g    0.865408
dtype: float64

In [70]:
np.random.seed(10)
# reshape() --> Gives a new shape to an array without changing its data.
df_8 = DataFrame(np.random.rand(9).reshape((3, 3)),
                 columns=['a', 'b', 'c'])
df_8

Unnamed: 0,a,b,c
0,0.771321,0.020752,0.633648
1,0.748804,0.498507,0.224797
2,0.198063,0.760531,0.169111


In [75]:
np.random.seed(11)
# reshape() --> Gives a new shape to an array without changing its data.
df_9 = DataFrame(np.random.rand(9).reshape((3, 3)),
                 columns=['b', 'c', 'd'])
df_9

Unnamed: 0,b,c,d
0,0.18027,0.019475,0.463219
1,0.724934,0.420204,0.485427
2,0.012781,0.487372,0.941807


In [76]:
df_8 + df_9

Unnamed: 0,a,b,c,d
0,,0.201022,0.653123,
1,,1.223441,0.645,
2,,0.773312,0.656482,


In [78]:
df_10 = df_8.add(df_9, fill_value = 0)
df_10

Unnamed: 0,a,b,c,d
0,0.771321,0.201022,0.653123,0.463219
1,0.748804,1.223441,0.645,0.485427
2,0.198063,0.773312,0.656482,0.941807


In [81]:
# The dataFrame now becomes a series
df_10_1 = df_10.ix[0]

In [83]:
df_10 - df_10_1

Unnamed: 0,a,b,c,d
0,0.0,0.0,0.0,0.0
1,-0.022517,1.022419,-0.008123,0.022209
2,-0.573258,0.57229,0.003359,0.478588


In [85]:
# If you want a Series - not at DF - simply write out Series()
ser_10 = Series([100, 200, 300])
ser_10

0    100
1    200
2    300
dtype: int64

In [88]:
# Remember you can remove a scaler from the DF
df_10.sub(ser_10, axis = 0)

Unnamed: 0,a,b,c,d
0,-99.228679,-99.798978,-99.346877,-99.536781
1,-199.251196,-198.776559,-199.355,-199.514573
2,-299.801937,-299.226688,-299.343518,-299.058193


In [90]:
df_11 = np.abs(df_10)
df_11

Unnamed: 0,a,b,c,d
0,0.771321,0.201022,0.653123,0.463219
1,0.748804,1.223441,0.645,0.485427
2,0.198063,0.773312,0.656482,0.941807


In [92]:
func_1 = lambda x: x.max() - x.min()
df_11.apply(func_1)

a    0.573258
b    1.022419
c    0.011482
d    0.478588
dtype: float64

In [93]:
func_2 = lambda x: x.max() - 2
df_11.apply(func_2)

a   -1.228679
b   -0.776559
c   -1.343518
d   -1.058193
dtype: float64

Notice how the lambda function takes in the DF and returns a series

In [94]:
func_3 = lambda x: Series([x.min(), x.max()], index = ['min', 'max'])
df_11.apply(func_3)

Unnamed: 0,a,b,c,d
min,0.198063,0.201022,0.645,0.463219
max,0.771321,1.223441,0.656482,0.941807


In [97]:
# Remember to reduce the number of decimal places you map the functions
# to .3f or .2f or .xf

# This is also applymap() --> works on a element-wise basis on a DataFrame
func_4 = lambda x: '%.4f' %x
df_11.applymap(func_4)

Unnamed: 0,a,b,c,d
0,0.7713,0.201,0.6531,0.4632
1,0.7488,1.2234,0.645,0.4854
2,0.1981,0.7733,0.6565,0.9418


In [98]:
# Now if you just want to run it on a series, you use map()

#map() --> works element-wise on a Series
df_11['a'].map(func_4)

0    0.7713
1    0.7488
2    0.1981
Name: a, dtype: object

In [105]:
ser_10.sort_index()

# You can sort by the index value

0    100
1    200
2    300
dtype: int64

In [107]:
ser_10.sort_values()

# You can sort by the values in the cell

0    100
1    200
2    300
dtype: int64

In [111]:
df_rd = DataFrame(np.arange(25).reshape((5,5)), index = ['b','a','d','f','e'],
                 columns=['second', 'first', 'fifth', 'fourth', 'third'])
df_rd

Unnamed: 0,second,first,fifth,fourth,third
b,0,1,2,3,4
a,5,6,7,8,9
d,10,11,12,13,14
f,15,16,17,18,19
e,20,21,22,23,24


In [119]:
# sort_index --> axis (index, columns to direct sorting), ascending)
# axis = 0 is Y
df_rd.sort_index(axis=0, ascending = True)

Unnamed: 0,second,first,fifth,fourth,third
a,5,6,7,8,9
b,0,1,2,3,4
d,10,11,12,13,14
e,20,21,22,23,24
f,15,16,17,18,19


In [120]:
df_rd.sort_index(axis=1, ascending = True)
# axis = 1 is X

Unnamed: 0,fifth,first,fourth,second,third
b,2,1,3,0,4
a,7,6,8,5,9
d,12,11,13,10,14
f,17,16,18,15,19
e,22,21,23,20,24


In [124]:
ser_11 = Series([1,2,3,5,6,3,7,9,2123,56,234,-234,32])
ser_11 = ser_11.sort_values()
ser_11

11    -234
0        1
1        2
2        3
5        3
3        5
4        6
6        7
7        9
12      32
9       56
10     234
8     2123
dtype: int64

In [125]:
ser_11.rank()

11     1.0
0      2.0
1      3.0
2      4.5
5      4.5
3      6.0
4      7.0
6      8.0
7      9.0
12    10.0
9     11.0
10    12.0
8     13.0
dtype: float64

In [126]:
df_13 = DataFrame({'foo' : [7, -5, 7, 4, 2, 0, 4, 7],
                   'bar' : [-5, 4, 2, 0, 4, 7, 7, 8],
                   'baz' : [-1, 2, 3, 0, 5, 9, 9, 5]})
df_13

Unnamed: 0,bar,baz,foo
0,-5,-1,7
1,4,2,-5
2,2,3,7
3,0,0,4
4,4,5,2
5,7,9,0
6,7,9,4
7,8,5,7


In [128]:
df_13.rank()

Unnamed: 0,bar,baz,foo
0,1.0,1.0,7.0
1,4.5,3.0,1.0
2,3.0,4.0,7.0
3,2.0,2.0,4.5
4,4.5,5.5,3.0
5,6.5,7.5,2.0
6,6.5,7.5,4.5
7,8.0,5.5,7.0


In [132]:
df_13.sum()

bar    27
baz    32
foo    26
dtype: int64

In [135]:
df_13.sum(axis=1)

# This sums the rows, no the columns

0     1
1     1
2    12
3     4
4    11
5    16
6    20
7    20
dtype: int64

In [136]:
data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],
          'year' : [2012, 2013, 2014, 2014, 2015],
          'population' : [5.0, 5.1, 5.2, 4.0, 4.1]}
df_1 = DataFrame(data_1)
df_1

Unnamed: 0,population,state,year
0,5.0,VA,2012
1,5.1,VA,2013
2,5.2,VA,2014
3,4.0,MD,2014
4,4.1,MD,2015


In [139]:
df_1.replace('VA', 'Virginia', inplace=True)
df_1

Unnamed: 0,population,state,year
0,5.0,Virginia,2012
1,5.1,Virginia,2013
2,5.2,Virginia,2014
3,4.0,MD,2014
4,4.1,MD,2015


In [140]:
# What if you want to just edit one column
df_1.replace({'state' : { 'MD' : 'Maryland' }}, inplace=True)
df_1

Unnamed: 0,population,state,year
0,5.0,Virginia,2012
1,5.1,Virginia,2013
2,5.2,Virginia,2014
3,4.0,Maryland,2014
4,4.1,Maryland,2015


In [141]:
df_2 = df_1.drop('population', axis = 1)
df_2

Unnamed: 0,state,year
0,Virginia,2012
1,Virginia,2013
2,Virginia,2014
3,Maryland,2014
4,Maryland,2015


In [142]:
# Concat with pandas adds more rows to the bottom of one of the df

data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'],
          'year' : [2012, 2013, 2014, 2014, 2015],
          'population' : [6.0, 6.1, 6.2, 3.0, 3.1]}
df_3 = DataFrame(data_2)
df_3

Unnamed: 0,population,state,year
0,6.0,NY,2012
1,6.1,NY,2013
2,6.2,NY,2014
3,3.0,FL,2014
4,3.1,FL,2015


In [143]:
df_concat = pd.concat([df_3, df_2])
df_concat

Unnamed: 0,population,state,year
0,6.0,NY,2012
1,6.1,NY,2013
2,6.2,NY,2014
3,3.0,FL,2014
4,3.1,FL,2015
0,,Virginia,2012
1,,Virginia,2013
2,,Virginia,2014
3,,Maryland,2014
4,,Maryland,2015
