# Getting started with pandas

In [40]:
import numpy as np
import pandas as pd

## Playing around with Series

To start, create a series from the following dict:

In [41]:
d = dict({'a': 'Poisson', 'b': 'Binomial', 'c': 'Geometric', 'd': 'Logistic'})
d

{'a': 'Poisson', 'b': 'Binomial', 'c': 'Geometric', 'd': 'Logistic'}

In [42]:
s = pd.Series(d)
s

a      Poisson
b     Binomial
c    Geometric
d     Logistic
dtype: object

Convert the function names to lower case:

In [4]:
s = s.map(lambda x: x.lower())

Now sort the series by value:

In [5]:
s.sort_values(inplace = True)

In [6]:
s

b     binomial
c    geometric
d     logistic
a      poisson
dtype: object

In [7]:
s

b     binomial
c    geometric
d     logistic
a      poisson
dtype: object

## DataFrame: Indexing

Create a 2-dimensional DataFrame with values from a standard normal distribution, in 6 rows and 3 columns, the column names being 'col1', 'col2' and 'col3', and the rows indexed by month names. 

In [8]:
df = pd.DataFrame(np.random.randn(18).reshape(6,3), 
                  index = ['jan', 'feb', 'march', 'april', 'mai', 'june'],
                  columns = ['col1', 'col2', 'col3'])
df

Unnamed: 0,col1,col2,col3
jan,0.551377,0.858104,-0.387179
feb,-0.369639,0.948974,-0.076784
march,0.34649,-0.66658,0.390844
april,-1.212721,1.513246,-0.427556
mai,1.103943,-0.720872,-1.227869
june,1.285799,-0.845788,0.850882


Now, display 
- col1 only
- march only
- col1 of march only
- row 2 only (row 1 in 0-based thinking)
- row 2, col2 only

In [9]:
df['col1']
# or: df.loc[:,'col1']

jan      0.551377
feb     -0.369639
march    0.346490
april   -1.212721
mai      1.103943
june     1.285799
Name: col1, dtype: float64

In [10]:
df.loc['march']

col1    0.346490
col2   -0.666580
col3    0.390844
Name: march, dtype: float64

In [11]:
df.loc['march', 'col1']

0.34648968953574677

In [12]:
df.iloc[1]

col1   -0.369639
col2    0.948974
col3   -0.076784
Name: feb, dtype: float64

In [13]:
df.ix[1, 'col2']
# df.iloc[1,1]

0.94897392787452739

Now add 10 to col2, and multiply march by 7:

In [14]:
df.loc['march'] = df.loc['march'] * 7
df

Unnamed: 0,col1,col2,col3
jan,0.551377,0.858104,-0.387179
feb,-0.369639,0.948974,-0.076784
march,2.425428,-4.66606,2.735905
april,-1.212721,1.513246,-0.427556
mai,1.103943,-0.720872,-1.227869
june,1.285799,-0.845788,0.850882


In [15]:
df['col2'] = df['col2'] + 10
df

Unnamed: 0,col1,col2,col3
jan,0.551377,10.858104,-0.387179
feb,-0.369639,10.948974,-0.076784
march,2.425428,5.33394,2.735905
april,-1.212721,11.513246,-0.427556
mai,1.103943,9.279128,-1.227869
june,1.285799,9.154212,0.850882


Now display statistical summary values:

In [16]:
df.describe()

Unnamed: 0,col1,col2,col3
count,6.0,6.0,6.0
mean,0.630698,9.514601,0.244567
std,1.286805,2.259215,1.392593
min,-1.212721,5.33394,-1.227869
25%,-0.139385,9.185441,-0.417462
50%,0.82766,10.068616,-0.231981
75%,1.240335,10.926256,0.618965
max,2.425428,11.513246,2.735905


Let's shortly explore joins. 

In [17]:
df2 = pd.DataFrame(np.random.randn(18).reshape(6,3), 
                  index = ['jan', 'feb', 'march', 'mai', 'june', 'july'],
                  columns = ['col2', 'col3', 'col4'])
df2

Unnamed: 0,col2,col3,col4
jan,-1.801662,-1.064133,0.967655
feb,0.856567,-1.062014,-1.141427
march,-0.196388,0.633393,-0.831512
mai,-1.808917,0.541732,0.381226
june,0.118262,-1.687769,-1.122145
july,0.923098,-1.454037,0.431288


Use join() to join df and df2 on the index:

In [18]:
# specify how=<...> to change the default join mode from 'left' to e.g. 'inner'
# as we have identically named columns in both dataframes, we need to specify suffixes
j = df.join(df2, lsuffix = 'orig', rsuffix = 'other', how = 'inner')
j

Unnamed: 0,col1,col2orig,col3orig,col2other,col3other,col4
jan,0.551377,10.858104,-0.387179,-1.801662,-1.064133,0.967655
feb,-0.369639,10.948974,-0.076784,0.856567,-1.062014,-1.141427
march,2.425428,5.33394,2.735905,-0.196388,0.633393,-0.831512
mai,1.103943,9.279128,-1.227869,-1.808917,0.541732,0.381226
june,1.285799,9.154212,0.850882,0.118262,-1.687769,-1.122145


Now use merge() to join on columns, doing a database-style join.
To get a nonempty result, we first make some modifications to our dataframes.

In [30]:
df = df.applymap(round)
df

Unnamed: 0,col1,col2,col3
jan,1,11,0
feb,0,11,0
march,2,5,3
april,-1,12,0
mai,1,9,-1
june,1,9,1


In [31]:
df2 = df2.applymap(round)
df2

Unnamed: 0,col2,col3,col4
jan,-2,-1,1
feb,1,-1,-1
march,0,1,-1
mai,-2,1,0
june,0,-2,-1
july,1,-1,0


Now join the modified dataframes on df.col1 == df2.col4.

In [35]:
m = df.merge(df2, left_on='col1', right_on='col4')
m

Unnamed: 0,col1,col2_x,col3_x,col2_y,col3_y,col4
0,1,11,0,-2,-1,1
1,1,9,-1,-2,-1,1
2,1,9,1,-2,-1,1
3,0,11,0,-2,1,0
4,0,11,0,1,-1,0
5,-1,12,0,1,-1,-1
6,-1,12,0,0,1,-1
7,-1,12,0,0,-2,-1


In [43]:
df3 = pd.DataFrame({'col1': ['blue', 'green', 'blue', 'cyan', 'yellow', 'cyan'],
                    'col2': ['yes', 'no', 'no', 'yes', 'no', 'no'],
                    'col3': np.random.randn(100, 20, 6)})

Exception: Data must be 1-dimensional

In [37]:
df

Unnamed: 0,col1,col2,col3
jan,1,11,0
feb,0,11,0
march,2,5,3
april,-1,12,0
mai,1,9,-1
june,1,9,1


In [39]:
df.groupby('col1').mean()

Unnamed: 0_level_0,col2,col3
col1,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,12.0,0
0,11.0,0
1,9.666667,0
2,5.0,3
