In [1]:
import numpy as np
import pandas as pd

In [2]:
from pandas import Series, DataFrame

In [3]:

# A Series can be built from nothing more than a list of values;
# pandas then supplies a default integer index.
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
# .values exposes the Series data as a NumPy array (the index is not included).
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
# No index was given when obj was created, so it carries a default RangeIndex.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
# Pass index= to attach a label to every data point instead of
# relying on the default integer positions.
obj2 = pd.Series(
    [4, 7, -5, 3],
    index=['a', 'b', 'c', 'd'],
)
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [7]:
# With explicit labels the index is an Index object holding those labels.
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [8]:
# Unlike a plain NumPy array, a Series can be indexed by label.
# Here a single label selects a single value; a list of labels
# would select several at once.
obj2['d']

3

In [9]:
# Overwrite the value stored under label 'c'; the index labels themselves
# are unchanged. This mutates obj2 in place, so later cells see c == 6.
obj2['c']=6
obj2

a    4
b    7
c    6
d    3
dtype: int64

In [10]:
# Boolean-mask selection: keep only the entries whose value is positive.
# The index labels travel with the selected values.
obj2[obj2>0]

a    4
b    7
c    6
d    3
dtype: int64

In [11]:
# Scalar arithmetic is applied element-wise and keeps the index labels.
obj2*2

a     8
b    14
c    12
d     6
dtype: int64

In [12]:
# NumPy functions can be applied directly to a Series: exp is computed
# element-wise and the result is still a Series carrying obj2's index labels.
np.exp(obj2)

a      54.598150
b    1096.633158
c     403.428793
d      20.085537
dtype: float64

In [13]:
#Another way to think about a Series is as a fixed-length, ordered dict,
#since it maps index values to data values. The next cells use it like a dict.


In [14]:
# Like a dict, `in` tests against the index labels (the "keys"), not the values.
'b' in obj2

True

In [15]:
# 'e' is not one of obj2's index labels, so the membership test is False.
'e' in obj2

False

In [16]:
# A dict can be passed straight to pd.Series: its keys become the
# index labels and its values become the data.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [17]:
# Build the Series from the same dict but with an explicit label list.
# 'California' has no entry in sdata, so its value is NaN; 'Utah' is
# absent from states, so it is left out of the result.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [18]:
# Use pandas' isnull / notnull functions to detect missing data.
# isnull returns a boolean Series that is True where the value is missing (NaN).
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [19]:
# notnull is the element-wise opposite of isnull: True where a value is present.
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [20]:
# The same checks are also available as Series instance methods:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [21]:
# Redisplay obj3 (includes Utah, has no California) before combining it with obj4.
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [22]:
# Redisplay obj4 (California is NaN, has no Utah) before combining it with obj3.
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [23]:
# Arithmetic between two Series aligns on index labels. A label that is
# missing (or NaN) in either operand yields NaN in the result, and the
# result's index is the union of both indexes.
obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [24]:
# Both the Series and its index carry a name attribute; once set,
# each name appears in the displayed output.
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [25]:
# A Series's index can be replaced in place by assignment.
# Show obj with its original default index first, for comparison.
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [26]:
# Assigning a list of labels replaces the index in place;
# the values keep their original positions.
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryen']
obj

Bob      4
Steve    7
Jeff    -5
Ryen     3
dtype: int64

In [27]:
# One of the most common ways to build a DataFrame is from a dict of
# equal-length lists (or NumPy arrays): each key becomes a column.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [28]:
# For large DataFrames, head() shows just the first rows (five by default):
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [29]:
# The columns= argument fixes the column order of the resulting DataFrame.
pd.DataFrame(
    data,
    columns=['year', 'state', 'pop'],
)


Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [30]:
# A requested column that is not a key of the dict ('debt' here) is still
# created, filled entirely with missing values. index= supplies row labels.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [31]:
# The column labels are themselves stored as an Index object.
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [32]:
# A DataFrame column can be retrieved as a Series either with dict-like
# notation (shown here) or by attribute access. The returned Series keeps
# the frame's row labels and is named after the column.
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [33]:
# Attribute-style access to the same kind of column Series.
# Caveat: this only works when the column name is a valid Python identifier
# that does not collide with a DataFrame attribute or method -- e.g. the
# 'pop' column must be read as frame2['pop'], because frame2.pop is a method.
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [38]:
# GroupBy mechanics: a small frame with two key columns and two numeric columns.
# The random columns come from a seeded generator so that every
# Restart & Run All produces the same frame (and the same group means).
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': rng.standard_normal(5),
    'data2': rng.standard_normal(5),
})
df


Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.642681,0.489789
1,a,two,-1.120953,0.023136
2,b,one,0.159793,-0.251635
3,b,two,-0.999209,-0.947539
4,a,one,1.433256,-1.165136


In [41]:
# Split the data1 column by the values in key1. Displaying the result
# shows only a SeriesGroupBy object; an aggregation such as mean() is
# needed to get numbers out of it.
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002236A9AA820>

In [43]:
# mean() aggregates each group: the average of data1 for every key1 value.
grouped.mean()

key1
a   -0.110126
b   -0.419708
Name: data1, dtype: float64

In [64]:
# Grouping by a list of two key Series gives one mean per (key1, key2)
# pair; the result is a Series with a two-level index.
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)
means

key1  key2
a     one     0.395287
      two    -1.120953
b     one     0.159793
      two    -0.999209
Name: data1, dtype: float64

In [65]:
# unstack() moves the inner index level (key2) into columns, turning the
# two-level Series into a key1-by-key2 table.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.395287,-1.120953
b,0.159793,-0.999209
