In [3]:
import pandas as pd
import numpy as np

# Series

In [19]:
# Build a labelled Series from a plain list of values and a parallel list of index labels.
mylabel = ['a', 'b', 'c', 'd']
mydata = [10, 20, 30, 40]

ser1 = pd.Series(mydata, index=mylabel)
ser1

a    10
b    20
c    30
d    40
dtype: int64

In [21]:
# Values for the next Series, held in a NumPy array rather than a Python list.
arr = np.array([19, 20, 30, 41])

In [22]:
# No index is given, so the Series gets the default integer labels 0..3.
pd.Series(arr)

0    19
1    20
2    30
3    41
dtype: int64

In [23]:
# Source mapping for a dictionary-backed Series: label -> value.
d = dict(a=11, b=12, c=14)
d

{'a': 11, 'b': 12, 'c': 14}

In [24]:
# The dict keys become the index labels and the dict values become the data.
pd.Series(data=d)

a    11
b    12
c    14
dtype: int64

In [25]:
# Look up a single value in ser1 by its index label

ser1['a']

10

In [26]:
# Label-based lookup on ser1: the value stored under 'b'
ser1['b']

20

# DataFrame

In [27]:
# randn is imported by name so it can be called without the np.random prefix
from numpy.random import randn

# Seed NumPy's global random generator so the random numbers drawn below are repeatable
np.random.seed(101)

In [28]:
# 5x4 frame of random values: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [29]:
# Selection and indexing: square brackets with a column label return that column

df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [30]:
# A single selected column is a pandas Series
type(df['W'])

pandas.core.series.Series

In [34]:
# Row selection by index label with .loc; the row comes back as a Series
# whose index is the column names
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [35]:
# Derive column 'New' as the element-wise sum of columns 'W' and 'Y', then show the frame.
df['New'] = df['W'].add(df['Y'])
df

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [36]:
# Remove row 'E'. axis=0 targets the index (rows); axis=1 would target the columns.
# The result is rebound to df rather than mutating the frame with inplace=True.
df = df.drop('E', axis=0)
df

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542


In [37]:
# shape is a (rows, columns) tuple: position 0 is the index axis and position 1 the
# column axis, which is the same numbering the axis=0 / axis=1 arguments use
df.shape

(4, 5)

In [38]:
# Conditional selection: show the current frame before filtering it

df

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542


In [40]:
# Boolean mask: True for each row whose 'Z' value is greater than zero.
df['Z'] > 0

A     True
B     True
C    False
D     True
Name: Z, dtype: bool

In [41]:
# Keep only the rows where the 'Z' mask is True.
df.loc[df['Z'] > 0]

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542


In [42]:
# Column 'Y' for the rows where 'Z' is positive, selected in a single .loc call
# (row mask, column label) instead of two chained [] lookups.
df.loc[df['Z'] > 0, 'Y']

A    0.907969
B   -0.848077
D   -0.933237
Name: Y, dtype: float64

In [45]:
# Index reset: the old index becomes a regular 'index' column and rows are renumbered 0..n-1.

# The result is only displayed; df itself keeps its A-D index unless inplace=True is
# passed or the result is assigned back.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z,New
0,A,2.70685,0.628133,0.907969,0.503826,3.614819
1,B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
2,C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
3,D,0.188695,-0.758872,-0.933237,0.955057,-0.744542


In [48]:
# Four state codes, one per remaining row of df.
newind = ['CA', 'NY', 'WY', 'OR']
newind

['CA', 'NY', 'WY', 'OR']

In [49]:
# The list is assigned row by row, so len(newind) must equal the number of rows in df;
# a length mismatch makes pandas raise an error.
df['newind'] = newind

In [50]:
# Show df with the newly added 'newind' column
df

Unnamed: 0,W,X,Y,Z,New,newind
A,2.70685,0.628133,0.907969,0.503826,3.614819,CA
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959,NY
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355,WY
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542,OR


In [51]:
# Passing the Series df['newind'] (rather than the column name) uses its values as the
# new index while leaving 'newind' in place as a regular column, as the output shows.
# The result is only displayed here; it is not assigned back to df.
df.set_index(df['newind'])

Unnamed: 0_level_0,W,X,Y,Z,New,newind
newind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,2.70685,0.628133,0.907969,0.503826,3.614819,CA
NY,0.651118,-0.319318,-0.848077,0.605965,-0.196959,NY
WY,-2.018168,0.740122,0.528813,-0.589001,-1.489355,WY
OR,0.188695,-0.758872,-0.933237,0.955057,-0.744542,OR


In [52]:
# MultiIndex and hierarchy in dataframe

# Index levels: an outer group label paired position-by-position with an inner number.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [53]:
# Inspect the MultiIndex built from the (outside, inside) pairs
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [57]:
# 6x2 frame of random values indexed by the two-level hier_index.
# NOTE: this rebinds `df`, replacing the frame used in the cells above.
df = pd.DataFrame(
    np.random.randn(6, 2),
    index=hier_index,
    columns=list('AB'),
)
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.497104,-0.75407
G1,2,-0.943406,0.484752
G1,3,-0.116773,1.901755
G2,1,0.238127,1.996652
G2,2,-0.993263,0.1968
G2,3,-1.136645,0.000366


In [58]:
# Names of the two index levels; both are still unnamed (None) at this point
df.index.names

FrozenList([None, None])

In [59]:
# Label the two index levels: outer level 'Group', inner level 'Num'
df.index.names = ['Group','Num']

df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.497104,-0.75407
G1,2,-0.943406,0.484752
G1,3,-0.116773,1.901755
G2,1,0.238127,1.996652
G2,2,-0.993263,0.1968
G2,3,-1.136645,0.000366


In [60]:
# Select every row under outer label 'G1'; the result is indexed by the inner 'Num' level
df.loc['G1']

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.497104,-0.75407
2,-0.943406,0.484752
3,-0.116773,1.901755


In [61]:
# Two chained .loc calls: take the 'G1' group first, then the row labelled 1 inside it
df.loc['G1'].loc[1]

A   -0.497104
B   -0.754070
Name: 1, dtype: float64

In [62]:
# Single-cell lookup: row ('G1', 1), column 'B', resolved in one .loc call
# instead of three chained indexing steps.
df.loc[('G1', 1), 'B']

-0.7540697010400628

In [63]:
df.xs('G1')  # xs takes a cross-section by label; this returns the same rows as df.loc['G1']

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.497104,-0.75407
2,-0.943406,0.484752
3,-0.116773,1.901755


In [65]:
# Cross-section on both index levels at once: outer 'G1', inner 2.
# The key is a tuple because list keys for xs are deprecated and are rejected by
# newer pandas releases.
df.xs(('G1', 2))

A   -0.943406
B    0.484752
Name: (G1, 2), dtype: float64

In [66]:
# Cross-section on the inner level: the rows whose 'Num' is 1, one per group
df.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.497104,-0.75407
G2,0.238127,1.996652
