# Pandas Basics

# Pandas Series

In [2]:
import numpy as np
import pandas as pd

In [6]:
# Sample inputs reused by the Series examples in this section.
labels = ['a', 'b', 'c']        # index labels
my_list = [1, 2, 3]             # plain Python list of values
arr = np.array(my_list)         # the same values as a NumPy array
d = dict(zip(labels, my_list))  # label -> value mapping: {'a': 1, 'b': 2, 'c': 3}

In [7]:
labels

['a', 'b', 'c']

In [8]:
my_list

[1, 2, 3]

In [9]:
arr

array([1, 2, 3])

In [10]:
d

{'a': 1, 'b': 2, 'c': 3}

In [11]:
type(labels)

list

In [14]:
pd.Series(data=my_list)

0    1
1    2
2    3
dtype: int64

In [17]:
# The second argument is the index: each value is paired with the label at the same position.
pd.Series(data=my_list, index=labels)

a    1
b    2
c    3
dtype: int64

In [18]:
type(pd.Series(data=my_list))

pandas.core.series.Series

In [19]:
# A NumPy array works as the data source in the same way as a list.
pd.Series(data=arr, index=labels)

a    1
b    2
c    3
dtype: int32

In [20]:
pd.Series(d)

a    1
b    2
c    3
dtype: int64

In [21]:
pd.Series(data=labels)

0    a
1    b
2    c
dtype: object

In [22]:
pd.Series(data=[sum, list])

0    <built-in function sum>
1             <class 'list'>
dtype: object

In [23]:
# Four values labelled 'a' through 'd'.
ser1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])

In [24]:
ser1

a    1
b    2
c    3
d    4
dtype: int64

In [25]:
# Same shape as ser1, but with label 'e' (value 5) in place of 'c'.
ser2 = pd.Series([1, 2, 5, 4], index=['a', 'b', 'e', 'd'])

In [26]:
ser2

a    1
b    2
e    5
d    4
dtype: int64

In [27]:
ser1['a']

1

In [28]:
type(ser1['a'])

numpy.int64

In [29]:
ser3 = pd.Series(data=labels)

In [30]:
ser3

0    a
1    b
2    c
dtype: object

In [31]:
ser3[2]

'c'

In [34]:
# Addition is matched up by index label, not by position. Labels 'c' and 'e' exist in
# only one of the two Series, so their result is NaN and the output dtype becomes float64.
ser1 + ser2

a    2.0
b    4.0
c    NaN
d    8.0
e    NaN
dtype: float64

# Pandas Dataframes

In [37]:
from numpy.random import randn

In [39]:
np.random.seed(101)  # Fix the seed so the random values generated below are the same on every run

In [40]:
df = pd.DataFrame(randn(5,4))

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
0    5 non-null float64
1    5 non-null float64
2    5 non-null float64
3    5 non-null float64
dtypes: float64(4)
memory usage: 288.0 bytes


In [44]:
df.columns

RangeIndex(start=0, stop=4, step=1)

In [46]:
df.describe()

Unnamed: 0,0,1,2,3
count,5.0,5.0,5.0,5.0
mean,0.343858,0.453764,0.452287,0.431871
std,1.681131,1.061385,1.454516,0.594708
min,-2.018168,-0.758872,-0.933237,-0.589001
25%,0.188695,-0.319318,-0.848077,0.503826
50%,0.190794,0.628133,0.528813,0.605965
75%,0.651118,0.740122,0.907969,0.683509
max,2.70685,1.978757,2.605967,0.955057


In [47]:
# head() returns the first five rows by default.
df.head()

Unnamed: 0,0,1,2,3
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


In [48]:
df.corr()

Unnamed: 0,0,1,2,3
0,1.0,-0.076661,0.039035,0.628925
1,-0.076661,1.0,0.983148,-0.263407
2,0.039035,0.983148,1.0,-0.131403
3,0.628925,-0.263407,-0.131403,1.0


In [49]:
# Rebuild df with explicit row labels (A-E) and column labels (W-Z).
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [51]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


<b>Each of the columns W, X, Y and Z is a pandas Series, and they all share a common index.</b>

In [52]:
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [53]:
type(df['W'])

pandas.core.series.Series

In [54]:
type(df)

pandas.core.frame.DataFrame

In [55]:
df.W

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

<b>To return multiple columns, pass a list of column names inside the indexing brackets (hence the double brackets).</b>

In [56]:
df[['W','Y']]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905
C,0.807706,0.638787
D,-0.497104,-0.943406
E,-0.116773,0.238127


In [57]:
type(df[['W','Y']])

pandas.core.frame.DataFrame

<b>As seen above, selecting a single column returns a Series, while selecting multiple columns returns a DataFrame.</b>

In [58]:
df['NEW'] = df['W'] + df['Y']

In [59]:
df

Unnamed: 0,W,X,Y,Z,NEW
A,0.302665,1.693723,-1.706086,-1.159119,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,0.032064
C,0.807706,0.07296,0.638787,0.329646,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,0.121354


In [61]:
# axis=1 tells drop to look for 'NEW' among the columns. The result is a new DataFrame;
# df itself still contains 'NEW', as the next cell shows.
df.drop('NEW', axis=1)

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [62]:
df

Unnamed: 0,W,X,Y,Z,NEW
A,0.302665,1.693723,-1.706086,-1.159119,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,0.032064
C,0.807706,0.07296,0.638787,0.329646,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,0.121354


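<b>By default (inplace=False), drop returns a new DataFrame and leaves the original unchanged, so that data is not lost accidentally. Pass inplace=True to modify the DataFrame itself.</b>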

In [63]:
# inplace=True removes the 'NEW' column from df itself instead of returning a modified copy.
df.drop('NEW', axis=1, inplace=True)

In [64]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [65]:
df.drop('E')

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [66]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [67]:
# No axis is given, so 'E' is looked up among the row labels; inplace=True removes that row from df itself.
df.drop('E', inplace=True)

In [68]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [69]:
df.shape

(4, 4)

In [70]:
df.loc['A']

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

<b>Both rows and columns are pandas Series.</b>

In [72]:
df.loc[['A','B']]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502


In [74]:
df.iloc[2]

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

In [75]:
df.iloc[[0,1]]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502


<b>Use loc for label-based indexing and iloc for integer position-based indexing.</b>

In [76]:
df.loc['B','Y']

0.16690463609281317

<b>Conditional selection</b>

In [77]:
booldf = df>0

In [78]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,False,True,True,True
C,True,True,True,True
D,False,False,False,True


In [79]:
df[booldf]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,,
B,,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,,,,0.484752


In [80]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [81]:
df['W']>0

A     True
B    False
C     True
D    False
Name: W, dtype: bool

In [82]:
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
Name: W, dtype: float64

In [83]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
C,0.807706,0.07296,0.638787,0.329646


In [84]:
df[df['Z']>0]

Unnamed: 0,W,X,Y,Z
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [85]:
df[df['Z']<0]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119


In [87]:
# Select column 'X' for the rows where Z > 0 in a single .loc call.
# This avoids chained indexing (df[mask]['X']), which builds an intermediate
# DataFrame and is unsafe as soon as the expression is used for assignment.
df.loc[df['Z'] > 0, 'X']

B    0.390528
C    0.072960
D   -0.754070
Name: X, dtype: float64

In [88]:
# Select columns 'X' and 'Y' for the rows where Z > 0 in a single .loc call
# rather than chaining two separate [] selections.
df.loc[df['Z'] > 0, ['X', 'Y']]

Unnamed: 0,X,Y
B,0.390528,0.166905
C,0.07296,0.638787
D,-0.75407,-0.943406


In [90]:
df[(df['Z']>0) & (df['X']>0)]

Unnamed: 0,W,X,Y,Z
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646


<b>Use the ampersand (&) to combine multiple conditions instead of the and keyword, and wrap each condition in parentheses.</b>

In [92]:
df[(df['Z']>0) | (df['X']>0)]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [93]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [96]:
# Moves the current index (A-D) into a regular column named 'index' and numbers the rows 0..3.
# The result is a new DataFrame; df itself is unchanged, as the next cell shows.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,0.302665,1.693723,-1.706086,-1.159119
1,B,-0.134841,0.390528,0.166905,0.184502
2,C,0.807706,0.07296,0.638787,0.329646
3,D,-0.497104,-0.75407,-0.943406,0.484752


In [97]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752


In [98]:
# One new label per row of df.
newind = ['AB', 'CD', 'EF', 'GH']

In [99]:
newind

['AB', 'CD', 'EF', 'GH']

In [100]:
df['States'] = newind

In [101]:
df

Unnamed: 0,W,X,Y,Z,States
A,0.302665,1.693723,-1.706086,-1.159119,AB
B,-0.134841,0.390528,0.166905,0.184502,CD
C,0.807706,0.07296,0.638787,0.329646,EF
D,-0.497104,-0.75407,-0.943406,0.484752,GH


In [102]:
# Uses the 'States' column as the row index in the returned DataFrame.
# df itself keeps its A-D index and the 'States' column, as the next cell shows.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AB,0.302665,1.693723,-1.706086,-1.159119
CD,-0.134841,0.390528,0.166905,0.184502
EF,0.807706,0.07296,0.638787,0.329646
GH,-0.497104,-0.75407,-0.943406,0.484752


In [104]:
df

Unnamed: 0,W,X,Y,Z,States
A,0.302665,1.693723,-1.706086,-1.159119,AB
B,-0.134841,0.390528,0.166905,0.184502,CD
C,0.807706,0.07296,0.638787,0.329646,EF
D,-0.497104,-0.75407,-0.943406,0.484752,GH
