In [1]:
#Author:Prateek Kumar
#Welcome to Pandas Overview
#This notebook provides a quick overview of some of the functionalities of Pandas

In [3]:
##Creating an object in Pandas:
import pandas as pd 
import numpy as np
series = pd.Series([10,20,30,40,np.nan,50])
series

0    10.0
1    20.0
2    30.0
3    40.0
4     NaN
5    50.0
dtype: float64

In [10]:
##Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:
#Creating the range:
dates = pd.date_range('20190101', periods=5)
dates
#Creating the dataframe:
df = pd.DataFrame(np.random.randn(5,5), index=dates, columns=list('ABCDE'))
df

Unnamed: 0,A,B,C,D,E
2019-01-01,0.006935,0.461974,1.051642,-0.497084,0.522261
2019-01-02,-1.077388,0.217421,-1.497766,-1.385251,0.554756
2019-01-03,0.244553,-2.501543,0.141142,-0.335015,0.164692
2019-01-04,-1.896036,0.397065,1.354163,0.568667,0.400316
2019-01-05,1.735372,-0.337229,-0.384871,-0.590829,-1.581412


In [17]:
##Creating a DataFrame by passing a dict of objects that can be converted to series-like.
df2 = pd.DataFrame({'A': pd.Series(1, range(4), dtype='float32'),
                   'B': pd.Timestamp('20190101'),
                   'C': 1.0,
                   'D': np.array([3]*4),
                   'E': pd.Categorical(["Test","Train","Test","Train"]),
                   'F': 'foo'})
df2


Unnamed: 0,A,B,C,D,E,F
0,1.0,2019-01-01,1.0,3,Test,foo
1,1.0,2019-01-01,1.0,3,Train,foo
2,1.0,2019-01-01,1.0,3,Test,foo
3,1.0,2019-01-01,1.0,3,Train,foo


In [25]:
##Datatypes we have in df2:
df2.dtypes

A           float32
B    datetime64[ns]
C           float64
D             int32
E          category
F            object
dtype: object

In [34]:
#Viewing data from Top:
df.head()

Unnamed: 0,A,B,C,D,E
2019-01-01,2.431367,-0.927792,-1.194827,-0.211244,0.205602
2019-01-02,-0.399373,0.266713,-0.291619,-0.532092,0.611213
2019-01-03,-1.366881,0.329066,0.119593,-0.163184,0.0914
2019-01-04,-0.229499,0.818679,0.667113,-0.522301,0.606303
2019-01-05,0.254685,0.081618,0.864098,0.015784,-0.621372


In [36]:
#Viewing from Bottom:
df.tail(3)

Unnamed: 0,A,B,C,D,E
2019-01-03,-1.366881,0.329066,0.119593,-0.163184,0.0914
2019-01-04,-0.229499,0.818679,0.667113,-0.522301,0.606303
2019-01-05,0.254685,0.081618,0.864098,0.015784,-0.621372


In [5]:
##Viewing Statistics about data:
df.describe()

Unnamed: 0,A,B,C,D,E
count,5.0,5.0,5.0,5.0,5.0
mean,-1.016783,-0.069601,0.250003,0.07105,0.527266
std,0.561005,0.531811,0.529203,0.917021,0.919931
min,-1.91524,-0.532938,-0.555517,-0.968133,-0.68073
25%,-1.168033,-0.402522,0.07502,-0.59756,-0.198525
50%,-0.862101,-0.222363,0.307144,0.109184,0.876944
75%,-0.61587,-0.002599,0.636924,0.436076,1.214214
max,-0.522669,0.812414,0.786445,1.375685,1.424425


In [13]:
##Transposing:
df.transpose()

Unnamed: 0,2019-01-01 00:00:00,2019-01-02 00:00:00,2019-01-03 00:00:00,2019-01-04 00:00:00,2019-01-05 00:00:00
A,0.006935,-1.077388,0.244553,-1.896036,1.735372
B,0.461974,0.217421,-2.501543,0.397065,-0.337229
C,1.051642,-1.497766,0.141142,1.354163,-0.384871
D,-0.497084,-1.385251,-0.335015,0.568667,-0.590829
E,0.522261,0.554756,0.164692,0.400316,-1.581412


In [7]:
##Sorting by Axis:
df.sort_index(axis=1, ascending=True)

Unnamed: 0,A,B,C,D,E
2019-01-01,-0.862101,-0.402522,0.307144,0.109184,0.876944
2019-01-02,-0.522669,-0.532938,0.636924,0.436076,1.424425
2019-01-03,-1.91524,-0.002599,0.786445,-0.968133,1.214214
2019-01-04,-1.168033,-0.222363,0.07502,1.375685,-0.68073
2019-01-05,-0.61587,0.812414,-0.555517,-0.59756,-0.198525


In [8]:
##Sorting by value:
df.sort_values(by=['A','C'])

Unnamed: 0,A,B,C,D,E
2019-01-03,-1.91524,-0.002599,0.786445,-0.968133,1.214214
2019-01-04,-1.168033,-0.222363,0.07502,1.375685,-0.68073
2019-01-01,-0.862101,-0.402522,0.307144,0.109184,0.876944
2019-01-05,-0.61587,0.812414,-0.555517,-0.59756,-0.198525
2019-01-02,-0.522669,-0.532938,0.636924,0.436076,1.424425


In [14]:
##Getting:
df['A']

2019-01-01    0.006935
2019-01-02   -1.077388
2019-01-03    0.244553
2019-01-04   -1.896036
2019-01-05    1.735372
Freq: D, Name: A, dtype: float64

In [None]:
##Selection:

In [15]:
##Selecting via [], which slices the rows:
df[0:3]

Unnamed: 0,A,B,C,D,E
2019-01-01,0.006935,0.461974,1.051642,-0.497084,0.522261
2019-01-02,-1.077388,0.217421,-1.497766,-1.385251,0.554756
2019-01-03,0.244553,-2.501543,0.141142,-0.335015,0.164692


In [28]:
##For getting a cross section using a label:
df.loc[dates[0]]

A    0.006935
B    0.461974
C    1.051642
D   -0.497084
E    0.522261
Name: 2019-01-01 00:00:00, dtype: float64

In [29]:
##Selecting on a multi-axis by label:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2019-01-01,0.006935,0.461974
2019-01-02,-1.077388,0.217421
2019-01-03,0.244553,-2.501543
2019-01-04,-1.896036,0.397065
2019-01-05,1.735372,-0.337229


In [39]:
##Reduction in the dimensions of the returned object:
df.loc['2019-01-03':,['A','B']]

Unnamed: 0,A,B
2019-01-03,0.244553,-2.501543
2019-01-04,-1.896036,0.397065
2019-01-05,1.735372,-0.337229


In [None]:
##Selection by Position:

In [42]:
#Select via the position of the passed integers:
df.iloc[2]

A    0.244553
B   -2.501543
C    0.141142
D   -0.335015
E    0.164692
Name: 2019-01-03 00:00:00, dtype: float64

In [53]:
##Selection by integer slices:
df.iloc[2:4, 0:3]    

Unnamed: 0,A,B,C
2019-01-03,0.244553,-2.501543,0.141142
2019-01-04,-1.896036,0.397065,1.354163


In [55]:
##Selection by lists of integer position locations:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2019-01-02,-1.077388,-1.497766
2019-01-03,0.244553,0.141142
2019-01-05,1.735372,-0.384871


In [None]:
##Boolean Indexing

In [58]:
##Using a single column’s values to select data:
df[df.A > 0]

Unnamed: 0,A,B,C,D,E
2019-01-01,0.006935,0.461974,1.051642,-0.497084,0.522261
2019-01-03,0.244553,-2.501543,0.141142,-0.335015,0.164692
2019-01-05,1.735372,-0.337229,-0.384871,-0.590829,-1.581412


In [59]:
##Selecting values from a DataFrame where a boolean condition is met:
df[df > 0]

Unnamed: 0,A,B,C,D,E
2019-01-01,0.006935,0.461974,1.051642,,0.522261
2019-01-02,,0.217421,,,0.554756
2019-01-03,0.244553,,0.141142,,0.164692
2019-01-04,,0.397065,1.354163,0.568667,0.400316
2019-01-05,1.735372,,,,


In [67]:
##Using the isin() method for filtering:
df3=df.copy()
df3['E']=['one','two','three','one','two']
df[df3['E'].isin(['one'])]

Unnamed: 0,A,B,C,D,E
2019-01-01,0.006935,0.461974,1.051642,-0.497084,0.522261
2019-01-04,-1.896036,0.397065,1.354163,0.568667,0.400316


In [None]:
##Missing Data:

In [107]:
#To drop any rows that have missing data:data.iloc[[0,3,6,24], [0,5,6]]
df4=pd.DataFrame(np.random.randn(5,5), columns=list('ABCDE'))
df4['E']='NaN'
df4


Unnamed: 0,A,B,C,D,E
0,0.679495,1.325161,-0.837164,-0.097603,
1,0.154308,1.37782,1.425324,1.289829,
2,1.27913,0.687199,0.129051,1.008526,
3,-0.088479,-0.868223,0.017421,-0.389674,
4,-1.22071,-0.597094,-0.063997,-1.862062,
