In [1]:
import numpy as np
import pandas as pd


# Basic Functions in Dataframes

In [2]:
# 3x5 frame of standard-normal samples with custom row labels and letter column names
dataframe = pd.DataFrame(
    data=np.random.randn(3, 5),
    index=[100, 200, 300],
    columns=list('abcde'),
)
dataframe

Unnamed: 0,a,b,c,d,e
100,1.421122,-1.374538,-0.063417,-0.117312,1.074113
200,0.274831,0.508763,0.4462,-1.853634,0.073739
300,-0.500333,2.127118,-0.387522,0.80053,0.575542


In [3]:
# head(n) returns the first n rows and tail(n) returns the last n rows
dataframe.head(2)

Unnamed: 0,a,b,c,d,e
100,1.421122,-1.374538,-0.063417,-0.117312,1.074113
200,0.274831,0.508763,0.4462,-1.853634,0.073739


In [4]:
#last 2 rows of the frame
dataframe.tail(2)

Unnamed: 0,a,b,c,d,e
200,0.274831,0.508763,0.4462,-1.853634,0.073739
300,-0.500333,2.127118,-0.387522,0.80053,0.575542


In [5]:
dataframe.axes #returns a list holding the row index first and the column index second

[Int64Index([100, 200, 300], dtype='int64'),
 Index(['a', 'b', 'c', 'd', 'e'], dtype='object')]

In [6]:
dataframe.empty #True when the DataFrame contains no elements, otherwise False

False

In [7]:
dataframe.columns #returns the column labels as an Index object (not a plain list)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [8]:
dataframe.index  #returns the row labels as an Index object (not a plain list)

Int64Index([100, 200, 300], dtype='int64')

In [9]:
dataframe.dtypes #returns the dtype of every column as a Series

a    float64
b    float64
c    float64
d    float64
e    float64
dtype: object

In [10]:
dataframe.shape #returns a (number of rows, number of columns) tuple

(3, 5)

In [11]:
dataframe.size #returns the total number of elements (rows * columns), here 3 * 5 = 15

15

In [12]:
dataframe.values #returns the data as a NumPy array, without the row and column labels

array([[ 1.42112211, -1.3745382 , -0.06341698, -0.11731226,  1.07411341],
       [ 0.27483113,  0.50876263,  0.44620034, -1.85363383,  0.07373924],
       [-0.50033342,  2.12711824, -0.38752214,  0.80053016,  0.57554245]])

# Stats in the Dataframe

In [13]:
#df is a second name for the same DataFrame object; no copy is made
df=dataframe

In [14]:
#display the frame used for the statistics examples
df

Unnamed: 0,a,b,c,d,e
100,1.421122,-1.374538,-0.063417,-0.117312,1.074113
200,0.274831,0.508763,0.4462,-1.853634,0.073739
300,-0.500333,2.127118,-0.387522,0.80053,0.575542


In [15]:
#sum

df.sum()  #adds the values along the given axis; the default axis=0 adds down the rows, giving one total per column

a    1.195620
b    1.261343
c   -0.004739
d   -1.170416
e    1.723395
dtype: float64

In [16]:
df.sum(axis=1) #axis=1 adds across the columns, giving one total per row

100    0.939968
200   -0.550100
300    2.615335
dtype: float64

In [17]:
#mean

df.mean()  #default axis=0: one mean per column

a    0.398540
b    0.420448
c   -0.001580
d   -0.390139
e    0.574465
dtype: float64

In [18]:
df.mean(axis=1) #axis=1: one mean per row

100    0.187994
200   -0.110020
300    0.523067
dtype: float64

In the same way, we can also use:

df.min

df.max

df.idxmax

df.idxmin and so on.

In [19]:
#summary statistics for each column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,a,b,c,d,e
count,3.0,3.0,3.0,3.0,3.0
mean,0.39854,0.420448,-0.00158,-0.390139,0.574465
std,0.966683,1.752498,0.420287,1.347951,0.500188
min,-0.500333,-1.374538,-0.387522,-1.853634,0.073739
25%,-0.112751,-0.432888,-0.22547,-0.985473,0.324641
50%,0.274831,0.508763,-0.063417,-0.117312,0.575542
75%,0.847977,1.31794,0.191392,0.341609,0.824828
max,1.421122,2.127118,0.4462,0.80053,1.074113


# How to use functions from other libraries or user-defined functions in pandas?

We can apply functions from other libraries, or our own user-defined functions, through the following three pandas methods:

.pipe

.apply

.applymap

### .pipe(): use this when a function should be applied to the whole DataFrame at once.

In [26]:
#build a 5x3 frame of random values with default integer labels for the pipe/apply/applymap examples
df1=pd.DataFrame(np.random.rand(5,3))

In [27]:
#display the frame the following examples operate on
df1

Unnamed: 0,0,1,2
0,0.784297,0.052923,0.396923
1,0.655896,0.47278,0.081623
2,0.722093,0.251789,0.736785
3,0.729717,0.399625,0.853772
4,0.638484,0.25706,0.506322


In [30]:
#column-wise mean, using a numpy function on a pandas object through pipe
#axis=0 is given explicitly: without it np.mean forwards axis=None, which pandas 2.0+
#reduces over both axes to a single scalar instead of one mean per column
df1.pipe(np.mean, axis=0) #column wise

0    0.706098
1    0.286835
2    0.515085
dtype: float64

In [32]:
df1.pipe(np.mean,axis=1) #row wise: extra arguments to pipe are passed on to np.mean, giving one mean per row

0    0.411381
1    0.403433
2    0.570222
3    0.661038
4    0.467289
dtype: float64

In [33]:
#using user defined function in pandas
def addd(a,b):
    """Return the sum of ``a`` and ``b``.

    Works with any operands that support ``+``; with a DataFrame as ``a`` and
    a scalar as ``b`` the scalar is added to every element.
    """
    return a + b

In [47]:
df1.pipe(addd,3) #pipe calls addd(df1, 3), so 3 is added to every element of the df

Unnamed: 0,0,1,2
0,3.784297,3.052923,3.396923
1,3.655896,3.47278,3.081623
2,3.722093,3.251789,3.736785
3,3.729717,3.399625,3.853772
4,3.638484,3.25706,3.506322


### .apply: use this to apply a function along one axis, i.e. to each row or to each column, rather than to the whole DataFrame at once.

In [45]:
#maximum of each row: axis=1 applies np.max to every row, so the result has one value per row
df1.apply(np.max, axis=1)

0    0.784297
1    0.655896
2    0.736785
3    0.853772
4    0.638484
dtype: float64

In [42]:
#display df1 again to compare with the row maxima
df1

Unnamed: 0,0,1,2
0,0.784297,0.052923,0.396923
1,0.655896,0.47278,0.081623
2,0.722093,0.251789,0.736785
3,0.729717,0.399625,0.853772
4,0.638484,0.25706,0.506322


### .applymap: use this to apply a function to each element individually.

In [48]:
#multiply each element by 100; applymap calls the function once per element
#NOTE(review): applymap is deprecated in pandas >= 2.1 in favour of DataFrame.map; it is kept
#here because this notebook's outputs (Int64Index) come from an older pandas version
df1.applymap(lambda x:x*100)

Unnamed: 0,0,1,2
0,78.429669,5.292313,39.692294
1,65.589648,47.27798,8.162289
2,72.209292,25.178875,73.678529
3,72.971725,39.962513,85.377233
4,63.848443,25.705983,50.63215


### .map()

In [57]:
df1[0].map(lambda x:x*100) #Series.map applies the function to each element of a single column
#the column is selected by its label (here the integer 0)

0    78.429669
1    65.589648
2    72.209292
3    72.971725
4    63.848443
Name: 0, dtype: float64

In [85]:
# Demo frame for the sorting section: N consecutive days, fixed integers and random
# floats, with a deliberately unsorted integer index. Note that this rebinds `df`.
N = 7
df = pd.DataFrame(
    data={
        'A': pd.date_range(start='2016-01-01', periods=N, freq='D'),
        'x': [33, 55, 33, 5, 6, 3, 100],
        'y': np.random.rand(N),
    },
    index=[2, 4, 5, 6, 78, 1, 45],
)

In [86]:
#display the unsorted frame before the sorting examples
df

Unnamed: 0,A,x,y
2,2016-01-01,33,0.817303
4,2016-01-02,55,0.579337
5,2016-01-03,33,0.580488
6,2016-01-04,5,0.767383
78,2016-01-05,6,0.414255
1,2016-01-06,3,0.649097
45,2016-01-07,100,0.2372


## Sorting In Pandas

Sorting can be done by index (row labels) or by the values of one or more columns.

In [None]:
#by index labels: .sort_index()
#by column values: .sort_values()

In [87]:
df.sort_index() #orders the rows by index label, ascending by default

Unnamed: 0,A,x,y
1,2016-01-06,3,0.649097
2,2016-01-01,33,0.817303
4,2016-01-02,55,0.579337
5,2016-01-03,33,0.580488
6,2016-01-04,5,0.767383
45,2016-01-07,100,0.2372
78,2016-01-05,6,0.414255


In [89]:
df.sort_values(by=['x']) #orders the rows by the values in column 'x', ascending by default

Unnamed: 0,A,x,y
1,2016-01-06,3,0.649097
6,2016-01-04,5,0.767383
78,2016-01-05,6,0.414255
2,2016-01-01,33,0.817303
5,2016-01-03,33,0.580488
4,2016-01-02,55,0.579337
45,2016-01-07,100,0.2372


In [91]:
df.sort_values(by=['x','y']) #sorts by 'x' first; rows with equal 'x' (the two 33s) are then ordered by 'y'

Unnamed: 0,A,x,y
1,2016-01-06,3,0.649097
6,2016-01-04,5,0.767383
78,2016-01-05,6,0.414255
5,2016-01-03,33,0.580488
2,2016-01-01,33,0.817303
4,2016-01-02,55,0.579337
45,2016-01-07,100,0.2372
