## 1) NumPy Functions

###### Suppose we want to create a random matrix

In [6]:
import numpy as np
# Seed NumPy's global generator so the sampled values are identical on every run.
np.random.seed(42)
# 4 rows x 5 columns of standard-normal samples.
shape = (4, 5)
matrix = np.random.randn(*shape)
matrix

array([[ 0.49671415, -0.1382643 ,  0.64768854,  1.52302986, -0.23415337],
       [-0.23413696,  1.57921282,  0.76743473, -0.46947439,  0.54256004],
       [-0.46341769, -0.46572975,  0.24196227, -1.91328024, -1.72491783],
       [-0.56228753, -1.01283112,  0.31424733, -0.90802408, -1.4123037 ]])

## 2) Pandas Functions

###### Now suppose we want to convert this matrix to a pandas DataFrame

In [9]:
import pandas as pd
# Wrap the NumPy matrix in a labelled DataFrame: rows W-Z, columns A-E.
row_labels = list("WXYZ")
col_labels = list("ABCDE")
df = pd.DataFrame(matrix, index=row_labels, columns=col_labels)
df

Unnamed: 0,A,B,C,D,E
W,0.496714,-0.138264,0.647689,1.52303,-0.234153
X,-0.234137,1.579213,0.767435,-0.469474,0.54256
Y,-0.463418,-0.46573,0.241962,-1.91328,-1.724918
Z,-0.562288,-1.012831,0.314247,-0.908024,-1.412304


###### We can use loc[ ] to extract data by its index labels

In [13]:
# Label-based lookup: returns the row whose index label is "Y" as a Series.
df.loc["Y"]

A   -0.463418
B   -0.465730
C    0.241962
D   -1.913280
E   -1.724918
Name: Y, dtype: float64

###### In comparison, iloc[ ] takes integer positions (0, 1, 2, ...) instead of the index labels.

In [14]:
# Position-based lookup: integer position 3 is the fourth row (labelled "Z").
df.iloc[3]

A   -0.562288
B   -1.012831
C    0.314247
D   -0.908024
E   -1.412304
Name: Z, dtype: float64

###### Similarly, to extract several specific rows/columns with iloc[ ] or loc[ ], pass lists of row/column positions or labels.

In [15]:
# Select by label lists: rows W and X, columns B and C.
df.loc[["W", "X"],["B","C"]]

Unnamed: 0,B,C
W,-0.138264,0.647689
X,1.579213,0.767435


In [22]:
# Same selection by integer position: rows 0-1, columns 1-2 (i.e. W, X and B, C).
df.iloc[[0,1],[1,2]]

Unnamed: 0,B,C
W,-0.138264,0.647689
X,1.579213,0.767435


##### Let's say you want to select a range of rows/columns instead of typing each one explicitly: use slices.

In [26]:
# Label slices include BOTH endpoints: rows W through X, columns B through C.
df.loc["W":"X","B":"C"]

Unnamed: 0,B,C
W,-0.138264,0.647689
X,1.579213,0.767435


In [27]:
df.iloc[0:2,1:3] # slices, not lists: the stop position is excluded (rows 0-1, columns 1-2)

Unnamed: 0,B,C
W,-0.138264,0.647689
X,1.579213,0.767435


###### Applying conditions and filtering data with boolean masks (built from a column selection or with iloc[ ])

In [40]:
# The boolean mask only covers columns A, B and D. Entries that are not > 0, and
# every entry in the columns missing from the mask (C, E), come back as NaN.
df[df[["A","B","D"]]>0]

Unnamed: 0,A,B,C,D,E
W,0.496714,,,1.52303,
X,,1.579213,,,
Y,,,,,
Z,,,,,


In [36]:
# The mask is built from all but the last row and last column, so row Z and
# column E are entirely NaN in the result; elsewhere only values > 0 are kept.
df[df.iloc[0:-1,0:-1]>0] 

Unnamed: 0,A,B,C,D,E
W,0.496714,,0.647689,1.52303,
X,,1.579213,0.767435,,
Y,,,0.241962,,
Z,,,,,


##### We can even add multiple conditions

In [49]:
# Each condition is a boolean Series; & combines them element-wise.
a_positive = df["A"] > 0
b_negative = df["B"] < 0
# Keep the rows where both hold, with all columns.
df.loc[a_positive & b_negative, :]

Unnamed: 0,A,B,C,D,E
W,0.496714,-0.138264,0.647689,1.52303,-0.234153


###### We can also reset the index back to the default integer index

In [50]:
# Returns a new frame with a 0..n-1 index; df itself is not modified.
df.reset_index(drop = True) # drop=True discards the old W-Z labels instead of inserting them as a new column

Unnamed: 0,A,B,C,D,E
0,0.496714,-0.138264,0.647689,1.52303,-0.234153
1,-0.234137,1.579213,0.767435,-0.469474,0.54256
2,-0.463418,-0.46573,0.241962,-1.91328,-1.724918
3,-0.562288,-1.012831,0.314247,-0.908024,-1.412304


###### We can also set a list/column as an index

In [64]:
# Use the words of a sentence as the row index.
# The split_sent column is created here with assign(), so this cell runs on a fresh
# kernel and does not rely on df already containing that column.
# drop=False keeps split_sent as a regular column in addition to making it the index.
# Both assign() and set_index() return new frames; df itself is left unchanged.
(
    df
    .assign(split_sent="I want split sentence".split())
    .set_index("split_sent", drop=False)
)

Unnamed: 0_level_0,A,B,C,D,E,split_sent
split_sent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
I,0.496714,-0.138264,0.647689,1.52303,-0.234153,I
want,-0.234137,1.579213,0.767435,-0.469474,0.54256,want
split,-0.463418,-0.46573,0.241962,-1.91328,-1.724918,split
sentence,-0.562288,-1.012831,0.314247,-0.908024,-1.412304,sentence


###### We can also split a sentence on a separator, which by default is whitespace

In [56]:
# str.split() with no argument splits on whitespace, giving four words, one per row.
# The assignment adds them to df in place as a new column named split_sent.
df["split_sent"] = "I want split sentence".split()

In [55]:
# Display df, which now carries the extra split_sent column.
df

Unnamed: 0,A,B,C,D,E,split_sent
W,0.496714,-0.138264,0.647689,1.52303,-0.234153,I
X,-0.234137,1.579213,0.767435,-0.469474,0.54256,want
Y,-0.463418,-0.46573,0.241962,-1.91328,-1.724918,split
Z,-0.562288,-1.012831,0.314247,-0.908024,-1.412304,sentence


##### Hierarchical (Multi-level) Indexing

In [79]:
# Group labels G1 through G7.
outside = [f"G{number}" for number in range(1, 8)]

In [83]:
# Three parallel label lists of length 9, one entry per row of the multi-level index.
l1 = [letter for letter in "ABC" for _repeat in range(3)]  # outer level: A, A, A, B, ...
l2 = [1, 2, 3] * 3                                         # middle level: 1, 2, 3 repeated
l3 = list(range(1, 10))                                    # inner level: 1..9

In [84]:
# Pair the three label lists element-wise into (l1, l2, l3) tuples, one per row.
h = [(outer, middle, inner) for outer, middle, inner in zip(l1, l2, l3)]

In [85]:
# Build a three-level MultiIndex from the (l1, l2, l3) tuples in h.
multi = pd.MultiIndex.from_tuples(h)

In [88]:
# 9x9 standard-normal values, one row per entry of the `multi` index.
values = np.random.randn(9, 9)
df1 = pd.DataFrame(values, index=multi)

In [89]:
# Display df1: the three index levels appear on the left, columns 0-8 on the right.
df1

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,2,3,4,5,6,7,8
A,1,1,-0.485364,0.081874,2.314659,-1.867265,0.68626,-1.612716,-0.471932,1.088951,0.06428
A,2,2,-1.077745,-0.715304,0.679598,-0.730367,0.216459,0.045572,-0.6516,2.143944,0.633919
A,3,3,-2.025143,0.186454,-0.661786,0.852433,-0.792521,-0.114736,0.504987,0.865755,-1.200296
B,1,4,-0.334501,-0.474945,-0.653329,1.765454,0.404982,-1.260884,0.917862,2.122156,1.032465
B,2,5,-1.51937,-0.484234,1.266911,-0.707669,0.443819,0.774634,-0.92693,-0.059525,-3.241267
B,3,6,-1.024388,-0.252568,-1.247783,1.632411,-1.430141,-0.440044,0.130741,1.441273,-1.435862
C,1,7,1.163164,0.010233,-0.981509,0.462103,0.19906,-0.600217,0.069802,-0.385314,0.113517
C,2,8,0.662131,1.586017,-1.237815,2.133033,-1.952088,-0.151785,0.588317,0.280992,-0.6227
C,3,9,-0.208122,-0.493001,-0.589365,0.849602,0.357015,-0.69291,0.8996,0.3073,0.812862


##### How to access rows of a multi-level index

In [80]:
# Select every row whose outermost index level is "A".
# NOTE(review): re-run this cell; the saved output below (columns A, B, C) predates
# the current df1, whose columns are 0-8, so it does not match this code.
df1.loc["A"]

Unnamed: 0,Unnamed: 1,A,B,C
1,1,-0.322062,0.813517,-1.230864
2,2,0.22746,1.307143,-1.607483
3,3,0.184634,0.259883,0.781823


###### Dropping NA values using a threshold

In [90]:
# thresh=3 keeps every row that has at least 3 non-NaN values and drops rows with fewer.
# df contains no NaNs here, so all four rows are returned.
df.dropna(thresh = 3) 

Unnamed: 0,A,B,C,D,E,split_sent
W,0.496714,-0.138264,0.647689,1.52303,-0.234153,I
X,-0.234137,1.579213,0.767435,-0.469474,0.54256,want
Y,-0.463418,-0.46573,0.241962,-1.91328,-1.724918,split
Z,-0.562288,-1.012831,0.314247,-0.908024,-1.412304,sentence
