# Quick Overview of pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
dict1 = {
    "name" : ["Samrachana","Sanskriti","Aditi","Princy"],
    "age" : [19,9,3,15],
    "city" : ["Kushma","Baglung","Pokhara","kathmandu"]
    }

In [3]:
# Dataframe is like excel sheet or a tabular form having rows and columns
# IT IS USED FOR ANALYZING BIG DATA EASILY
df=pd.DataFrame(dict1) # This fucntion converts the data into excel sheet format for easier indexing
print(df) # 0 1 2 3 are the index
df

         name  age       city
0  Samrachana   19     Kushma
1   Sanskriti    9    Baglung
2       Aditi    3    Pokhara
3      Princy   15  kathmandu


Unnamed: 0,name,age,city
0,Samrachana,19,Kushma
1,Sanskriti,9,Baglung
2,Aditi,3,Pokhara
3,Princy,15,kathmandu


### Exproting this DataFrame into excelsheet or csv format(comma seperated value)

In [4]:
df.to_csv("Myinfo.csv") 
# this csv file/ excel sheet is created in the same folder, if not present
# 0 1 2 3 index are also present
# df.to_csv() function is used when we need to keep that data in excel sheet after analyzing

In [5]:
# if we don't want the index 0 1 2 3 to appear in excel sheet:
df.to_csv("myinfo.csv", index=False) # overwrites the old file with new data 

In [6]:
df.to_csv("myinfo_no_index.csv", index=False) # creates a new file with index = False

In [7]:
# for millions of data, we can choose if we want to see the first 3 data:
df.head(3)

Unnamed: 0,name,age,city
0,Samrachana,19,Kushma
1,Sanskriti,9,Baglung
2,Aditi,3,Pokhara


In [8]:
# If we want to see last 3/n data:
df.tail(3)

Unnamed: 0,name,age,city
1,Sanskriti,9,Baglung
2,Aditi,3,Pokhara
3,Princy,15,kathmandu


In [9]:
df.describe() # analyzes numerical columns, here in our case of dict1 numerical column is age.
# so it shows count mean std min quartiles of age column

Unnamed: 0,age
count,4.0
mean,11.5
std,7.0
min,3.0
25%,7.5
50%,12.0
75%,16.0
max,19.0


In [10]:
# lets create a csv file in excel and read it here:
train = pd.read_csv("train.csv") # here this train is dataframe

In [11]:
train # dataframe 

Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Train No.,Speed,City
0,0,0,0,0,0,12322,34,newyork
1,1,1,1,1,1,12534,123,dallas
2,2,2,2,2,2,125654,50,colorado
3,3,3,3,3,3,564523,87,antratica


# note: dont get cofused


In numpy arr[0][1] is like arr[row][column]
But in pandas it is exactly opposite:
df[0][1] is like df[column][row]

Axis meaning is same for both numpy and pandas
Axis = 0 → Down the rows (operate column-wise)
Axis = 1 → Across the columns (operate row-wise)

In [12]:
# Accessing a column:
print(train['Speed'])
print(train['City'])

0     34
1    123
2     50
3     87
Name: Speed, dtype: int64
0      newyork
1       dallas
2     colorado
3    antratica
Name: City, dtype: object


In [13]:
# Accessing a element:
print(train['City'][0])
print(train['Speed'][2])


newyork
50


In [14]:
# changing value:
train['City'][1]="dallas" # it is changed only here not in csv file
train['Speed'][2]=50
train

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['City'][1]="dallas" # it is changed only here not in csv file
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Speed'][2]=50


Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Train No.,Speed,City
0,0,0,0,0,0,12322,34,newyork
1,1,1,1,1,1,12534,123,dallas
2,2,2,2,2,2,125654,50,colorado
3,3,3,3,3,3,564523,87,antratica


In [15]:
# Updating the changed train in csv:
train.to_csv("train.csv")

In [16]:
# CHANGING INDEX:
train.index = ['first', 'second', 'third', 'fourth'] # index is row number
train

Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Train No.,Speed,City
first,0,0,0,0,0,12322,34,newyork
second,1,1,1,1,1,12534,123,dallas
third,2,2,2,2,2,125654,50,colorado
fourth,3,3,3,3,3,564523,87,antratica


In [17]:
train.index = [1,2,3,4]
train

Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Train No.,Speed,City
1,0,0,0,0,0,12322,34,newyork
2,1,1,1,1,1,12534,123,dallas
3,2,2,2,2,2,125654,50,colorado
4,3,3,3,3,3,564523,87,antratica


In [18]:
train.index = [2,4,8,7]
train

Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Train No.,Speed,City
2,0,0,0,0,0,12322,34,newyork
4,1,1,1,1,1,12534,123,dallas
8,2,2,2,2,2,125654,50,colorado
7,3,3,3,3,3,564523,87,antratica


## Let's go into a little depth

In [19]:
# pandas has two data structures: SERIES AND DATAFRAME
# series is Basically n by 1 data structure and data frame is row by column
ser = pd.Series(np.random.rand(10))
ser

0    0.399129
1    0.156092
2    0.127243
3    0.358418
4    0.369931
5    0.060288
6    0.278947
7    0.401211
8    0.768873
9    0.181387
dtype: float64

In [20]:
type(ser) # the type of ser is series

pandas.core.series.Series

In [21]:
# Creating dataframe:
newdf = pd.DataFrame(np.random.rand(334,5))
newdf

Unnamed: 0,0,1,2,3,4
0,0.480374,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [22]:
ndf = pd.DataFrame(np.random.rand(555,5),index=np.arange(555)) # np.arange() is like range() function in normal python
ndf

Unnamed: 0,0,1,2,3,4
0,0.037954,0.111946,0.941278,0.397728,0.190491
1,0.788758,0.361019,0.027159,0.559959,0.054526
2,0.419718,0.246470,0.751345,0.828356,0.592696
3,0.308578,0.496942,0.593996,0.773060,0.484526
4,0.416086,0.587381,0.964515,0.257910,0.175101
...,...,...,...,...,...
550,0.632861,0.468938,0.519133,0.196919,0.938928
551,0.583462,0.298191,0.517715,0.481267,0.479785
552,0.023398,0.531310,0.853470,0.356463,0.946049
553,0.175698,0.609051,0.588044,0.461647,0.326873


In [23]:
newdf.head() # in case of large data- it gives top 5 rows

Unnamed: 0,0,1,2,3,4
0,0.480374,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.42272,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621


In [24]:
type(newdf)

pandas.core.frame.DataFrame

In [25]:
newdf.describe()

Unnamed: 0,0,1,2,3,4
count,334.0,334.0,334.0,334.0,334.0
mean,0.482721,0.490511,0.495757,0.501921,0.461111
std,0.300563,0.299685,0.289799,0.2973,0.28244
min,0.000257,0.001694,0.001839,0.001081,0.006352
25%,0.212462,0.228467,0.251482,0.252216,0.20849
50%,0.477696,0.470479,0.515329,0.480071,0.453779
75%,0.736097,0.751086,0.723389,0.764911,0.684916
max,0.997439,0.993877,0.999911,0.99637,0.998431


In [26]:
newdf.dtypes # gives data type of  each column as one column has only one data type, eg 0 column can have onle one data type 1 column has its own data types

0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object

In [27]:
newdf[0][0]="sam" # since 0 0 value is change to string so the datatype for the column 0 is not changed to object as it cant be float anymore, Even if only one element in a column is changed the data type for al element is object as the column should have same datatype
newdf.dtypes

  newdf[0][0]="sam" # since 0 0 value is change to string so the datatype for the column 0 is not changed to object as it cant be float anymore, Even if only one element in a column is changed the data type for al element is object as the column should have same datatype


0     object
1    float64
2    float64
3    float64
4    float64
dtype: object

In [28]:
newdf.head() # 00 element is now changed

Unnamed: 0,0,1,2,3,4
0,sam,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.42272,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621


In [29]:
print(newdf.index) # start stop and step is shown as we have not defined index here while declaring
print(ndf.index) # list of all index is shown here as we have defined index while decalring it

RangeIndex(start=0, stop=334, step=1)
Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       545, 546, 547, 548, 549, 550, 551, 552, 553, 554],
      dtype='int64', length=555)


In [30]:
print(newdf.columns)
print(ndf.columns)

RangeIndex(start=0, stop=5, step=1)
RangeIndex(start=0, stop=5, step=1)


In [31]:
# changing dataframes into numpy arrays:
newdf.to_numpy

<bound method DataFrame.to_numpy of             0         1         2         3         4
0         sam  0.141475  0.888703  0.945092  0.824552
1    0.830957  0.422720  0.268281  0.628621  0.458415
2    0.288398  0.893982  0.216602  0.518299  0.091842
3    0.426494  0.162149  0.509008  0.979029  0.077035
4    0.390635  0.872391  0.908207  0.823767  0.854621
..        ...       ...       ...       ...       ...
329  0.651975  0.304359  0.271861  0.209873  0.574948
330  0.550315  0.838268  0.289179  0.172441  0.908953
331  0.031237  0.052889  0.697722  0.228119  0.559828
332  0.582019  0.385234  0.035733  0.486498  0.434017
333  0.912595  0.662131  0.749724  0.727058  0.006352

[334 rows x 5 columns]>

In [32]:
newdf.T # Transpose row is converted into column and column is converted into row

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,324,325,326,327,328,329,330,331,332,333
0,sam,0.830957,0.288398,0.426494,0.390635,0.485644,0.248694,0.967835,0.41501,0.656628,...,0.479453,0.592693,0.869145,0.043693,0.59493,0.651975,0.550315,0.031237,0.582019,0.912595
1,0.141475,0.42272,0.893982,0.162149,0.872391,0.855643,0.214313,0.515399,0.782747,0.286574,...,0.769628,0.593211,0.913025,0.030369,0.17411,0.304359,0.838268,0.052889,0.385234,0.662131
2,0.888703,0.268281,0.216602,0.509008,0.908207,0.558072,0.109033,0.249822,0.728799,0.444757,...,0.693376,0.672165,0.510629,0.694023,0.167678,0.271861,0.289179,0.697722,0.035733,0.749724
3,0.945092,0.628621,0.518299,0.979029,0.823767,0.081032,0.386559,0.895316,0.170018,0.352956,...,0.657559,0.004964,0.553597,0.867195,0.590523,0.209873,0.172441,0.228119,0.486498,0.727058
4,0.824552,0.458415,0.091842,0.077035,0.854621,0.356075,0.963308,0.658033,0.508026,0.030851,...,0.135856,0.705407,0.368312,0.437171,0.567516,0.574948,0.908953,0.559828,0.434017,0.006352


In [33]:
# Sorting data frame's index in descending order
newdf.sort_index(axis = 0, ascending = False) #axis=0: Operates along rows (vertically) and axis=1: Operates along columns (horizontally)

Unnamed: 0,0,1,2,3,4
333,0.912595,0.662131,0.749724,0.727058,0.006352
332,0.582019,0.385234,0.035733,0.486498,0.434017
331,0.031237,0.052889,0.697722,0.228119,0.559828
330,0.550315,0.838268,0.289179,0.172441,0.908953
329,0.651975,0.304359,0.271861,0.209873,0.574948
...,...,...,...,...,...
4,0.390635,0.872391,0.908207,0.823767,0.854621
3,0.426494,0.162149,0.509008,0.979029,0.077035
2,0.288398,0.893982,0.216602,0.518299,0.091842
1,0.830957,0.422720,0.268281,0.628621,0.458415


In [34]:
newdf.sort_index(axis = 0) #this is function for sorting in ascending axis=0- no change as already sorted in ascending

Unnamed: 0,0,1,2,3,4
0,sam,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [35]:
newdf.sort_index(axis = 1, ascending = False)

Unnamed: 0,4,3,2,1,0
0,0.824552,0.945092,0.888703,0.141475,sam
1,0.458415,0.628621,0.268281,0.422720,0.830957
2,0.091842,0.518299,0.216602,0.893982,0.288398
3,0.077035,0.979029,0.509008,0.162149,0.426494
4,0.854621,0.823767,0.908207,0.872391,0.390635
...,...,...,...,...,...
329,0.574948,0.209873,0.271861,0.304359,0.651975
330,0.908953,0.172441,0.289179,0.838268,0.550315
331,0.559828,0.228119,0.697722,0.052889,0.031237
332,0.434017,0.486498,0.035733,0.385234,0.582019


In [36]:
newdf.sort_index(axis = 1) # function for sorting in ascending order for axis=1

Unnamed: 0,0,1,2,3,4
0,sam,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [37]:
newdf[0] # this is a series in the dataframe newdf

0           sam
1      0.830957
2      0.288398
3      0.426494
4      0.390635
         ...   
329    0.651975
330    0.550315
331    0.031237
332    0.582019
333    0.912595
Name: 0, Length: 334, dtype: object

In [38]:
newdf.head()

Unnamed: 0,0,1,2,3,4
0,sam,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.42272,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621


### Careful while copying dataframes

In [39]:
newdf2=newdf #newdf2 is view of newdf
# this is like newdf2 and newdf are pointing to the same data in the same memory address so the change in one also makes the changes in other
newdf2 # prints all same as newdf no problem

Unnamed: 0,0,1,2,3,4
0,sam,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [40]:
newdf2[0][0]=343
newdf2 # 00 lement changed no problem

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf2[0][0]=343


Unnamed: 0,0,1,2,3,4
0,343,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [41]:
newdf # but while changing newdf2 that also changed the value of newdf so we need to be careful while copying
#This is hte main problem while doing newdf2=newdf

Unnamed: 0,0,1,2,3,4
0,343,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


#### CORRECT WAY TO COPY 

In [42]:
newdf2 = newdf.copy()

In [43]:
newdf2[0][1] = 456
newdf2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdf2[0][1] = 456


Unnamed: 0,0,1,2,3,4
0,343,0.141475,0.888703,0.945092,0.824552
1,456,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [44]:
newdf # no change in newdf 

Unnamed: 0,0,1,2,3,4
0,343,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [45]:
# Generally we get warning while changing the particular value in df like this:
# newdf[0][0]=456 
# here warning is given because pythone cant not decide if this is view or copy 
# so the right way to change the particularvalues is no use loc:

In [46]:
newdf.loc[0,0]=654 # right way 
# NOTE: here loc[row,column]
newdf

Unnamed: 0,0,1,2,3,4
0,654,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [47]:
# Changing column names:
newdf.columns = list("ABCDE")
newdf.head(5)

Unnamed: 0,A,B,C,D,E
0,654.0,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.42272,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621


In [48]:
newdf.loc[[1,2],["C","D"]] # accessing row = 1 and 2 with column =  C and D

Unnamed: 0,C,D
1,0.268281,0.628621
2,0.216602,0.518299


In [49]:
newdf.loc[:,['D','E']] # all rows column=d and e

Unnamed: 0,D,E
0,0.945092,0.824552
1,0.628621,0.458415
2,0.518299,0.091842
3,0.979029,0.077035
4,0.823767,0.854621
...,...,...
329,0.209873,0.574948
330,0.172441,0.908953
331,0.228119,0.559828
332,0.486498,0.434017


In [50]:
newdf.loc[[5,6,7],:] # row = 5 6 7 and column all

Unnamed: 0,A,B,C,D,E
5,0.485644,0.855643,0.558072,0.081032,0.356075
6,0.248694,0.214313,0.109033,0.386559,0.963308
7,0.967835,0.515399,0.249822,0.895316,0.658033


In [51]:
newdf.loc[(newdf['A']<0.3)] # newdf[0] is newdf[column]
# gives only those rows where newdf["A"]'s values are less then 0.3

Unnamed: 0,A,B,C,D,E
2,0.288398,0.893982,0.216602,0.518299,0.091842
6,0.248694,0.214313,0.109033,0.386559,0.963308
11,0.005646,0.090874,0.065660,0.627434,0.220325
15,0.11775,0.161769,0.767698,0.234389,0.108504
18,0.26555,0.682694,0.793671,0.986225,0.665144
...,...,...,...,...,...
317,0.04663,0.571676,0.777321,0.869329,0.201843
318,0.136077,0.484120,0.805023,0.202506,0.886008
323,0.207992,0.612088,0.046651,0.993729,0.504904
327,0.043693,0.030369,0.694023,0.867195,0.437171


In [52]:
newdf.loc[(newdf['A']<0.3) & (newdf['C']>0.3)] # gives those rows where both condition matches

Unnamed: 0,A,B,C,D,E
15,0.11775,0.161769,0.767698,0.234389,0.108504
18,0.26555,0.682694,0.793671,0.986225,0.665144
26,0.242748,0.283178,0.613325,0.079993,0.529756
39,0.076706,0.673812,0.485753,0.477322,0.800037
41,0.262018,0.990842,0.654503,0.992190,0.635174
...,...,...,...,...,...
312,0.031037,0.890222,0.943880,0.761686,0.111652
317,0.04663,0.571676,0.777321,0.869329,0.201843
318,0.136077,0.484120,0.805023,0.202506,0.886008
327,0.043693,0.030369,0.694023,0.867195,0.437171


In [53]:
newdf.iloc[0,4] # it is same as newdf.loc[0,"E"]
# if we have to access using actual row column name use loc 
# but if we want to accesss using numbers/index no matter what are the row column names we use ioc

np.float64(0.8245518407939422)

In [54]:
newdf.iloc[[1,2],[2,3]] # same is newdf.loc[[1,2],['C','D']]

Unnamed: 0,C,D
1,0.268281,0.628621
2,0.216602,0.518299


In [55]:
newdf.head(5)

Unnamed: 0,A,B,C,D,E
0,654.0,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.42272,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621


In [56]:
# Deleting a row:
newdf.drop([2]) # deletes row = 2

Unnamed: 0,A,B,C,D,E
0,654,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621
5,0.485644,0.855643,0.558072,0.081032,0.356075
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [57]:
newdf.drop([3,4]) # by default axis = 0 so deletes row 7 and 8

Unnamed: 0,A,B,C,D,E
0,654,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.422720,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
5,0.485644,0.855643,0.558072,0.081032,0.356075
6,0.248694,0.214313,0.109033,0.386559,0.963308
...,...,...,...,...,...
329,0.651975,0.304359,0.271861,0.209873,0.574948
330,0.550315,0.838268,0.289179,0.172441,0.908953
331,0.031237,0.052889,0.697722,0.228119,0.559828
332,0.582019,0.385234,0.035733,0.486498,0.434017


In [58]:
# Deleting a column
newdf.drop(['B'], axis = 1) # deletes column = B by default axis = 0

Unnamed: 0,A,C,D,E
0,654,0.888703,0.945092,0.824552
1,0.830957,0.268281,0.628621,0.458415
2,0.288398,0.216602,0.518299,0.091842
3,0.426494,0.509008,0.979029,0.077035
4,0.390635,0.908207,0.823767,0.854621
...,...,...,...,...
329,0.651975,0.271861,0.209873,0.574948
330,0.550315,0.289179,0.172441,0.908953
331,0.031237,0.697722,0.228119,0.559828
332,0.582019,0.035733,0.486498,0.434017


In [59]:
newdf.drop(['C','D'], axis = 1) # deletes column c and d

Unnamed: 0,A,B,E
0,654,0.141475,0.824552
1,0.830957,0.422720,0.458415
2,0.288398,0.893982,0.091842
3,0.426494,0.162149,0.077035
4,0.390635,0.872391,0.854621
...,...,...,...
329,0.651975,0.304359,0.574948
330,0.550315,0.838268,0.908953
331,0.031237,0.052889,0.559828
332,0.582019,0.385234,0.434017


In [60]:
newdf.head(5)

Unnamed: 0,A,B,C,D,E
0,654.0,0.141475,0.888703,0.945092,0.824552
1,0.830957,0.42272,0.268281,0.628621,0.458415
2,0.288398,0.893982,0.216602,0.518299,0.091842
3,0.426494,0.162149,0.509008,0.979029,0.077035
4,0.390635,0.872391,0.908207,0.823767,0.854621


In [61]:
newdf.drop(['C','D'], axis = 1, inplace = True) # if we do inplace = True then it will make the actual change in original data frame
# therefore, doing newdf = newdf.drop(['C','D'], axisa = 1) and newdf.drop(['C','D'], axis = 1, inplace = True) both will make change in original dataframe 
newdf.head(5)

Unnamed: 0,A,B,E
0,654.0,0.141475,0.824552
1,0.830957,0.42272,0.458415
2,0.288398,0.893982,0.091842
3,0.426494,0.162149,0.077035
4,0.390635,0.872391,0.854621


In [63]:
newdf.drop([1,2], axis = 0, inplace = True)

In [64]:
newdf.head(5)

Unnamed: 0,A,B,E
0,654.0,0.141475,0.824552
3,0.426494,0.162149,0.077035
4,0.390635,0.872391,0.854621
5,0.485644,0.855643,0.356075
6,0.248694,0.214313,0.963308


In [65]:
newdf.reset_index() # changes the changes made by inplace and resets index(row name) of dataframe 
# but it adds one extra column named index 

Unnamed: 0,index,A,B,E
0,0,654,0.141475,0.824552
1,3,0.426494,0.162149,0.077035
2,4,0.390635,0.872391,0.854621
3,5,0.485644,0.855643,0.356075
4,6,0.248694,0.214313,0.963308
...,...,...,...,...
327,329,0.651975,0.304359,0.574948
328,330,0.550315,0.838268,0.908953
329,331,0.031237,0.052889,0.559828
330,332,0.582019,0.385234,0.434017


In [66]:
# removing the extra column named index:
newdf.reset_index(drop=True,inplace=True)
newdf.head(3)

Unnamed: 0,A,B,E
0,654.0,0.141475,0.824552
1,0.426494,0.162149,0.077035
2,0.390635,0.872391,0.854621


In [70]:
newdf['B'].isnull() # checks in the column B if there is any NaN/ null or 0 values and return boolean

0      False
1      False
2      False
3      False
4      False
       ...  
327    False
328    False
329    False
330    False
331    False
Name: B, Length: 332, dtype: bool

In [88]:
# Always use loc or iloc while assigning or accessing  data
newdf.loc[:,["B"]] = None # this means pandas internally converts all values to NaN or ull
newdf.loc[:,['C']] = 0 # but for 0, pandas converts all values to the integer 0 not null so while checking newdf['C'].isnull(), we will get all False as this 0 is integer not null
newdf

Unnamed: 0,A,B,E,C
0,654,,0.824552,0
1,0.426494,,0.077035,0
2,0.390635,,0.854621,0
3,0.485644,,0.356075,0
4,0.248694,,0.963308,0
...,...,...,...,...
327,0.651975,,0.574948,0
328,0.550315,,0.908953,0
329,0.031237,,0.559828,0
330,0.582019,,0.434017,0


In [89]:
newdf['B'].isnull()

0      True
1      True
2      True
3      True
4      True
       ... 
327    True
328    True
329    True
330    True
331    True
Name: B, Length: 332, dtype: bool

In [90]:
newdf['C'].isnull() #  All are False as 0 is not null, 0 is integer so all values are assigneed to 0

0      False
1      False
2      False
3      False
4      False
       ...  
327    False
328    False
329    False
330    False
331    False
Name: C, Length: 332, dtype: bool

In [117]:
df=pd.DataFrame(
    {
        "name": ['Alfred','Batman','Alfred'],
        'toy':[np.nan,'bike','car'], # nan means bissing value 
        'born':[pd.NaT, pd.Timestamp("1940-04-25"),pd.NaT] # if we write "2024-01-01", then it is string not a date so for creating date object in python or pandsas we use the function pd.Timestamp(""2024-01-01")
    }# nat means missing time
)
df

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,bike,1940-04-25
2,Alfred,car,NaT


In [118]:
df.dropna() # drops all the nan and nat

Unnamed: 0,name,toy,born
1,Batman,bike,1940-04-25


In [119]:
df.dropna(how='all') # drops all the nan and nat only if all are nan for one key/column

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,bike,1940-04-25
2,Alfred,car,NaT


In [120]:
dff=pd.DataFrame(
    {
        "name": ['Alfred','Batman','Catman'],
        'toy':[np.nan,np.nan,np.nan], # nan means bissing value 
        'born':[pd.NaT, pd.Timestamp("1940-04-25"),pd.NaT] # if we write "2024-01-01", then it is string not a date so for creating date object in python or pandsas we use the function pd.Timestamp(""2024-01-01")
    }# nat means missing time
)
dff

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,,1940-04-25
2,Catman,,NaT


In [121]:
dff.dropna(how='all', axis = 1) # removes the column only if it's all values nan 

Unnamed: 0,name,born
0,Alfred,NaT
1,Batman,1940-04-25
2,Catman,NaT


In [122]:
dff.head()

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,,1940-04-25
2,Catman,,NaT


In [123]:
df

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,bike,1940-04-25
2,Alfred,car,NaT


In [130]:
df.drop_duplicates(subset = ['name']) # has keep = 'first' by default so removes the value that comes at last 
#drops the duplicate  from last by default and keeps the first value, subset=["name"] is syntax that is necessary for drop_duplicates

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,bike,1940-04-25


In [132]:
df.drop_duplicates(subset = ['name'], keep='first') # same no change as keep = 'first' is by default

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,bike,1940-04-25


In [134]:
df.drop_duplicates(subset = ['name'], keep='last')# keeps the last value and removes the first one

Unnamed: 0,name,toy,born
1,Batman,bike,1940-04-25
2,Alfred,car,NaT


In [135]:
dff=pd.DataFrame(
    {
        "name": ['Alfred','Alfred','Alfred'],
        'toy':[np.nan,np.nan,np.nan], # nan means bissing value 
        'born':[pd.NaT, pd.Timestamp("1940-04-25"),pd.NaT] # if we write "2024-01-01", then it is string not a date so for creating date object in python or pandsas we use the function pd.Timestamp(""2024-01-01")
    }# nat means missing time
)
dff

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Alfred,,1940-04-25
2,Alfred,,NaT


In [137]:
dff.drop_duplicates(subset=['name']) # keeps the first and removes all duplicates

Unnamed: 0,name,toy,born
0,Alfred,,NaT


In [139]:
dff.drop_duplicates(subset=['name'], keep = 'last') # keeps last and removes all duplicates

Unnamed: 0,name,toy,born
2,Alfred,,NaT


In [143]:
df.shape

(3, 3)

In [144]:
newdf.shape

(332, 4)

In [149]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   name    3 non-null      object        
 1   toy     2 non-null      object        
 2   born    1 non-null      datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 204.0+ bytes


In [150]:
df.describe()

Unnamed: 0,born
count,1
mean,1940-04-25 00:00:00
min,1940-04-25 00:00:00
25%,1940-04-25 00:00:00
50%,1940-04-25 00:00:00
75%,1940-04-25 00:00:00
max,1940-04-25 00:00:00


In [157]:
#  Count of Unique Values
df['name'].value_counts(dropna=False) # dropna=false means dont remove nan in count

name
Alfred    2
Batman    1
Name: count, dtype: int64

In [158]:
df['toy'].value_counts(dropna=False) # dropna=false means dont remove nan in count

toy
NaN     1
bike    1
car     1
Name: count, dtype: int64

In [159]:
df

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,bike,1940-04-25
2,Alfred,car,NaT


In [160]:
df['born'].value_counts(dropna=False)# dropna=false means dont remove nan in count

born
NaT           2
1940-04-25    1
Name: count, dtype: int64

In [162]:
df['name'].value_counts(dropna=True) # dropna=true means remove nan in count

name
Alfred    2
Batman    1
Name: count, dtype: int64

In [164]:
df['toy'].value_counts(dropna=True) 

toy
bike    1
car     1
Name: count, dtype: int64

In [166]:
df['born'].value_counts(dropna=True)  # dropna = True means remove nan/ nat in count

born
1940-04-25    1
Name: count, dtype: int64

In [171]:
df.isnull() # gives true where there is nan / nat and false where there is not null

Unnamed: 0,name,toy,born
0,False,True,True
1,False,False,False
2,False,False,True


In [170]:
df.notnull() # gives true where there is not null and false where there is null

Unnamed: 0,name,toy,born
0,True,False,False
1,True,True,True
2,True,True,False
