# Pandas
- Library for Data Analysis and Manipulation.

# Why Pandas
- Provides ability to work with Tabular Data.
- Tabular Data - data that is organized into tables having Rows and Columns.

In [1]:
### Installing Pandas
! pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd

# Series
- Series is a 1-D Labelled Array that can hold/store array.

In [3]:
book_title = ["C++", "Java", "Python", "JavaScript"]

In [4]:
book_title= pd.Series(book_title)
print(book_title)

0           C++
1          Java
2        Python
3    JavaScript
dtype: object


In [5]:
type(book_title)

pandas.core.series.Series

In [6]:
book_title[2]

'Python'

In [7]:
book_title.index = ['a', 'b', 'c', 'd']

In [8]:
book_title

a           C++
b          Java
c        Python
d    JavaScript
dtype: object

In [9]:
book_title['c']

'Python'

# DataFrames
- 2-D Labelled Array having index and coulmns.
- multiple series combined together to form a Dataframe.
- most widely used Data Structure in Pandas.

In [10]:
import numpy as np

In [11]:
arr = np.random.randint(10, 100, size = (6, 4))
arr

array([[72, 74, 73, 82],
       [62, 19, 72, 62],
       [10, 66, 23, 20],
       [52, 15, 56, 85],
       [32, 16, 71, 48],
       [13, 81, 76, 54]])

In [12]:
df = pd.DataFrame(data=arr)
df

Unnamed: 0,0,1,2,3
0,72,74,73,82
1,62,19,72,62
2,10,66,23,20
3,52,15,56,85
4,32,16,71,48
5,13,81,76,54


In [13]:
type(df)

pandas.core.frame.DataFrame

In [14]:
df[2]

0    73
1    72
2    23
3    56
4    71
5    76
Name: 2, dtype: int64

In [15]:
type(df[2])

pandas.core.series.Series

In [16]:
df.columns = ["A", "B", "C", "D"]
df

Unnamed: 0,A,B,C,D
0,72,74,73,82
1,62,19,72,62
2,10,66,23,20
3,52,15,56,85
4,32,16,71,48
5,13,81,76,54


In [22]:
type(df['D'])

pandas.core.series.Series

In [18]:
df.shape

(6, 4)

In [19]:
# head will show first 5 Row by default
df.head()

Unnamed: 0,A,B,C,D
0,72,74,73,82
1,62,19,72,62
2,10,66,23,20
3,52,15,56,85
4,32,16,71,48


In [20]:
df.head(n = 2)

Unnamed: 0,A,B,C,D
0,72,74,73,82
1,62,19,72,62


In [21]:
# tail will show last row by default
df.tail(n=2)

Unnamed: 0,A,B,C,D
4,32,16,71,48
5,13,81,76,54


In [30]:
# Extracting Columns
# Single col
df['C']

0    20
1    11
2    63
3    81
4    71
5    68
Name: C, dtype: int64

In [32]:
# Multiple Col
# cols = ["A", "B"]
# df[cols]
df[["A","B"]]

Unnamed: 0,A,B
0,28,41
1,62,23
2,44,11
3,65,87
4,54,41
5,44,86


In [33]:
# Different Sequence
df[["B", "C", "D"]]

Unnamed: 0,B,C,D
0,41,20,59
1,23,11,30
2,11,63,29
3,87,81,91
4,41,71,47
5,86,68,68


In [35]:
# Add New Column
df["A+B"] = df["A"] + df["B"]
df

Unnamed: 0,A,B,C,D,A+B
0,28,41,20,59,69
1,62,23,11,30,85
2,44,11,63,29,55
3,65,87,81,91,152
4,54,41,71,47,95
5,44,86,68,68,130


In [61]:
df["A-B"] = df["A"] * df["B"]

In [62]:
df

Unnamed: 0,A,B,C,D,A+B,A-B
0,28,41,20,59,69,1148
1,62,23,11,30,85,1426
2,44,11,63,29,55,484
3,65,87,81,91,152,5655
4,54,41,71,47,95,2214
5,44,86,68,68,130,3784


In [56]:
# Delete From DataFrame
df.drop(columns=["A-B"])

Unnamed: 0,A,B,C,D,A+B
0,28,41,20,59,69
1,62,23,11,30,85
2,44,11,63,29,55
3,65,87,81,91,152
4,54,41,71,47,95
5,44,86,68,68,130


In [57]:
# Check df => A-B is still there bcz drop function is modifying and giving new dataframe.
df

Unnamed: 0,A,B,C,D,A+B,A-B
0,28,41,20,59,69,1148
1,62,23,11,30,85,1426
2,44,11,63,29,55,484
3,65,87,81,91,152,5655
4,54,41,71,47,95,2214
5,44,86,68,68,130,3784


In [64]:
# one method is => store it in new variable.
# df = df.drop(columns=["A-B"])
# second method is => set inplace attribute = True this method will modify the same dataframe
df.drop(columns=["A-B"], inplace=True)

In [66]:
df

Unnamed: 0,A,B,C,D,A+B
0,28,41,20,59,69
1,62,23,11,30,85
2,44,11,63,29,55
3,65,87,81,91,152
4,54,41,71,47,95
5,44,86,68,68,130


<h2>Indexing/Extracting Data</h2>

In [71]:
# We hardly change indeces but we will change in this case
df.index = "p q r s t u".split()

In [72]:
df

Unnamed: 0,A,B,C,D,A+B
p,28,41,20,59,69
q,62,23,11,30,85
r,44,11,63,29,55
s,65,87,81,91,152
t,54,41,71,47,95
u,44,86,68,68,130


In [75]:
# Property called loc - location
# Extracting Row
df.loc["p"]

A      28
B      41
C      20
D      59
A+B    69
Name: p, dtype: int64

In [76]:
# iloc = integer location
# Doesn't Matter what type indeces we have, we just have to provide number to acess
df.iloc[0]

A      28
B      41
C      20
D      59
A+B    69
Name: p, dtype: int64

In [78]:
df[2: 5]

Unnamed: 0,A,B,C,D,A+B
r,44,11,63,29,55
s,65,87,81,91,152
t,54,41,71,47,95


In [79]:
df[2: 5][["A", "B"]]

Unnamed: 0,A,B
r,44,11
s,65,87
t,54,41


In [80]:
df.iloc[-2:][["D", "A+B"]]

Unnamed: 0,D,A+B
t,47,95
u,68,130


In [81]:
df.iloc[-2:, -2:]

Unnamed: 0,D,A+B
t,47,95
u,68,130


In [85]:
df.iloc[-1:,-1:]

Unnamed: 0,A+B
u,130


# Masking And Boolean Indexing

In [86]:
df

Unnamed: 0,A,B,C,D,A+B
p,28,41,20,59,69
q,62,23,11,30,85
r,44,11,63,29,55
s,65,87,81,91,152
t,54,41,71,47,95
u,44,86,68,68,130


In [88]:
# Masking
mask = df > 30
mask

Unnamed: 0,A,B,C,D,A+B
p,False,True,False,True,True
q,True,False,False,False,True
r,True,False,True,False,True
s,True,True,True,True,True
t,True,True,True,True,True
u,True,True,True,True,True


In [90]:
# df[df>30]
df[mask]

Unnamed: 0,A,B,C,D,A+B
p,,41.0,,59.0,69
q,62.0,,,,85
r,44.0,,63.0,,55
s,65.0,87.0,81.0,91.0,152
t,54.0,41.0,71.0,47.0,95
u,44.0,86.0,68.0,68.0,130


In [92]:
mask = df["B"] > 40
mask

p     True
q    False
r    False
s     True
t     True
u     True
Name: B, dtype: bool

In [93]:
# get all the rows which are true at "B"
df[mask]

Unnamed: 0,A,B,C,D,A+B
p,28,41,20,59,69
s,65,87,81,91,152
t,54,41,71,47,95
u,44,86,68,68,130


In [94]:
# Extracting rows where B column has value > 40
df[ df["B"] > 40 ]

Unnamed: 0,A,B,C,D,A+B
p,28,41,20,59,69
s,65,87,81,91,152
t,54,41,71,47,95
u,44,86,68,68,130


In [95]:
# Extracting values from Col C and Col D where B column has value > 40
df[ df["B"] > 40 ][["B", "C"]]

Unnamed: 0,B,C
p,41,20
s,87,81
t,41,71
u,86,68


<h2>And , OR , NOT </h2>

In [99]:
df["A"] > 40

p    False
q     True
r     True
s     True
t     True
u     True
Name: A, dtype: bool

In [100]:
df["D"] > 40

p     True
q    False
r    False
s     True
t     True
u     True
Name: D, dtype: bool

In [101]:
(df["A"] > 40) & (df["D"] > 40)

p    False
q    False
r    False
s     True
t     True
u     True
dtype: bool

In [110]:
# Extract Row which satisfy this condition
df[(df["A"] > 40) & (df["D"] > 40)]

Unnamed: 0,A,B,C,D,A+B
s,65,87,81,91,152
t,54,41,71,47,95
u,44,86,68,68,130


In [112]:
# Convert DataFrame Into Numpy Array
df_array = df.values
df_array

array([[ 28,  41,  20,  59,  69],
       [ 62,  23,  11,  30,  85],
       [ 44,  11,  63,  29,  55],
       [ 65,  87,  81,  91, 152],
       [ 54,  41,  71,  47,  95],
       [ 44,  86,  68,  68, 130]])

# Iris DataSet Introduction

In [113]:
# .csv files => comma separated file

In [114]:
!ls

 iris.csv				      Numpy.ipynb
 Jupyter-overview.ipynb			      Pandas.ipynb
'Machine Learning A-Z (Codes and Datasets)'


In [116]:
iris = pd.read_csv("./iris.csv")

In [117]:
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [118]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [119]:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [121]:
# total of 150 data points,
# columns are 5
iris.shape

(150, 5)

In [120]:
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [122]:
iris.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [123]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [125]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [130]:
# how many different types of flower
iris["species"].nunique()

3

In [131]:
# three different types of flower
iris["species"].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [133]:
# How Many setosa flowers are there?
iris[ iris['species'] == "setosa"].shape

(50, 5)

In [135]:
# Alternative
iris['species'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [136]:
iris['species'].value_counts()['versicolor']

50

In [137]:
iris['sepal_length'].sum()

876.5

In [138]:
iris['sepal_length'].min()

4.3

In [139]:
iris.sort_values(by='sepal_length')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
13,4.3,3.0,1.1,0.1,setosa
42,4.4,3.2,1.3,0.2,setosa
38,4.4,3.0,1.3,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
41,4.5,2.3,1.3,0.3,setosa
...,...,...,...,...,...
122,7.7,2.8,6.7,2.0,virginica
118,7.7,2.6,6.9,2.3,virginica
117,7.7,3.8,6.7,2.2,virginica
135,7.7,3.0,6.1,2.3,virginica


In [140]:
# if sepal_length is same then check sepal_width 
iris.sort_values(by=['sepal_length', 'sepal_width'])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
13,4.3,3.0,1.1,0.1,setosa
8,4.4,2.9,1.4,0.2,setosa
38,4.4,3.0,1.3,0.2,setosa
42,4.4,3.2,1.3,0.2,setosa
41,4.5,2.3,1.3,0.3,setosa
...,...,...,...,...,...
118,7.7,2.6,6.9,2.3,virginica
122,7.7,2.8,6.7,2.0,virginica
135,7.7,3.0,6.1,2.3,virginica
117,7.7,3.8,6.7,2.2,virginica


<h3>Grouping Data</h3>

In [143]:
# apply function will perform the function to every row
iris['species'].apply(len)

0      6
1      6
2      6
3      6
4      6
      ..
145    9
146    9
147    9
148    9
149    9
Name: species, Length: 150, dtype: int64

In [146]:
iris.apply( lambda x : x + x)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,10.2,7.0,2.8,0.4,setosasetosa
1,9.8,6.0,2.8,0.4,setosasetosa
2,9.4,6.4,2.6,0.4,setosasetosa
3,9.2,6.2,3.0,0.4,setosasetosa
4,10.0,7.2,2.8,0.4,setosasetosa
...,...,...,...,...,...
145,13.4,6.0,10.4,4.6,virginicavirginica
146,12.6,5.0,10.0,3.8,virginicavirginica
147,13.0,6.0,10.4,4.0,virginicavirginica
148,12.4,6.8,10.8,4.6,virginicavirginica


## Grouping Data Together

In [148]:
iris.aggregate('min')

sepal_length       4.3
sepal_width        2.0
petal_length       1.0
petal_width        0.1
species         setosa
dtype: object

In [150]:
iris.aggregate(['min','max','mean','median'])

  iris.aggregate(['min','max','mean','median'])


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
min,4.3,2.0,1.0,0.1,setosa
max,7.9,4.4,6.9,2.5,virginica
mean,5.843333,3.054,3.758667,1.198667,
median,5.8,3.0,4.35,1.3,


In [152]:
groupby = iris.groupby('species')
groupby

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe93591faf0>

In [153]:
groupby.min()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,4.3,2.3,1.0,0.1
versicolor,4.9,2.0,3.0,1.0
virginica,4.9,2.2,4.5,1.4


In [154]:
groupby.count()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,50,50,50,50
versicolor,50,50,50,50
virginica,50,50,50,50


In [158]:
# Take Transpose to see all data
groupby.describe()

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,...,petal_length,petal_length,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
setosa,50.0,5.006,0.35249,4.3,4.8,5.0,5.2,5.8,50.0,3.418,...,1.575,1.9,50.0,0.244,0.10721,0.1,0.2,0.2,0.3,0.6
versicolor,50.0,5.936,0.516171,4.9,5.6,5.9,6.3,7.0,50.0,2.77,...,4.6,5.1,50.0,1.326,0.197753,1.0,1.2,1.3,1.5,1.8
virginica,50.0,6.588,0.63588,4.9,6.225,6.5,6.9,7.9,50.0,2.974,...,5.875,6.9,50.0,2.026,0.27465,1.4,1.8,2.0,2.3,2.5


# Handling Missing Data
- dropna()
- fillna()

In [240]:
import warnings
warnings.filterwarnings('ignore')

In [340]:
iris = pd.read_csv("./iris.csv")

In [341]:
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [342]:
# we are explicitly create NaN values then we will deal with it just for example
nan_idx = np.random.randint(0, 150, 20)
iris['sepal_length'][nan_idx] = np.nan

In [343]:
iris['sepal_length']

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    NaN
146    6.3
147    6.5
148    NaN
149    5.9
Name: sepal_length, Length: 150, dtype: float64

In [344]:
# we are explicitly create NaN values then we will deal with it just for example
nan_idx = np.random.randint(0, 150, 15)
iris['petal_length'][nan_idx] = np.nan

In [345]:
iris['petal_length']

0      1.4
1      1.4
2      1.3
3      1.5
4      1.4
      ... 
145    5.2
146    5.0
147    5.2
148    5.4
149    5.1
Name: petal_length, Length: 150, dtype: float64

In [346]:
# Now we Have to treat these NaN values 
# 1. delete
# 2. add another value (approximation)

In [347]:
# To Check NaN value => isna, we get a Mask Value
iris.isna().sum()

sepal_length    20
sepal_width      0
petal_length    14
petal_width      0
species          0
dtype: int64

In [339]:
# Drop all rows where values is NaN
###### iris.dropna()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
...,...,...,...,...,...
143,6.8,3.2,5.9,2.3,virginica
144,6.7,3.3,5.7,2.5,virginica
145,6.7,3.0,5.2,2.3,virginica
147,6.5,3.0,5.2,2.0,virginica


In [349]:
# To fill the values where NaN
iris['sepal_length'].fillna(value='FILLTHIS')

0           5.1
1           4.9
2           4.7
3           4.6
4           5.0
         ...   
145    FILLTHIS
146         6.3
147         6.5
148    FILLTHIS
149         5.9
Name: sepal_length, Length: 150, dtype: object

In [354]:
iris['sepal_length'] = iris['sepal_length'].fillna(value=round(iris['sepal_length'].mean(), 1))

In [352]:
round(iris['sepal_length'].mean(), 1)

5.8

In [371]:
iris['petal_length'] = iris['petal_length'].fillna(value=round(iris['petal_length'].mean(), 1))

In [373]:
round(iris['petal_length'].mean(), 1)

3.7

In [374]:
iris.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

# Concatenate/Merge Dataframes

In [376]:
new_df = pd.DataFrame( np.random.randint(0, 7, size=(10,4)))

In [377]:
new_df

Unnamed: 0,0,1,2,3
0,1,0,0,2
1,0,4,3,6
2,4,1,2,3
3,6,6,0,3
4,1,3,6,5
5,6,2,4,5
6,2,3,0,3
7,0,5,3,5
8,4,5,4,6
9,0,4,0,3


In [378]:
new_df['species'] = 'roses'

In [379]:
new_df.head()

Unnamed: 0,0,1,2,3,species
0,1,0,0,2,roses
1,0,4,3,6,roses
2,4,1,2,3,roses
3,6,6,0,3,roses
4,1,3,6,5,roses


In [381]:
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [382]:
new_df.columns = iris.columns

In [383]:
new_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,1,0,0,2,roses
1,0,4,3,6,roses
2,4,1,2,3,roses
3,6,6,0,3,roses
4,1,3,6,5,roses


In [385]:
# To Concatenate Vertically axis = 0, and horizontally axis = 1

In [387]:
iris = pd.concat((iris, new_df), axis = 0)

In [389]:
iris.shape

(160, 5)

# Merging Dataframes

In [396]:
df1 = pd.DataFrame({'S_Name' : ["Prateek", "Mohit", "Jatin", "Shubhang"],
                   "CGPA": [2, 4, 5, 3]})

In [397]:
df1

Unnamed: 0,S_Name,CGPA
0,Prateek,2
1,Mohit,4
2,Jatin,5
3,Shubhang,3


In [398]:
df2 = pd.DataFrame({'T_Name' : ["Prateek", "Mohit", "Jatin", "Shubhang"],
                   "CGPA": [3, 6, 8, 9]})

In [399]:
df2

Unnamed: 0,T_Name,CGPA
0,Prateek,3
1,Mohit,6
2,Jatin,8
3,Shubhang,9


In [400]:
df1.merge(df2, how = 'inner')

Unnamed: 0,S_Name,CGPA,T_Name
0,Shubhang,3,Prateek


In [401]:
df1.merge(df2, how = 'left')

Unnamed: 0,S_Name,CGPA,T_Name
0,Prateek,2,
1,Mohit,4,
2,Jatin,5,
3,Shubhang,3,Prateek


In [402]:
df1.merge(df2, how = 'right')

Unnamed: 0,S_Name,CGPA,T_Name
0,Shubhang,3,Prateek
1,,6,Mohit
2,,8,Jatin
3,,9,Shubhang


In [403]:
df1.merge(df2, how = 'outer')

Unnamed: 0,S_Name,CGPA,T_Name
0,Prateek,2,
1,Mohit,4,
2,Jatin,5,
3,Shubhang,3,Prateek
4,,6,Mohit
5,,8,Jatin
6,,9,Shubhang


# Output Files

In [404]:
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
5,6.0,2.0,4.0,5.0,roses
6,2.0,3.0,0.0,3.0,roses
7,0.0,5.0,3.0,5.0,roses
8,4.0,5.0,4.0,6.0,roses


In [405]:
# It will create csv file in local storage with index value
iris.to_csv('./modified_iris_withIndex')

In [406]:
modified_file = pd.read_csv('./modified_iris_withIndex')

In [407]:
modified_file

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0,5.1,3.5,1.4,0.2,setosa
1,1,4.9,3.0,1.4,0.2,setosa
2,2,4.7,3.2,1.3,0.2,setosa
3,3,4.6,3.1,1.5,0.2,setosa
4,4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...,...
155,5,6.0,2.0,4.0,5.0,roses
156,6,2.0,3.0,0.0,3.0,roses
157,7,0.0,5.0,3.0,5.0,roses
158,8,4.0,5.0,4.0,6.0,roses


In [412]:
# So to solve this without index file
iris.to_csv('./modified_iris_withOutIndex', index=False)

In [413]:
modified_file = pd.read_csv('./modified_iris_withOutIndex')

In [414]:
modified_file

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
155,6.0,2.0,4.0,5.0,roses
156,2.0,3.0,0.0,3.0,roses
157,0.0,5.0,3.0,5.0,roses
158,4.0,5.0,4.0,6.0,roses
