In [2]:
import pandas as pd
import numpy as np

In [2]:
# Sample rows for the demo frame: each entry becomes one [name, phone number] row.
data = [
    [name, phone_number]
    for name, phone_number in {
        "John": 98556556846,
        "Harsh": 64524248226,
        "Heet": 89824565135,
        "Dhruvi": 48486318987,
    }.items()
]

In [3]:
# Build the frame from the row lists, naming the two columns explicitly.
df = pd.DataFrame(
    data=data,
    columns=["Name", "Phone Number"],
)

In [4]:
# Display the frame built from the row lists.
df

Unnamed: 0,Name,Phone Number
0,John,98556556846
1,Harsh,64524248226
2,Heet,89824565135
3,Dhruvi,48486318987


In [5]:
# Append an "Age" column; the list supplies one value per existing row.
df["Age"] = [45, 32, 12, 87]

In [6]:
# Display the frame again; it now includes the Age column.
df

Unnamed: 0,Name,Phone Number,Age
0,John,98556556846,45
1,Harsh,64524248226,32
2,Heet,89824565135,12
3,Dhruvi,48486318987,87


In [7]:
# Insert "Marks" at column position 3, i.e. directly after "Age".
df.insert(loc=3, column="Marks", value=[98, 87, 95, 89])

In [8]:
# Display the frame; Marks now appears as the last column.
df

Unnamed: 0,Name,Phone Number,Age,Marks
0,John,98556556846,45,98
1,Harsh,64524248226,32,87
2,Heet,89824565135,12,95
3,Dhruvi,48486318987,87,89


In [9]:
# drop() hands back a new frame without "Marks"; df itself is left untouched.
df.drop(columns="Marks")

Unnamed: 0,Name,Phone Number,Age
0,John,98556556846,45
1,Harsh,64524248226,32
2,Heet,89824565135,12
3,Dhruvi,48486318987,87


In [10]:
# With inplace=True the column is removed from df directly and the call
# returns None, so this cell displays nothing.
df.drop(columns="Marks", inplace=True)

In [11]:
# Confirm that Marks is gone from df after the in-place drop.
df

Unnamed: 0,Name,Phone Number,Age
0,John,98556556846,45
1,Harsh,64524248226,32
2,Heet,89824565135,12
3,Dhruvi,48486318987,87


# Data Cleaning

In [3]:
# Toy dataset with missing values for the cleaning examples: 'a' has one
# missing entry, 'b' has two, and 'c' is complete.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
data = {
    'a': [7, 3, np.nan],
    'b': [3, np.nan, np.nan],
    'c': [9, 3, 1],
}

In [4]:
# Inspect the raw dictionary before turning it into a DataFrame.
data

{'a': [7, 3, nan], 'b': [3, nan, nan], 'c': [9, 3, 1]}

In [9]:
# Turn the dictionary into a frame (its keys become the column names) and show it.
df = pd.DataFrame(data=data)
df

Unnamed: 0,a,b,c
0,7.0,3.0,9
1,3.0,,3
2,,,1


In [8]:
# Removing null values
# dropna() returns a new DataFrame in which every row containing a null
# has been removed; it doesn't change df itself.

df.dropna(axis=0, how="any")

Unnamed: 0,a,b,c
0,7.0,3.0,9


In [10]:
# df still holds its NaN rows: dropna() returned a new frame instead of modifying df.
df

Unnamed: 0,a,b,c
0,7.0,3.0,9
1,3.0,,3
2,,,1


In [14]:
# Replacing the null values with a placeholder
# fillna() returns a new DataFrame, so df itself is not changed.

df.fillna(value=9999)

Unnamed: 0,a,b,c
0,7.0,3.0,9
1,3.0,9999.0,3
2,9999.0,9999.0,1


In [15]:
# df is unchanged here as well: fillna() returned a new frame rather than modifying df.
df

Unnamed: 0,a,b,c
0,7.0,3.0,9
1,3.0,,3
2,,,1


In [16]:
# Mean of column 'c', computed with the pandas Series method.
df['c'].mean()

4.333333333333333

In [17]:
# The same mean via NumPy's np.mean, for comparison with the pandas method.
np.mean(df['c'])

4.333333333333333

In [19]:
# Fill the missing entry in 'a' with the mean of the values that are present.
df['a'] = df['a'].fillna(value=df['a'].mean())

In [20]:
# Column 'a' no longer has a null; 'b' still does.
df

Unnamed: 0,a,b,c
0,7.0,3.0,9
1,3.0,,3
2,5.0,,1


In [21]:
# Fill the missing entries in 'b' with the mean of its non-null values.
df['b'] = df['b'].fillna(value=df['b'].mean())

In [22]:
# All nulls in df are now filled.
df

Unnamed: 0,a,b,c
0,7.0,3.0,9
1,3.0,3.0,3
2,5.0,3.0,1


In [23]:
# Preview the CSV contents; the frame is only displayed here, not assigned.
pd.read_csv('mydataset.csv')

Unnamed: 0,Name,Age,Gender,Height,Weight
0,Alice,25.0,F,5.5,65.0
1,Bob,,M,6.0,80.0
2,Charlie,35.0,,,77.0
3,David,45.0,M,5.8,
4,Edward,55.0,M,5.9,88.0
5,Frank,40.0,M,6.1,90.0
6,George,65.0,M,6.2,75.0
7,Hank,35.0,M,5.9,77.0
8,Alice,25.0,F,5.5,65.0


In [28]:
# Load the Excel workbook into a DataFrame named mydf.
mydf = pd.read_excel('cwc_winners.xlsx')

In [32]:
# Winner recorded in the row labelled 2. A single .loc lookup replaces the
# chained mydf['Winner'][2] form and states the row-label / column-label
# intent explicitly.
mydf.loc[2, 'Winner']

'Ind'

In [34]:
# Write df to CSV; index=False leaves the row index out of the file.
df.to_csv('mynewfile.csv', index=False)

In [45]:
# Load the CSV into `data` (rebinding the name that held the dict earlier)
# and display it.
data = pd.read_csv('mydataset.csv')
data

Unnamed: 0,Name,Age,Gender,Height,Weight
0,Alice,25.0,F,5.5,65.0
1,Bob,,M,6.0,80.0
2,Charlie,35.0,,,77.0
3,David,45.0,M,5.8,
4,Edward,55.0,M,5.9,88.0
5,Frank,40.0,M,6.1,90.0
6,George,65.0,M,6.2,75.0
7,Hank,35.0,M,5.9,77.0
8,Alice,25.0,F,5.5,65.0


In [46]:
# Replace the missing Age with the mean of the ages that are present.
data['Age'] = data['Age'].fillna(value=data['Age'].mean())
data

Unnamed: 0,Name,Age,Gender,Height,Weight
0,Alice,25.0,F,5.5,65.0
1,Bob,40.625,M,6.0,80.0
2,Charlie,35.0,,,77.0
3,David,45.0,M,5.8,
4,Edward,55.0,M,5.9,88.0
5,Frank,40.0,M,6.1,90.0
6,George,65.0,M,6.2,75.0
7,Hank,35.0,M,5.9,77.0
8,Alice,25.0,F,5.5,65.0


In [49]:
# Most frequent Gender value; mode() returns a Series (here with the single entry 'M').
data['Gender'].mode()

0    M
Name: Gender, dtype: object

In [50]:
# Fill the missing Gender with the most frequent value; mode() returns a
# Series, so its first entry is taken.
data['Gender'] = data['Gender'].fillna(value=data['Gender'].mode().iloc[0])
data

Unnamed: 0,Name,Age,Gender,Height,Weight
0,Alice,25.0,F,5.5,65.0
1,Bob,40.625,M,6.0,80.0
2,Charlie,35.0,M,,77.0
3,David,45.0,M,5.8,
4,Edward,55.0,M,5.9,88.0
5,Frank,40.0,M,6.1,90.0
6,George,65.0,M,6.2,75.0
7,Hank,35.0,M,5.9,77.0
8,Alice,25.0,F,5.5,65.0


In [51]:
# Replace the missing Height with the mean of the heights that are present.
data['Height'] = data['Height'].fillna(value=data['Height'].mean())
data

Unnamed: 0,Name,Age,Gender,Height,Weight
0,Alice,25.0,F,5.5,65.0
1,Bob,40.625,M,6.0,80.0
2,Charlie,35.0,M,5.8625,77.0
3,David,45.0,M,5.8,
4,Edward,55.0,M,5.9,88.0
5,Frank,40.0,M,6.1,90.0
6,George,65.0,M,6.2,75.0
7,Hank,35.0,M,5.9,77.0
8,Alice,25.0,F,5.5,65.0


In [52]:
# Replace the missing Weight with the mean of the weights that are present.
data['Weight'] = data['Weight'].fillna(value=data['Weight'].mean())
data

Unnamed: 0,Name,Age,Gender,Height,Weight
0,Alice,25.0,F,5.5,65.0
1,Bob,40.625,M,6.0,80.0
2,Charlie,35.0,M,5.8625,77.0
3,David,45.0,M,5.8,77.125
4,Edward,55.0,M,5.9,88.0
5,Frank,40.0,M,6.1,90.0
6,George,65.0,M,6.2,75.0
7,Hank,35.0,M,5.9,77.0
8,Alice,25.0,F,5.5,65.0


In [55]:
# Remove fully duplicated rows, keeping the first occurrence of each
# (the repeated Alice row at index 8 is the one that goes away).
data = data.drop_duplicates(keep='first')
data

Unnamed: 0,Name,Age,Gender,Height,Weight
0,Alice,25.0,F,5.5,65.0
1,Bob,40.625,M,6.0,80.0
2,Charlie,35.0,M,5.8625,77.0
3,David,45.0,M,5.8,77.125
4,Edward,55.0,M,5.9,88.0
5,Frank,40.0,M,6.1,90.0
6,George,65.0,M,6.2,75.0
7,Hank,35.0,M,5.9,77.0


In [57]:
# Column dtypes before converting Age: Age is float64 at this point.
data.dtypes

Name       object
Age       float64
Gender     object
Height    float64
Weight    float64
dtype: object

In [60]:
# Cast Age to integers. astype(int) truncates toward zero, so the imputed
# mean 40.625 becomes 40.
# The column is replaced through assign(), which builds a new frame, rather
# than by item assignment: `data` came out of drop_duplicates(), and writing a
# column onto such a derived frame can trigger pandas' SettingWithCopyWarning.
data = data.assign(Age=data['Age'].astype(int))
data

Unnamed: 0,Name,Age,Gender,Height,Weight
0,Alice,25,F,5.5,65.0
1,Bob,40,M,6.0,80.0
2,Charlie,35,M,5.8625,77.0
3,David,45,M,5.8,77.125
4,Edward,55,M,5.9,88.0
5,Frank,40,M,6.1,90.0
6,George,65,M,6.2,75.0
7,Hank,35,M,5.9,77.0


In [61]:
# Confirm the conversion: Age now reports an integer dtype (int32 in the recorded output).
data.dtypes

Name       object
Age         int32
Gender     object
Height    float64
Weight    float64
dtype: object

In [62]:
# Save the cleaned frame to CSV; index=False leaves the row index out of the file.
data.to_csv('cleandataset.csv', index=False)