# ================ Data Cleaning ================

In [1]:
import pandas as pd

## ----------- Handling Missing Data -----------
> Missing values often appear as NaN (Not a Number).

In [None]:
df = pd.read_csv("weather_data.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


#### Ensure date date-type

In [377]:
print(type(df.day[0]))
df = pd.read_csv("weather_data.csv",parse_dates=['day'])
print(type(df.day[0]))
df

<class 'float'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


  print(type(df.day[0]))


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


#### Use date col as index

In [378]:
df.set_index('day',inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### ---------- Check for missing values ----------

In [268]:
print(df.isnull())          # Shows True/False for missing values
print("\n----------------------")
print(df.isnull().sum())    # Count missing values per column
print("\n----------------------")
df.notnull()                # Opposite of isnull()

            temperature  windspeed  event
day                                      
2017-01-01        False      False  False
2017-01-04         True      False  False
2017-01-05        False       True  False
2017-01-06         True      False   True
2017-01-07        False       True  False
2017-01-08         True       True  False
2017-01-09         True       True   True
2017-01-10        False      False  False
2017-01-11        False      False  False

----------------------
temperature    4
windspeed      4
event          2
dtype: int64

----------------------


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,True,True,True
2017-01-04,False,True,True
2017-01-05,True,False,True
2017-01-06,False,True,False
2017-01-07,True,False,True
2017-01-08,False,False,True
2017-01-09,False,False,False
2017-01-10,True,True,True
2017-01-11,True,True,True


In [383]:
print(df.isna())          # Shows True/False for missing values
print("\n----------------------")
print(df.isna().sum())    # Count missing values per column
print("\n----------------------")
df.notna()                # Opposite of isnull()

            temperature  windspeed  event
2017-01-01        False      False  False
2017-01-02         True       True   True
2017-01-03         True       True   True
2017-01-04         True      False  False
2017-01-05        False       True  False
2017-01-06         True      False   True
2017-01-07        False       True  False
2017-01-08         True       True  False
2017-01-09         True       True   True
2017-01-10        False      False  False
2017-01-11        False      False  False

----------------------
temperature    6
windspeed      6
event          4
dtype: int64

----------------------


Unnamed: 0,temperature,windspeed,event
2017-01-01,True,True,True
2017-01-02,False,False,False
2017-01-03,False,False,False
2017-01-04,False,True,True
2017-01-05,True,False,True
2017-01-06,False,True,False
2017-01-07,True,False,True
2017-01-08,False,False,True
2017-01-09,False,False,False
2017-01-10,True,True,True


### ---------- Drop missing values ----------
> DataFrame.dropna(*, axis=0/"rows", how='any', thresh=<no_default>, subset=None, inplace=False, ignore_index=False)

In [269]:
df.dropna()                # Drop rows with ANY missing values

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [329]:
df.dropna(axis=1)      # Determine which axis to remove

Unnamed: 0,day
0,1/1/2017
1,1/4/2017
2,1/5/2017
3,1/6/2017
4,1/7/2017
5,1/8/2017
6,1/9/2017
7,1/10/2017
8,1/11/2017


In [330]:
df.dropna(axis="columns")

Unnamed: 0,day
0,1/1/2017
1,1/4/2017
2,1/5/2017
3,1/6/2017
4,1/7/2017
5,1/8/2017
6,1/9/2017
7,1/10/2017
8,1/11/2017


In [318]:
df.dropna(how='all')       # Drop rows only if ALL values are missing

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [333]:
df.dropna(subset=['event', 'temperature']) # Drop rows only if val missing in the certain col

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
2,1/5/2017,28.0,,Snow
4,1/7/2017,32.0,,Rain
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [328]:
# ❗❗ Cannot be combind with 'how'❗❗
df.dropna(thresh=2) # Among consecutive NaNs, drop only n(rows) = threshold value only

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


### Fill missing values

#### ------ fillna() ------
> DataFrame.fillna(value=None, *, method=None, axis=None, inplace=False, limit=None, downcast=<no_default>)

In [335]:
# Filling '0' in place of on all occurances of NaN
df.fillna(0)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,0.0,9.0,Sunny
2,1/5/2017,28.0,0.0,Snow
3,1/6/2017,0.0,7.0,0
4,1/7/2017,32.0,0.0,Rain
5,1/8/2017,0.0,0.0,Sunny
6,1/9/2017,0.0,0.0,0
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [344]:
df.fillna(0, limit=2)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,0.0,9.0,Sunny
2,1/5/2017,28.0,0.0,Snow
3,1/6/2017,0.0,7.0,0
4,1/7/2017,32.0,0.0,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,0
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [336]:
# ❓❓ Why not 0.0 this tie ❓❓
df.fillna(0, axis=1)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,0.0,9.0,Sunny
2,1/5/2017,28.0,0.0,Snow
3,1/6/2017,0.0,7.0,0
4,1/7/2017,32.0,0.0,Rain
5,1/8/2017,0.0,0.0,Sunny
6,1/9/2017,0.0,0.0,0
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [341]:
df.fillna(0, axis=1, limit=1)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,0.0,9.0,Sunny
2,1/5/2017,28.0,0.0,Snow
3,1/6/2017,0.0,7.0,
4,1/7/2017,32.0,0.0,Rain
5,1/8/2017,0.0,,Sunny
6,1/9/2017,0.0,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [342]:
df.fillna(0, axis=1, limit=2)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,0.0,9.0,Sunny
2,1/5/2017,28.0,0.0,Snow
3,1/6/2017,0.0,7.0,0
4,1/7/2017,32.0,0.0,Rain
5,1/8/2017,0.0,0.0,Sunny
6,1/9/2017,0.0,0.0,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


#### > Fill NaN values with specific values for particular columns

In [275]:
new_df = df.fillna({
        'temperature': 0,
        'windspeed': 0,
        'event': 'No Event'
    })
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,No Event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,No Event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


#### > Filling NaNs with values comparitive to other cells instead of fixed values

In [351]:
# df['temperature'].fillna(df['temperature'].mean(), inplace=True)   # ❗❗NOT THE PREFERRED WAY❗❗
df.fillna({'temperature': df['temperature'].mean()}, inplace=True)   # Fill with mean
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,33.2,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,33.2,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,33.2,,Sunny
6,1/9/2017,33.2,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


##### method & axis

In [277]:
new_df = df.fillna(method="ffill")   # fill in 'top-to-bottom' manner
# OR
# new_df = df.fillna(method="ffill", axis="index")   # fill in 'top-to-bottom' manner
# OR
# new_df = df.fillna(method="bfill")   # fill in 'bottom-to-top' manner
# OR
# new_df = df.fillna(method="ffill", axis=1)   # fill in 'left-to-right' manner
# OR
# new_df = df.fillna(method="bfill", axis=1)   # fill in 'right-to-left' manner
# OR
# new_df = df.fillna(method="bfill", axis="columns")   # fill in 'right-to-left' manner
new_df

  new_df = df.fillna(method="ffill")   # fill in 'top-to-bottom' manner


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.2,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,33.2,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,33.2,7.0,Sunny
2017-01-09,33.2,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


##### limit

In [281]:
new_df = df.fillna(method="ffill",limit=1)
new_df

  new_df = df.fillna(method="ffill",limit=1)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.2,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,33.2,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,33.2,,Sunny
2017-01-09,33.2,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


#### Non-deprecating methods

In [282]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.2,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,33.2,7.0,
2017-01-07,32.0,,Rain
2017-01-08,33.2,,Sunny
2017-01-09,33.2,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [283]:
df.ffill()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.2,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,33.2,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,33.2,7.0,Sunny
2017-01-09,33.2,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [284]:
df.bfill()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.2,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,33.2,7.0,Rain
2017-01-07,32.0,8.0,Rain
2017-01-08,33.2,8.0,Sunny
2017-01-09,33.2,8.0,Cloudy
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [285]:
df.ffill(limit=1)

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.2,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,33.2,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,33.2,,Sunny
2017-01-09,33.2,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [352]:
df.ffill(limit=1, axis=1)

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,33.2,9.0,Sunny
2,1/5/2017,28.0,28.0,Snow
3,1/6/2017,33.2,7.0,7.0
4,1/7/2017,32.0,32.0,Rain
5,1/8/2017,33.2,33.2,Sunny
6,1/9/2017,33.2,33.2,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


### ------ interpolate() ------
> DataFrame.interpolate(method='linear', *, axis=0, limit=None, inplace=False, limit_direction=None, limit_area=None, downcast=<no_default>, **kwargs)

In [368]:
# ❗❗FutureWarning: DataFrame.interpolate with object dtype is deprecated and will raise in a future version. Call obj.infer_objects(copy=False) before interpolating instead.❗❗
new_df = df.interpolate()
new_df


  new_df = df.interpolate()


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32.0,6.0,Rain
1/4/2017,33.2,9.0,Sunny
1/5/2017,28.0,8.0,Snow
1/6/2017,33.2,7.0,
1/7/2017,32.0,7.25,Rain
1/8/2017,33.2,7.5,Sunny
1/9/2017,33.2,7.75,
1/10/2017,34.0,8.0,Cloudy
1/11/2017,40.0,12.0,Sunny


In [364]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32.0,6.0,Rain
1/4/2017,33.2,9.0,Sunny
1/5/2017,28.0,,Snow
1/6/2017,33.2,7.0,
1/7/2017,32.0,,Rain
1/8/2017,33.2,,Sunny
1/9/2017,33.2,,
1/10/2017,34.0,8.0,Cloudy
1/11/2017,40.0,12.0,Sunny


In [369]:
new_df = df.interpolate(method="time") 
new_df

  new_df = df.interpolate(method="time")


ValueError: time-weighted interpolation only works on Series or DataFrames with a DatetimeIndex

In [366]:
new_df = df.interpolate(method="quadratic") 
new_df

  new_df = df.interpolate(method="quadratic")


ValueError: Index column must be numeric or datetime type when using quadratic method other than linear. Try setting a numeric or datetime index column before interpolating.

### ------ Inserting Missing Rows for Continuity in Dates ------

In [379]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [380]:
dt = pd.date_range("01-01-2017","01-11-2017")
idx = pd.DatetimeIndex(dt)
df = df.reindex(idx)
df

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy


## 2. Handling Duplicates
> When datasets have repeated rows

In [2]:
df2 = pd.DataFrame({
    'ID': [1,2,2,3,4,4],
    'Name': ['A','B','B','C','D','D']
})
df2

Unnamed: 0,ID,Name
0,1,A
1,2,B
2,2,B
3,3,C
4,4,D
5,4,D


### Find duplicates

##### count()
> .count() counts non-null values in each column

> Hence must specify col-name

> .count() ignores NaN values 

In [None]:
df2.count()   # Count duplicates

ID      6
Name    6
dtype: int64

In [9]:
df2.groupby('Name').count()

Unnamed: 0_level_0,ID
Name,Unnamed: 1_level_1
A,1
B,2
C,1
D,2


##### value_counts() 
> This one just counts how often each unique value appears in a column.

In [6]:
df2['Name'].value_counts()

Name
B    2
D    2
A    1
C    1
Name: count, dtype: int64

##### duplicated()

In [None]:
df2.duplicated()         # Returns True if answer for the question is yes:- Have I seen this value before?

0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool

In [15]:
dupes = df2.duplicated(keep=False)         # it marks all duplicates as True
dupes

0    False
1     True
2     True
3    False
4     True
5     True
dtype: bool

In [13]:
dupes = df2[df2.duplicated("email", keep=False)]
dupes

KeyError: Index(['email'], dtype='object')

In [16]:
dupes[["email"]].drop_duplicates()

KeyError: "None of [Index(['email'], dtype='object')] are in the [index]"

In [None]:
df2.duplicated().sum()   # Count duplicates

np.int64(2)

### Drop duplicates
> DataFrame.drop_duplicates(subset=None, *, keep='first', inplace=False, ignore_index=False)

In [302]:
df2.drop_duplicates()         # Keep first occurrence

Unnamed: 0,ID,Name
0,1,A
1,2,B
3,3,C
4,4,D


In [303]:
df2.drop_duplicates(keep='last')  # Keep last occurrence

Unnamed: 0,ID,Name
0,1,A
2,2,B
3,3,C
5,4,D


In [382]:
df2.drop_duplicates(keep=False)  # Drop all duplicates.

Unnamed: 0,ID,Name
0,1,A
3,3,C


In [304]:
df2.drop_duplicates(subset=['Name'])  # Check only specific column

Unnamed: 0,ID,Name
0,1,A
1,2,B
3,3,C
4,4,D


## 3. Handling Inconsistent Data
> Handling values that are not uniform

In [305]:
df3 = pd.DataFrame({
    'City': ['delhi', 'Delhi ', 'MUMBAI', 'mumbai ', 'Bangalore']
})
df3

Unnamed: 0,City
0,delhi
1,Delhi
2,MUMBAI
3,mumbai
4,Bangalore


### Strip extra spaces

In [306]:
df3['City'] = df3['City'].str.strip()
df3

Unnamed: 0,City
0,delhi
1,Delhi
2,MUMBAI
3,mumbai
4,Bangalore


### Make consistent case

In [307]:
df3['City'] = df3['City'].str.title()   # First letter uppercase
df3

Unnamed: 0,City
0,Delhi
1,Delhi
2,Mumbai
3,Mumbai
4,Bangalore


In [308]:
df3['City'] = df3['City'].str.lower()   # all lowercase
df3

Unnamed: 0,City
0,delhi
1,delhi
2,mumbai
3,mumbai
4,bangalore


In [309]:
df3['City'] = df3['City'].str.upper()   # all uppercase
df3

Unnamed: 0,City
0,DELHI
1,DELHI
2,MUMBAI
3,MUMBAI
4,BANGALORE


## 4. Detecting Outliers
> Handling values that are way off

In [310]:
df4 = pd.DataFrame({
    'Salary': [30000, 40000, 50000, 1000000]   # 1M looks like an outlier
})
df4

# Simple rule: Z-score or IQR can be used (advanced part)

Unnamed: 0,Salary
0,30000
1,40000
2,50000
3,1000000


In [311]:
df4.describe()   # Spot extreme values

Unnamed: 0,Salary
count,4.0
mean,280000.0
std,480069.439422
min,30000.0
25%,37500.0
50%,45000.0
75%,287500.0
max,1000000.0
