# **<center><span style= "color:#2F539B;">Handling Inconsistencies in data</span></center>**

## ***<span style= "color:maroon;">Contents </span>***
- ***`Imports`*** 
- **`Handling missing values with pandas methods`** 
- **`Handling with sk_learn`**
- **`Handling with KKN algo`**

In [None]:
import numpy as np
import pandas as pd

In [36]:
data = {
    'date': ['2021-12-01', '01-12-2022', '2022/12/01', '12-01-2021'],
    'country': ['USA', 'U.S.A.', 'America', 'United States'],
    'name': ['Aammar', 'Amaar', 'Hamza', 'Hazma'],
    'sales_2020': [100, 200, None, 200],
    'sales_2021': [None, 150, 300, 150]
}
df = pd.DataFrame(data)

In [37]:
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Aammar,100.0,
1,01-12-2022,U.S.A.,Amaar,200.0,150.0
2,2022/12/01,America,Hamza,,300.0
3,12-01-2021,United States,Hazma,200.0,150.0


In [38]:
# standardizing the data format
df['date'] = pd.to_datetime(df['date'], errors= 'coerce')
df['date'] = df['date'].dt.strftime('%Y-%m-%d')

In [39]:
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Aammar,100.0,
1,,U.S.A.,Amaar,200.0,150.0
2,,America,Hamza,,300.0
3,,United States,Hazma,200.0,150.0


In [40]:
df['date'] = df['date'].ffill()

In [41]:
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Aammar,100.0,
1,2021-12-01,U.S.A.,Amaar,200.0,150.0
2,2021-12-01,America,Hamza,,300.0
3,2021-12-01,United States,Hazma,200.0,150.0


In [42]:
# Harmonize the name of the country
country_mapping = {
    'USA': 'Unitied States',
    'U.S.A.': 'Unitied States',
    'America': 'Unitied States',
}
df['country'] = df['country'].replace(country_mapping)

In [43]:
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,Unitied States,Aammar,100.0,
1,2021-12-01,Unitied States,Amaar,200.0,150.0
2,2021-12-01,Unitied States,Hamza,,300.0
3,2021-12-01,United States,Hazma,200.0,150.0


In [44]:
df['name'] = df['name'].replace({'Hazma': 'Hamza','Aammar': 'Amaar'})

In [45]:
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,Unitied States,Amaar,100.0,
1,2021-12-01,Unitied States,Amaar,200.0,150.0
2,2021-12-01,Unitied States,Hamza,,300.0
3,2021-12-01,United States,Hamza,200.0,150.0


In [34]:
#drop duplicates
df = df.drop_duplicates(subset='name')

In [46]:
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,Unitied States,Amaar,100.0,
1,2021-12-01,Unitied States,Amaar,200.0,150.0
2,2021-12-01,Unitied States,Hamza,,300.0
3,2021-12-01,United States,Hamza,200.0,150.0


In [47]:
# resolving the contradictory
# lets assume sales_2021 should always be higher than sales_2020 
# we'll just remove rows where this condition is not met
df = df.drop(df[df['sales_2021'] <= df['sales_2020']].index)

In [48]:
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,Unitied States,Amaar,100.0,
2,2021-12-01,Unitied States,Hamza,,300.0
