# Data Pre-Processing || Data Cleaning
🦊 `Notebook by` [Md.Samiul Alim](https://github.com/sami0055)

😋  `Machine Learning Source Codes` [GitHub](https://github.com/sami0055/Machine-Learning)

## Handle Duplicates

In [1]:
import pandas as pd
# Toy records in which rows 3 and 5 repeat rows 0 and 1 exactly.
data = {
    'ID':   [1, 2, 3, 1, 4, 2, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'David', 'Bob', 'Eve'],
    'Age':  [25, 30, 35, 25, 40, 30, 28],
}

In [2]:
# Build the frame from the column-oriented dict and display it in full.
df = pd.DataFrame(data=data)
df

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,35
3,1,Alice,25
4,4,David,40
5,2,Bob,30
6,5,Eve,28


In [3]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      7 non-null      int64 
 1   Name    7 non-null      object
 2   Age     7 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 300.0+ bytes


In [4]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,35
3,1,Alice,25
4,4,David,40


In [5]:
# Select every row that is an exact repeat of an earlier row
# (keep='first' is the default: the first occurrence is not flagged).
duplicate_rows = df.loc[df.duplicated(keep='first')]
duplicate_rows

Unnamed: 0,ID,Name,Age
3,1,Alice,25
5,2,Bob,30


In [6]:
# duplicated() yields a boolean mask; summing it counts the repeated rows.
count_duplicates=df.duplicated().sum()

In [7]:
# Display the number of duplicate rows.
count_duplicates

2

In [8]:
# Keep only the first occurrence of each row; rebinding `df` instead of
# mutating it in place yields the same de-duplicated frame.
df = df.drop_duplicates()

In [9]:
# Show the de-duplicated frame; surviving rows keep their original index labels.
df

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,35
4,4,David,40
6,5,Eve,28


### Drop Unnecessary Columns

In [10]:
# Small sample table with an identifier, a name and a salary per row.
data = {
    'id':     [1, 2, 3],
    'Name':   ['Alice', 'Bob', 'Charlie'],
    'Salary': [50000, 60000, 70000],
}

In [11]:
# Build the frame and inspect its columns and dtypes before dropping anything.
df = pd.DataFrame(data=data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      3 non-null      int64 
 1   Name    3 non-null      object
 2   Salary  3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 204.0+ bytes


In [14]:
# Drop the 'id' column (the "unnecessary" column in this example).
# Rebinding `df` with errors='ignore' keeps the cell safe to re-run: a second
# execution finds no 'id' column and is a no-op instead of raising KeyError,
# which is what the original `inplace=True` call did when run twice.
df = df.drop(columns=['id'], errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Salary  3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes


## Handle Missing Values

In [15]:
import pandas as pd

In [16]:
# Sample records with one missing value (None) in each of Age, Salary and City.
data = {
    'Name':   ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age':    [25, 30, None, 22, 28],
    'Salary': [50000, None, 60000, 45000, 55000],
    'City':   ['New York', 'San Francisco', 'Los Angeles', None, 'Chicago'],
}

In [17]:
# Build the frame; info() reveals which columns contain missing values
# through their non-null counts.
df = pd.DataFrame(data=data)
print("Original Dataset")
df.info()

Original Dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     4 non-null      float64
 2   Salary  4 non-null      float64
 3   City    4 non-null      object 
dtypes: float64(2), object(2)
memory usage: 292.0+ bytes


In [18]:
# Drop every row that has at least one missing value.
# dropna() returns a new frame, so `df` itself is left unchanged.
df_cleaned = df.dropna(how='any')
df_cleaned

Unnamed: 0,Name,Age,Salary,City
0,Alice,25.0,50000.0,New York
4,Eva,28.0,55000.0,Chicago


In [19]:
# The original frame still contains the rows with missing values.
df.head()

Unnamed: 0,Name,Age,Salary,City
0,Alice,25.0,50000.0,New York
1,Bob,30.0,,San Francisco
2,Charlie,,60000.0,Los Angeles
3,David,22.0,45000.0,
4,Eva,28.0,55000.0,Chicago


In [None]:
# Replace missing values on a copy so the original `df` stays untouched.
df_copy = df.copy()

# Assign the filled column back instead of calling
# `df_copy[col].fillna(..., inplace=True)`: that form works on a temporary
# Series, triggers a chained-assignment warning on recent pandas 2.x releases
# and leaves `df_copy` unmodified when Copy-on-Write is enabled.
df_copy['Age'] = df_copy['Age'].fillna(df_copy['Age'].mean())            # gap -> column mean
df_copy['Salary'] = df_copy['Salary'].fillna(df_copy['Salary'].median())  # gap -> column median

# Only Age and Salary are filled here; the missing City value is left as is.
df_copy

## Data Type Conversions

In [22]:
# Sample dataset whose 'Age' column is stored as text (dtype object) although
# it holds numbers; the literal string 'None' stands in for a missing entry.
data = {
    'Name':   ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age':    ['25', '30', 'None', '22', '28'],  # ages represented as strings
    'Salary': [50000, 60000, 70000, 45000, 55000],
}

df = pd.DataFrame(data=data)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      object
 2   Salary  5 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 252.0+ bytes


In [23]:
# Parse 'Age' as numbers. errors='coerce' turns unparseable entries (here the
# string 'None') into NaN instead of raising.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     4 non-null      float64
 2   Salary  5 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 252.0+ bytes


In [24]:
# Numeric-to-string conversion: start from a single integer column.
data = {'numeric_column': [1, 2, 3, 4, 5]}

df = pd.DataFrame(data=data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   numeric_column  5 non-null      int64
dtypes: int64(1)
memory usage: 172.0 bytes


In [25]:
# Cast the integer column to strings and confirm the new dtype.
df['numeric_column'] = df['numeric_column'].astype(dtype=str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   numeric_column  5 non-null      object
dtypes: object(1)
memory usage: 172.0+ bytes


## Check for Inconsistencies in Naming Conventions

In [8]:
import pandas as pd
# Sample dataset whose 'Name' column contains inconsistent capitalisation.
data = {'Name': ['Alice', 'bob bob', 'Charlie', 'david', 'Eva']}

df = pd.DataFrame(data=data)

In [9]:
# Display the names to see the inconsistent capitalisation.
df

Unnamed: 0,Name
0,Alice
1,bob bob
2,Charlie
3,david
4,Eva


In [10]:
# Fix inconsistent naming conventions by normalising every name to Title Case.
# The vectorised `.str.title()` accessor gives the same result for strings as
# `apply(lambda x: x.title())`, but passes missing values through as NaN
# instead of raising AttributeError on them.
df['Name'] = df['Name'].str.title()

In [11]:
# Display the names after converting them to Title Case.
df

Unnamed: 0,Name
0,Alice
1,Bob Bob
2,Charlie
3,David
4,Eva


## Check for Feature Naming Formats

In [5]:
# Sample frame whose column labels contain spaces and capital letters.
data = {
    'First Name': ['Alice', 'Bob', 'Charlie'],
    'Last Name':  ['Smith', 'Jones', 'Brown'],
    'Age Group':  ['Young', 'Middle', 'Yound'],
}

df = pd.DataFrame(data=data)

In [6]:
# Display the frame; the column labels contain spaces and mixed case.
df

Unnamed: 0,First Name,Last Name,Age Group
0,Alice,Smith,Young
1,Bob,Jones,Middle
2,Charlie,Brown,Yound


In [7]:
# Convert the column labels to snake_case: spaces become underscores and all
# letters are lower-cased.
df.columns = [label.replace(' ', '_').lower() for label in df.columns]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   first_name  3 non-null      object
 1   last_name   3 non-null      object
 2   age_group   3 non-null      object
dtypes: object(3)
memory usage: 204.0+ bytes
