# Data Preprocessing | Data Cleaning


## Handling duplicates

In [1]:
import pandas as pd
# Sample records in which the rows for Alice (ID 1) and Bob (ID 2) appear twice.
data = dict(
    ID=[1, 2, 3, 1, 4, 2, 5],
    Name=["Alice", "Bob", "Charlie", "Alice", "David", "Bob", "Eve"],
    Age=[25, 30, 35, 25, 40, 30, 28],
)

df = pd.DataFrame(data)

In [2]:
# Summarize the frame: number of rows, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      7 non-null      int64 
 1   Name    7 non-null      object
 2   Age     7 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 300.0+ bytes


In [3]:
# Rows that repeat an earlier row in every column.
duplicate_mask = df.duplicated()
df_duplicates = df.loc[duplicate_mask]

df_duplicates

Unnamed: 0,ID,Name,Age
3,1,Alice,25
5,2,Bob,30


In [4]:
# Keep only the first occurrence of each fully duplicated row.
# `df` is rebound to the returned frame instead of using inplace=True, so the
# frame object created earlier is not mutated behind the reader's back.
df = df.drop_duplicates()

In [5]:
# Show up to the first 10 rows of `df`.
df.head(10)

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,35
4,4,David,40
6,5,Eve,28


## Drop unnecessary columns

In [1]:
import pandas as pd
# Small sample frame built from (ID, Name, Salary) records.
rows = [(1, "Alice", 50000), (2, "Bob", 60000), (3, "Charlie", 70000)]
ids, names, salaries = (list(column) for column in zip(*rows))
data = {"ID": ids, "Name": names, "Salary": salaries}

df = pd.DataFrame(data)

In [2]:
# Drop the unnecessary "ID" column.
# The result is stored in a new frame rather than dropped in place, so `df`
# keeps its original columns and re-running this cell does not raise KeyError
# for an already-removed "ID".
df_without_id = df.drop(columns="ID")
df_without_id.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Salary  3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes


## Handle missing values

In [6]:
import pandas as pd

# Sample dataset in which Age, Salary and City each contain one missing entry.
records = [
    ('Alice', 25, 50000, 'New York'),
    ('Bob', 30, None, 'San Francisco'),
    ('Charlie', None, 60000, 'Los Angeles'),
    ('David', 22, 45000, None),
    ('Eva', 28, 55000, 'Chicago'),
]
names, ages, salaries, cities = (list(column) for column in zip(*records))
data = {'Name': names, 'Age': ages, 'Salary': salaries, 'City': cities}

df = pd.DataFrame(data)

print("Original Dataset:")
df.info()

Original Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     4 non-null      float64
 2   Salary  4 non-null      float64
 3   City    4 non-null      object 
dtypes: float64(2), object(2)
memory usage: 292.0+ bytes


In [7]:
# Preview the first rows (head() shows five by default).
df.head()

Unnamed: 0,Name,Age,Salary,City
0,Alice,25.0,50000.0,New York
1,Bob,30.0,,San Francisco
2,Charlie,,60000.0,Los Angeles
3,David,22.0,45000.0,
4,Eva,28.0,55000.0,Chicago


In [10]:
# 1---> Drop rows with NaN
# A row is removed as soon as any of its columns is missing; `df` itself is
# left unchanged because dropna() returns a new frame.
after_dropped_rows = df.dropna(axis=0, how="any")

In [11]:
# Display the frame returned by dropna().
after_dropped_rows

Unnamed: 0,Name,Age,Salary,City
0,Alice,25.0,50000.0,New York
4,Eva,28.0,55000.0,Chicago


In [12]:
# 2---> Replace missing values
# Impute Age with the column mean and Salary with the column median in one
# call. DataFrame.fillna with a column -> value mapping returns a new frame, so
# `df` is untouched and no chained `df_filled["col"].fillna(..., inplace=True)`
# is needed (that pattern warns in pandas 2.x and does not update the frame
# under Copy-on-Write). City is not in the mapping, so its missing value stays.
fill_values = {
    "Age": df["Age"].mean(),
    "Salary": df["Salary"].median(),
}
df_filled = df.fillna(value=fill_values)

In [13]:
# Display the frame after Age and Salary have been imputed.
df_filled

Unnamed: 0,Name,Age,Salary,City
0,Alice,25.0,50000.0,New York
1,Bob,30.0,52500.0,San Francisco
2,Charlie,26.25,60000.0,Los Angeles
3,David,22.0,45000.0,
4,Eva,28.0,55000.0,Chicago


## Data Type Conversion

In [14]:
import pandas as pd

# Sample dataset whose 'Age' values are strings (including the text 'None'),
# so the column is stored as object although it should be numeric.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=['25', '30', 'None', '22', '28'],  # Age is represented as strings
    Salary=[50000, 60000, 70000, 45000, 55000],
)

df = pd.DataFrame(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      object
 2   Salary  5 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 252.0+ bytes


In [15]:
# Convert the "Age" column to a numeric dtype.
# errors="coerce" turns entries that cannot be parsed as numbers into NaN
# instead of raising.
age_as_number = pd.to_numeric(df["Age"], errors="coerce")
df["Age"] = age_as_number
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     4 non-null      float64
 2   Salary  5 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 252.0+ bytes


In [17]:
# Cast "Age" back to strings. Note that the missing value becomes the text
# 'nan', so the column reports no nulls after this cast.
df = df.astype({"Age": str})

In [18]:
# Re-check the column dtypes and non-null counts after the cast to str.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      object
 2   Salary  5 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 252.0+ bytes


## Check for Inconsistencies in Naming Conventions

In [19]:
import pandas as pd

# Sample dataset whose 'Name' column mixes capitalized and lowercase entries.
data = dict(Name=['Alice', 'bob', 'Charlie', 'david', 'Eva'])
df = pd.DataFrame(data)

# Show the label and the untouched frame on consecutive lines.
print("Original Dataset:", df, sep="\n")

Original Dataset:
      Name
0    Alice
1      bob
2  Charlie
3    david
4      Eva


In [20]:
# Normalize capitalization with the vectorized string accessor. Unlike
# apply(lambda x: x.title()), .str.title() passes missing values through
# instead of raising AttributeError on a non-string entry such as NaN.
df["Name"] = df["Name"].str.title()

In [21]:
# Display the frame to inspect the 'Name' column.
df

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie
3,David
4,Eva


In [22]:
import pandas as pd

# Sample dataset whose column labels contain spaces.
labels = ('First Name', 'Last Name', 'Age Group')
people = [
    ('Alice', 'Smith', 'Young'),
    ('Bob', 'Jones', 'Middle'),
    ('Charlie', 'Brown', 'Young'),
]
data = {
    label: [person[position] for person in people]
    for position, label in enumerate(labels)
}

df = pd.DataFrame(data)

# Display the original dataset
print("Original Dataset:")
print(df)

Original Dataset:
  First Name Last Name Age Group
0      Alice     Smith     Young
1        Bob     Jones    Middle
2    Charlie     Brown     Young


In [25]:
# Normalize the column labels: spaces become underscores, then everything is
# lowercased (e.g. 'First Name' -> 'first_name').
df.columns = [label.replace(" ", "_").lower() for label in df.columns]

In [26]:
# Inspect the column labels and dtypes after renaming.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   first_name  3 non-null      object
 1   last_name   3 non-null      object
 2   age_group   3 non-null      object
dtypes: object(3)
memory usage: 204.0+ bytes
