In [1]:
# Part 1: Handle Missing Values & Duplicates

# Step-by-Step Guidelines:
# 1. Load the Data: First, ensure you have pandas installed and import it.
# 2. Handling Missing Values
#     1. Identify Missing Values:
#     2. Fill Missing Values:
# 3. Handling Duplicates
#     1. Identify Duplicates:
#     2. Remove Duplicates:
# 4. Combined Practice on a New Dataset
#     1. New Sample Data:
#     2. Handling Missing Values:
#     3. Remove Duplicates:
        
        
        
        
        
        
        
        
        

In [2]:
import pandas as pd
import numpy as np

# Step 1: Load the Data
# For this exercise the sample DataFrame is built in memory. It deliberately
# contains missing values (NaN in 'Name' and 'Age') and a fully duplicated row
# (index 6 repeats index 1).
data = {'ID': [1, 2, 3, 4, 5, 6, 2, 7],
        'Name': ['Alice', 'Bob', 'Charlie', np.nan, 'Eve', 'Bob', 'Bob', 'Frank'],
        'Age': [25, 30, np.nan, 22, 35, 30, 30, np.nan],
        'City': ['New York', 'London', 'Paris', 'London', 'Tokyo', 'London', 'London', 'Berlin']}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# 2. Handling Missing Values
print("\n2. Handling Missing Values")

# 2.1. Identify Missing Values:
print("\n   1. Identify Missing Values:")
print(df.isnull())  # Boolean mask indicating missing values
print("\nNumber of missing values per column:")
print(df.isnull().sum())
print("\nTotal number of missing values:")
print(df.isnull().sum().sum())

# 2.2. Fill Missing Values:
print("\n   2. Fill Missing Values:")

# Example 1: Fill all NaN values with a specific value (e.g., 'Unknown').
# Note: this also writes the string 'Unknown' into the numeric 'Age' column,
# so it is shown for illustration only; per-column strategies follow below.
df_filled_value = df.fillna('Unknown')
print("\nDataFrame filled with 'Unknown':")
print(df_filled_value)

# Work on a copy so the original `df` stays untouched for the duplicate
# handling section further down.
df_filled = df.copy()

# The fills below assign the result back to the column instead of calling
# `df_filled[col].fillna(..., inplace=True)`. The in-place form operates on a
# column selection (chained assignment): it is deprecated in pandas 2.x and
# does not update the parent DataFrame once Copy-on-Write is enabled.

# Example 2: Fill NaN in 'Name' with 'Anonymous'
df_filled['Name'] = df_filled['Name'].fillna('Anonymous')
print("\nDataFrame with 'Name' NaN filled with 'Anonymous':")
print(df_filled)

# Example 3: Fill NaN in 'Age' with the mean age (mean() skips NaN by default)
mean_age = df_filled['Age'].mean()
df_filled['Age'] = df_filled['Age'].fillna(mean_age)
print("\nDataFrame with 'Age' NaN filled with the mean age:")
print(df_filled)

# Example 4: Fill NaN in 'City' with the most frequent city (mode).
# mode() returns a Series (there can be ties), so take the first element.
# The sample data has no missing 'City', so this step leaves the frame unchanged.
mode_city = df_filled['City'].mode()[0]
df_filled['City'] = df_filled['City'].fillna(mode_city)
print("\nDataFrame with 'City' NaN filled with the mode:")
print(df_filled)

# 3. Handling Duplicates
print("\n3. Handling Duplicates")

# Duplicates are examined on the original (unfilled) DataFrame
print("\nOriginal DataFrame (for duplicate handling):")
print(df)

# 3.1. Identify Duplicates:
print("\n   1. Identify Duplicates:")
print("\nAre there any duplicate rows?")
print(df.duplicated())  # Boolean Series indicating duplicate rows (True for all but the first occurrence)

print("\nRows that are duplicates:")
print(df[df.duplicated()])

print("\nDuplicates based on specific columns ('Name' and 'Age'):")
print(df.duplicated(subset=['Name', 'Age']))
print("\nRows that are duplicates based on 'Name' and 'Age':")
print(df[df.duplicated(subset=['Name', 'Age'])])

# 3.2. Remove Duplicates:
print("\n   2. Remove Duplicates:")

# Example 1: Remove all duplicate rows, keeping the first occurrence
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame with duplicate rows removed (keeping first):")
print(df_no_duplicates)

# Example 2: Remove duplicates based on specific columns, keeping the first occurrence
df_no_duplicates_subset = df.drop_duplicates(subset=['Name', 'Age'])
print("\nDataFrame with duplicates removed based on 'Name' and 'Age' (keeping first):")
print(df_no_duplicates_subset)

# Example 3: Remove duplicates based on specific columns, keeping the last occurrence
df_no_duplicates_last = df.drop_duplicates(subset=['Name', 'Age'], keep='last')
print("\nDataFrame with duplicates removed based on 'Name' and 'Age' (keeping last):")
print(df_no_duplicates_last)

# 4. Combined Practice on a New Dataset
print("\n4. Combined Practice on a New Dataset")

# 4.1. New Sample Data:
# Product catalogue with one missing 'Product', one missing 'Price' and one
# missing 'Color', plus repeated Product/Category combinations.
print("\n   1. New Sample Data:")
new_data = {'Product': ['Laptop', 'Tablet', 'Laptop', np.nan, 'Smartphone', 'Tablet', 'Charger', 'Laptop'],
            'Category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Accessories', 'Electronics'],
            'Price': [1200.0, 300.0, 1200.0, np.nan, 800.0, 300.0, 25.0, 1250.0],
            'Color': ['Silver', 'Black', 'Silver', 'Gray', 'Black', 'White', np.nan, 'Silver']}
new_df = pd.DataFrame(new_data)
print(new_df)

# 4.2. Handling Missing Values:
print("\n   2. Handling Missing Values:")

# The fills assign the result back to the column rather than using
# `new_df[col].fillna(..., inplace=True)`: the in-place call on a column
# selection is deprecated in pandas 2.x and does not update `new_df` once
# Copy-on-Write is enabled.

# Fill missing 'Price' with the median price (median() skips NaN by default)
median_price = new_df['Price'].median()
new_df['Price'] = new_df['Price'].fillna(median_price)
print("\nDataFrame with 'Price' NaN filled with median:")
print(new_df)

# Fill missing 'Color' with the most frequent color.
# mode() returns a Series (there can be ties), so take the first element.
mode_color = new_df['Color'].mode()[0]
new_df['Color'] = new_df['Color'].fillna(mode_color)
print("\nDataFrame with 'Color' NaN filled with mode:")
print(new_df)

# Note: the NaN in 'Product' is intentionally left as-is here; only 'Price'
# and 'Color' are imputed in this exercise.

# 4.3. Remove Duplicates:
print("\n   3. Remove Duplicates:")

# Identify duplicates based on 'Product' and 'Category'
print("\nDuplicates based on 'Product' and 'Category':")
print(new_df[new_df.duplicated(subset=['Product', 'Category'])])

# Remove duplicates based on 'Product' and 'Category', keeping the first occurrence
new_df_cleaned = new_df.drop_duplicates(subset=['Product', 'Category'])
print("\nDataFrame with duplicates removed based on 'Product' and 'Category':")
print(new_df_cleaned)

Original DataFrame:
   ID     Name   Age      City
0   1    Alice  25.0  New York
1   2      Bob  30.0    London
2   3  Charlie   NaN     Paris
3   4      NaN  22.0    London
4   5      Eve  35.0     Tokyo
5   6      Bob  30.0    London
6   2      Bob  30.0    London
7   7    Frank   NaN    Berlin

2. Handling Missing Values

   1. Identify Missing Values:
      ID   Name    Age   City
0  False  False  False  False
1  False  False  False  False
2  False  False   True  False
3  False   True  False  False
4  False  False  False  False
5  False  False  False  False
6  False  False  False  False
7  False  False   True  False

Number of missing values per column:
ID      0
Name    1
Age     2
City    0
dtype: int64

Total number of missing values:
3

   2. Fill Missing Values:

DataFrame filled with 'Unknown':
   ID     Name      Age      City
0   1    Alice     25.0  New York
1   2      Bob     30.0    London
2   3  Charlie  Unknown     Paris
3   4  Unknown     22.0    London
4   5      Ev

In [3]:
# Part 2: Apply Standardization & Formatting Rules

# Step-by-Step Guidelines:
# 1. Standardize Text Data
#     1. Convert All Names to Lowercase:
# 2. Format Numerical Data
#     1. Round Age Column to the Nearest Integer:
# 3. Combined Practice on Another Dataset
#     1. New Sample Data:
#     2. Standardize Product Names:
#     3. Format Prices to Two Decimal Places:
        
        
        
        
        

In [4]:
import pandas as pd
import numpy as np

# Starting point: a cleaned-up version of the Part 1 sample (no missing values,
# no duplicate rows), with inconsistent casing in 'Name' and fractional ages.
data = {
    'ID': [1, 2, 3, 4, 5, 7],
    'Name': ['Alice', 'bob', 'Charlie', 'ANONYMOUS', 'Eve', 'Frank'],
    'Age': [25.0, 30.0, 28.5, 22.0, 35.0, 32.7],
    'City': ['New York', 'London', 'Paris', 'London', 'Tokyo', 'Berlin'],
}
df = pd.DataFrame(data)

print("Original DataFrame (from previous step):")
print(df)

# --- 1. Standardize Text Data ---
print("\n1. Standardize Text Data")

# 1.1. Lowercase every name; the result goes into a new 'Name_Lower' column so
# the original 'Name' values stay available for comparison.
print("\n   1. Convert All Names to Lowercase:")
lowercase_names = df['Name'].str.lower()
df = df.assign(Name_Lower=lowercase_names)
print(df)

# To overwrite 'Name' instead of adding a column:
# df['Name'] = df['Name'].str.lower()
# print("\n'Name' column converted to lowercase:")
# print(df)

# --- 2. Format Numerical Data ---
print("\n2. Format Numerical Data")

# 2.1. Round ages and cast to int, stored in a new 'Age_Rounded' column.
print("\n   1. Round Age Column to the Nearest Integer:")
rounded_ages = df['Age'].round().astype(int)
df = df.assign(Age_Rounded=rounded_ages)
print(df)

# To overwrite 'Age' instead of adding a column:
# df['Age'] = df['Age'].round().astype(int)
# print("\n'Age' column rounded to the nearest integer:")
# print(df)

# --- 3. Combined Practice on Another Dataset ---
print("\n3. Combined Practice on Another Dataset")

# 3.1. A small product table with mixed-case names and float prices.
print("\n   1. New Sample Data:")
new_data = {
    'ProductID': [101, 102, 103, 104, 105],
    'ProductName': ['Laptop Pro X', 'Wireless Mouse', 'Smart keyboard', 'External HDD 1TB', 'USB Flash Drive 64GB'],
    'Price': [1299.99, 25.50, 79.00, 59.95, 12.75],
}
new_df = pd.DataFrame(new_data)
print(new_df)

# 3.2. Standardize product names: lowercase, then spaces -> underscores.
print("\n   2. Standardize Product Names:")
lowered_names = new_df['ProductName'].str.lower()
new_df = new_df.assign(ProductName_Standardized=lowered_names.str.replace(' ', '_'))
print(new_df)

# Further cleaning (dropping special characters, trimming whitespace) is possible:
# new_df['ProductName_Cleaned'] = new_df['ProductName'].str.lower().str.replace('[^a-z0-9_]', '', regex=True).str.strip()
# print("\nProduct Names with special characters removed and stripped:")
# print(new_df)

# 3.3. Render each price as text with exactly two decimals.
print("\n   3. Format Prices to Two Decimal Places:")
new_df = new_df.assign(Price_Formatted=new_df['Price'].map(lambda price: f"{price:.2f}"))
print(new_df)

# Caveat: 'Price_Formatted' holds strings, not numbers. Keep 'Price' as float
# for any later arithmetic and treat the formatted column as display-only.

# Alternative that leaves the data numeric and only changes how floats print:
# pd.options.display.float_format = '{:.2f}'.format
# print("\nDataFrame with Price formatted for display (float):")
# print(new_df)
# # Reset the display format if needed
# pd.options.display.float_format = None

Original DataFrame (from previous step):
   ID       Name   Age      City
0   1      Alice  25.0  New York
1   2        bob  30.0    London
2   3    Charlie  28.5     Paris
3   4  ANONYMOUS  22.0    London
4   5        Eve  35.0     Tokyo
5   7      Frank  32.7    Berlin

1. Standardize Text Data

   1. Convert All Names to Lowercase:
   ID       Name   Age      City Name_Lower
0   1      Alice  25.0  New York      alice
1   2        bob  30.0    London        bob
2   3    Charlie  28.5     Paris    charlie
3   4  ANONYMOUS  22.0    London  anonymous
4   5        Eve  35.0     Tokyo        eve
5   7      Frank  32.7    Berlin      frank

2. Format Numerical Data

   1. Round Age Column to the Nearest Integer:
   ID       Name   Age      City Name_Lower  Age_Rounded
0   1      Alice  25.0  New York      alice           25
1   2        bob  30.0    London        bob           30
2   3    Charlie  28.5     Paris    charlie           28
3   4  ANONYMOUS  22.0    London  anonymous          