In [13]:
import pandas as pd
import numpy as np

# Small, deliberately messy sample: a repeated record (rows 0 and 4), a
# non-numeric age ('thirty'), two missing scores and one score above 100.
data = dict(
    Name=['Eve', 'Bob', 'Charlie', 'David', 'Eve'],
    Age=[24, 'thirty', 22, 35, 24],
    Score=[88.0, np.nan, np.nan, 105.0, 88.0],
    Date=['2023-01-05', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'],
)

# One column per key, one row per list position.
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

Original DataFrame:
      Name     Age  Score        Date
0      Eve      24   88.0  2023-01-05
1      Bob  thirty    NaN  2023-01-02
2  Charlie      22    NaN  2023-01-03
3    David      35  105.0  2023-01-04
4      Eve      24   88.0  2023-01-05


### Dirty Data: An Opportunity for Cleaning Up Bias in AI - [Read Article](https://news.ucsb.edu/2024/021521/dirty-data-opportunity-cleaning-bias-ai)
> “No one had studied the fairness aspect of imputation before, which was surprising because missing data is such a prevalent problem in the real world,” she said. “Nearly all of the research at the time centered around developing better training algorithms to eliminate bias, but not many people thought about addressing the bias that happened during data collection.”

In [14]:
# Data cleaning is important for AI projects to ensure integrity and accuracy.

# Count the missing (NaN/None) entries in each column.
# Present-but-invalid values (e.g. the string 'thirty' in 'Age') are not
# counted as missing here.
missing_per_column = df.isna().sum()
print(missing_per_column)

Name     0
Age      0
Score    2
Date     0
dtype: int64


In [15]:
# Keep only the rows that have a 'Score'; `df` itself is left untouched.
has_score = df['Score'].notna()
df_cleaned = df[has_score]
print("\nDataFrame after removing rows with missing 'Score':", df_cleaned, sep="\n")

# Dropping incomplete rows keeps missing values from distorting the analysis and
# means a model is only ever trained on complete records.


DataFrame after removing rows with missing 'Score':
    Name Age  Score        Date
0    Eve  24   88.0  2023-01-05
3  David  35  105.0  2023-01-04
4    Eve  24   88.0  2023-01-05


In [16]:
# Impute missing 'Score' values with the column mean.
# Series.mean() skips NaN by default, so only the observed scores contribute.
# NOTE(review): at this point the frame still contains the repeated 'Eve' row
# and the 105.0 score, so both influence the imputed value; consider
# de-duplicating and handling outliers before imputing.
mean_score = df['Score'].mean()
print("\nMean Score:", mean_score)

# Rebind rather than use `inplace=True`: the printed result is the same, but the
# frame object shown by earlier cells is no longer mutated behind their back.
df = df.fillna({'Score': mean_score})
print("\nDataFrame after replacing missing 'Score' with mean:")
print(df)


Mean Score: 93.66666666666667

DataFrame after replacing missing 'Score' with mean:
      Name     Age       Score        Date
0      Eve      24   88.000000  2023-01-05
1      Bob  thirty   93.666667  2023-01-02
2  Charlie      22   93.666667  2023-01-03
3    David      35  105.000000  2023-01-04
4      Eve      24   88.000000  2023-01-05


In [17]:
# Count rows that exactly repeat an earlier row (all columns compared).
duplicate_rows = df.duplicated().sum()
print(duplicate_rows)

# Keep the first row seen for each 'Name' and drop any later ones.
# NOTE(review): the count above compares whole rows, while this removal keys on
# 'Name' alone, so distinct records sharing a name would also be dropped —
# confirm that is intended.
df = df.drop_duplicates(subset=['Name'], keep='first')
print("\nDataFrame after removing duplicates:", df, sep="\n")

# Duplicates can occur due to data entry errors or merging datasets.

1

DataFrame after removing duplicates:
      Name     Age       Score        Date
0      Eve      24   88.000000  2023-01-05
1      Bob  thirty   93.666667  2023-01-02
2  Charlie      22   93.666667  2023-01-03
3    David      35  105.000000  2023-01-04


In [18]:
# Inspect the dtype pandas has assigned to each column.
column_dtypes = df.dtypes
print(column_dtypes)

Name      object
Age       object
Score    float64
Date      object
dtype: object


In [19]:
# Parse 'Age' as numbers; entries that cannot be parsed become NaN.
age_numeric = pd.to_numeric(df['Age'], errors='coerce')

# Fill the unparseable entries with the median of the ages that did parse.
median_age = age_numeric.median()
df['Age'] = age_numeric.fillna(median_age)

In [20]:
# Parse the 'Date' strings into pandas datetime values.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates
print("\nDataFrame after converting 'Date' to datetime:", df, sep="\n")

# Correct data types matter for AI model training: they ensure the data is
# interpreted correctly.


DataFrame after converting 'Date' to datetime:
      Name   Age       Score       Date
0      Eve  24.0   88.000000 2023-01-05
1      Bob  24.0   93.666667 2023-01-02
2  Charlie  22.0   93.666667 2023-01-03
3    David  35.0  105.000000 2023-01-04


In [21]:
# Handling outliers: select the rows whose 'Score' is above 100.
above_limit = df['Score'].gt(100)
outliers = df.loc[above_limit]
print("Outliers:", outliers, sep="\n")

Outliers:
    Name   Age  Score       Date
3  David  35.0  105.0 2023-01-04


In [None]:
# Clamp any score above 100 down to 100, then round to one decimal place.
df['Score'] = df['Score'].clip(upper=100).round(1)

print("\nDataFrame after capping 'Score' at 100:", df, sep="\n")

# Outliers can skew the results of machine learning models, so handling them is crucial.
# Capping, for instance, stops extreme values from disproportionately influencing the model.
# It is still important to analyze why outliers exist: they may be valid extreme cases.


DataFrame after capping 'Score' at 100:
      Name   Age  Score       Date
0      Eve  24.0   88.0 2023-01-05
1      Bob  24.0   93.7 2023-01-02
2  Charlie  22.0   93.7 2023-01-03
3    David  35.0  100.0 2023-01-04
