In [None]:
# Importance of Data Cleaning

# 1. Missing Values: Missing data points in a dataset can lead to biased results.
#     Task 1: Load a dataset and identify which columns have missing values.
#     Task 2: Replace missing values in a dataset with the column mean or mode.
#     Task 3: Compare model performance with and without handling missing values.
    





In [None]:
# 2. Duplicate Data: Repeated data points can skew analysis and model results.
#     Task 1: Identify and remove duplicate entries from a dataset using a programming language or tool.
#     Task 2: Document the before-and-after dataset shape to understand the impact of duplicates.
#     Task 3: Explain to a classmate how duplicate data can affect prediction accuracy.
    
    
    

In [None]:
# 3. Incorrect Data Types: Data stored in incorrect formats can lead to parsing errors or incorrect analysis.
#     Task 1: Convert a column of string numbers to integers in a dataset.
#     Task 2: Identify and correct columns with inconsistent data types in a dataset.
#     Task 3: Discuss why correct data types are critical for feature engineering.
    
    
    

In [None]:
# 4. Outliers & Inconsistencies: Irregularities in data can mislead statistical analysis and model predictions.
#     Task 1: Visualize a dataset and identify outliers using a boxplot.
#     Task 2: Remove or adjust outliers and re-analyze the dataset.
#     Task 3: Research and report on a technique for handling outliers effectively.
    
    
    

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# ------------------------------
# Load Sample Dataset
# ------------------------------
# Simulated dataset with missing values, duplicates, incorrect types, and outliers
data = {
    'Age': [25, 30, 22, np.nan, 28, 30, 22, 100, 35, '40', 28],
    'Salary': [50000, 60000, 52000, 58000, np.nan, 60000, 52000, 1000000, 62000, 58000, 58000],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Female', 'Female']
}
df = pd.DataFrame(data)

# Introduce a duplicate row
df = pd.concat([df, df.iloc[[1]]], ignore_index=True)

print("\nOriginal Dataset:")
print(df)

# -----------------------------------------------------
# 1. MISSING VALUES
# -----------------------------------------------------

print("\nMissing Values by Column:")
print(df.isnull().sum())

# Task 2: Replace missing values
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')  # Convert Age to numeric
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)

# Task 3: Compare model performance
df_model = df.copy()
df_model['Gender'] = df_model['Gender'].map({'Male': 0, 'Female': 1})
X = df_model[['Age', 'Gender']]
y = df_model['Salary']

# With missing values (before filling)
X_missing = X.copy()
X_missing.loc[3, 'Age'] = np.nan
X_missing.loc[4, 'Salary'] = np.nan

X1 = X_missing.dropna()
y1 = y.loc[X1.index]
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=1)
model1 = LinearRegression().fit(X_train1, y_train1)
pred1 = model1.predict(X_test1)
print("\nMSE without handling missing values:", mean_squared_error(y_test1, pred1))

# After handling missing values
X2 = X
y2 = y
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=1)
model2 = LinearRegression().fit(X_train2, y_train2)
pred2 = model2.predict(X_test2)
print("MSE after handling missing values:", mean_squared_error(y_test2, pred2))

# -----------------------------------------------------
# 2. DUPLICATE DATA
# -----------------------------------------------------

print("\nBefore removing duplicates:", df.shape)
df = df.drop_duplicates()
print("After removing duplicates:", df.shape)

# Task 3: Explain duplication effect
print("\nExplanation: Duplicate data biases model learning toward repeated samples, causing overfitting and distorted predictions.")

# -----------------------------------------------------
# 3. INCORRECT DATA TYPES
# -----------------------------------------------------

print("\nData Types Before Correction:")
print(df.dtypes)

# Task 1 & 2: Convert string numbers to int
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')  # Convert to float
df['Age'] = df['Age'].astype(int)

print("\nData Types After Correction:")
print(df.dtypes)

# Task 3: Explanation
print("\nCorrect data types ensure mathematical operations, encoding, and feature generation behave as expected.")

# -----------------------------------------------------
# 4. OUTLIERS & INCONSISTENCIES
# -----------------------------------------------------

# Task 1: Visualize with boxplot
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['Salary'])
plt.title("Boxplot of Salary")
plt.show()

# Task 2: Remove outliers using IQR
Q1 = df['Salary'].quantile(0.25)
Q3 = df['Salary'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df['Salary'] >= lower_bound) & (df['Salary'] <= upper_bound)]
print("\nAfter removing outliers:", df.shape)

# Task 3: Research Report
print("\nTechnique: IQR (Interquartile Range) method effectively handles outliers by trimming values outside 1.5x IQR range. Other methods include Z-Score, Isolation Forest, or Winsorization.")

# Final cleaned dataset
print("\nFinal Cleaned Dataset:")
print(df)



Original Dataset:
    Age     Salary  Gender
0    25    50000.0    Male
1    30    60000.0  Female
2    22    52000.0  Female
3   NaN    58000.0    Male
4    28        NaN    Male
5    30    60000.0  Female
6    22    52000.0  Female
7   100  1000000.0    Male
8    35    62000.0  Female
9    40    58000.0  Female
10   28    58000.0  Female
11   30    60000.0  Female

Missing Values by Column:
Age       1
Salary    1
Gender    0
dtype: int64


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.