In [1]:
# Activity 1: Handling Missing Data

# Task A: Dropping vs Imputation

# 1. Dropping Missing Data:
# - Load a dataset (e.g., a CSV file with some missing values like employees.csv ).
# - Inspect the dataset for missing values using a Python library (e.g., Pandas).
# - Drop rows with missing data and save the result.






# 2. Imputation using Mean:
# - Use the same dataset.
# - Fill missing numerical values with the column mean.
# - Save and display the modified data.









# 3. Imputation using Median and Mode:
# - For numerical columns, replace missing values with the median.
# - For categorical columns, use the mode.
# - Display the updated dataset.







In [None]:
import pandas as pd

# Load the dataset
try:
    df = pd.read_csv('employees.csv')
except FileNotFoundError:
    print("Error: 'employees.csv' not found. Please make sure the file is in the correct directory.")
    exit()

print("--- Original Dataset ---")
print(df)
print("\n--- Missing Values Information ---")
print(df.isnull().sum())

# 1. Dropping Missing Data
df_dropped = df.dropna()
print("\n--- Dataset After Dropping Rows with Missing Data ---")
print(df_dropped)
df_dropped.to_csv('employees_dropped.csv', index=False)
print("\nDataset with dropped rows saved to 'employees_dropped.csv'")

# 2. Imputation using Mean
df_mean_imputed = df.copy()
for col in df_mean_imputed.select_dtypes(include=['number']).columns:
    df_mean_imputed[col].fillna(df_mean_imputed[col].mean(), inplace=True)

print("\n--- Dataset After Imputation with Mean ---")
print(df_mean_imputed)
df_mean_imputed.to_csv('employees_mean_imputed.csv', index=False)
print("\nDataset with mean imputation saved to 'employees_mean_imputed.csv'")

# 3. Imputation using Median and Mode
df_median_mode_imputed = df.copy()

# Impute numerical columns with the median
for col in df_median_mode_imputed.select_dtypes(include=['number']).columns:
    df_median_mode_imputed[col].fillna(df_median_mode_imputed[col].median(), inplace=True)

# Impute categorical columns with the mode
for col in df_median_mode_imputed.select_dtypes(include=['object']).columns:
    df_median_mode_imputed[col].fillna(df_median_mode_imputed[col].mode()[0], inplace=True)

print("\n--- Dataset After Imputation with Median and Mode ---")
print(df_median_mode_imputed)
df_median_mode_imputed.to_csv('employees_median_mode_imputed.csv', index=False)
print("\nDataset with median and mode imputation saved to 'employees_median_mode_imputed.csv'")

Error: 'employees.csv' not found. Please make sure the file is in the correct directory.
--- Original Dataset ---


NameError: name 'df' is not defined

: 

In [None]:
# Task B: Predictive Imputation

# 4. ML-based Imputation with Simple Imputer:
# - Use SimpleImputer from sklearn to fill missing values.
# - Choose a strategy (e.g., mean) and apply it to the dataset.





# 5. Imputation using a Regression Model:
# - Use a regression model to predict missing values.
# - Train the model on complete cases and fill the missing data.




# 6. K-Nearest Neighbors Imputation:
# - Use KNNImputer from sklearn .
# - Impute missing data based on neighbors' information.






In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Load the dataset
try:
    df = pd.read_csv('employees.csv')
except FileNotFoundError:
    print("Error: 'employees.csv' not found. Please make sure the file is in the correct directory.")
    exit()

print("\n--- Original Dataset (for Predictive Imputation) ---")
print(df)
print("\n--- Missing Values Information ---")
print(df.isnull().sum())

# Make a copy to work with for imputation tasks
df_imputation = df.copy()

# 4. ML-based Imputation with SimpleImputer
print("\n--- 4. Imputation using SimpleImputer (Mean Strategy) ---")
imputer_mean = SimpleImputer(strategy='mean', numeric_only=True)
df_imputation['Age_mean_imputed'] = imputer_mean.fit_transform(df_imputation[['Age']])
df_imputation['Salary_mean_imputed'] = imputer_mean.fit_transform(df_imputation[['Salary']])
print(df_imputation[['Name', 'Age', 'Age_mean_imputed', 'Salary', 'Salary_mean_imputed']])

df_simple_imputer = df.copy()
df_simple_imputer[['Age', 'Salary']] = imputer_mean.fit_transform(df_simple_imputer[['Age', 'Salary']])
print("\n--- Dataset with SimpleImputer (Mean) ---")
print(df_simple_imputer)
df_simple_imputer.to_csv('employees_simple_mean_imputed.csv', index=False)
print("\nDataset with SimpleImputer (mean) saved to 'employees_simple_mean_imputed.csv'")

# 5. Imputation using a Regression Model
print("\n--- 5. Imputation using a Regression Model (Age) ---")
df_regression_imputed = df.copy()

# Prepare data for Age imputation (using Salary as a predictor)
known_age = df_regression_imputed.dropna(subset=['Age', 'Salary'])
unknown_age = df_regression_imputed[df_regression_imputed['Age'].isnull() & df_regression_imputed['Salary'].notnull()]

if not known_age.empty and not unknown_age.empty:
    X_train = known_age[['Salary']]
    y_train = known_age['Age']
    model_age = LinearRegression()
    model_age.fit(X_train, y_train)
    predicted_ages = model_age.predict(unknown_age[['Salary']])
    df_regression_imputed.loc[df_regression_imputed['Age'].isnull() & df_regression_imputed['Salary'].notnull(), 'Age'] = predicted_ages.round() # Round to whole numbers

print("\nDataset with Regression Imputation (Age based on Salary):")
print(df_regression_imputed[['Name', 'Age', 'Salary']])
df_regression_imputed.to_csv('employees_regression_imputed.csv', index=False)
print("\nDataset with regression imputation (Age) saved to 'employees_regression_imputed.csv'")

# 6. K-Nearest Neighbors Imputation
print("\n--- 6. K-Nearest Neighbors Imputation ---")
df_knn_imputed = df.copy()
imputer_knn = KNNImputer(n_neighbors=2) # Choose the number of neighbors
df_knn_imputed[['Age', 'Salary']] = imputer_knn.fit_transform(df_knn_imputed[['Age', 'Salary']])

print("\nDataset with KNN Imputation:")
print(df_knn_imputed)
df_knn_imputed.to_csv('employees_knn_imputed.csv', index=False)
print("\nDataset with KNN imputation saved to 'employees_knn_imputed.csv'")