In [1]:
import pandas as pd

In [2]:
# Load the raw dataset into the DataFrame that the cleaning steps below work on.
df = pd.read_csv(filepath_or_buffer='/content/Synthetic_Dataset.csv')

In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import stats
import datetime

# 1. Remove Duplicate Rows
# Count the duplicates BEFORE dropping them; counting afterwards always reports 0.
n_duplicates = int(df.duplicated().sum())
df = df.drop_duplicates()
print(f"Duplicates removed: {n_duplicates}")

# 2. Convert Data Types (e.g., DateTime)
# Done before imputation so date columns are recognised as datetimes instead of
# being imputed (and later label-encoded) as categorical text. Numeric columns
# are skipped: a name such as "time_spent" does not make a number a timestamp.
for col in df.columns:
    looks_like_date = 'date' in col.lower() or 'time' in col.lower()
    if looks_like_date and not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = pd.to_datetime(df[col], errors='coerce')

# 3. Handling Inconsistent Data (Trim spaces)
# Done before imputation so the most frequent value is computed on trimmed text.
# Only real strings are stripped; Series.str.strip() would turn every
# non-string value of a mixed-type object column into NaN.
for col in df.select_dtypes(include=[object]).columns:
    df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

# 4. Handling Missing Values
# Numerical: Fill with median, Categorical: Fill with mode
date_cols = [col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col])]
num_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in date_cols
]
# Remaining object columns are the categorical ones (date columns were parsed above).
cat_cols = list(df.select_dtypes(include=[object]).columns)

# Columns without a single observed value cannot be imputed (SimpleImputer
# discards them, which would break the assignment back into df), and an empty
# column list makes the imputer raise, so only fillable columns are passed in.
num_imputer = SimpleImputer(strategy='median')
num_fillable = [col for col in num_cols if df[col].notna().any()]
if num_fillable:
    df[num_fillable] = num_imputer.fit_transform(df[num_fillable])

cat_fillable = [col for col in cat_cols if df[col].notna().any()]
if cat_fillable:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    df[cat_fillable] = cat_imputer.fit_transform(df[cat_fillable])

# Anything still counted here is an unparseable date (NaT) or a column that
# had no observed values at all.
print("Missing values after imputation:", df.isnull().sum().sum())

# 5. Outlier Detection & Handling (IQR method)
def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, both
    inclusive, where Q1 and Q3 are the 25th and 75th percentiles of ``column``.

    Args:
        df: DataFrame to filter; it is only read, never modified.
        column: Label of the column in ``df`` to test.
        factor: Multiplier applied to the IQR when building the fences.
            Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        The subset of ``df`` whose ``column`` value is within the fences.
        Rows where ``column`` is NaN fail the range test and are dropped too.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    spread = factor * (q3 - q1)
    # between() is inclusive on both ends and evaluates to False for NaN.
    inside = df[column].between(q1 - spread, q3 + spread)
    return df[inside]

# Apply the IQR filter one numeric column at a time; each pass only sees the
# rows that survived the previous columns.
# NOTE(review): a column whose IQR is 0 keeps only rows equal to its median
# value - confirm that is intended for flag-like numeric columns.
for col in num_cols:
    df = remove_outliers_iqr(df, col)
# Continue on an independent copy so the column assignments below do not trip
# pandas' SettingWithCopy checks on a boolean-filtered frame.
df = df.copy()

# 6. Encoding Categorical Variables
# One encoder per column, kept in label_encoders, so every column's label
# mapping stays available; a single reused encoder only remembers the last one.
encoder = LabelEncoder()
label_encoders = {}
for col in cat_cols:
    # Columns that are no longer object dtype (e.g. parsed to datetime
    # earlier) are not categorical text and are left untouched.
    if df[col].dtype != object:
        continue
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# 7. Feature Scaling (Standardization)
scaler = StandardScaler()
# StandardScaler raises when given zero rows or zero columns, which happens
# when the outlier filter removed everything or there are no numeric columns.
if len(num_cols) > 0 and len(df) > 0:
    df[list(num_cols)] = scaler.fit_transform(df[list(num_cols)])

# 8. Save Cleaned Data
df.to_csv('cleaned_data.csv', index=False)
print("Data cleaning complete. Cleaned file saved as 'cleaned_data.csv'.")

Duplicates removed: 0
Missing values after imputation: 0
Data cleaning complete. Cleaned file saved as 'cleaned_data.csv'.
