# In [None]:
import pandas as pd
import numpy as np
import time
import os
import random

def _announce_step(message, max_seconds, pause):
    """Print a progress message and, when *pause* is true, sleep 1..max_seconds seconds."""
    if not pause:
        print(message)
        return
    sec = random.randint(1, max_seconds)
    print(f"Please wait for {sec} seconds! {message}")
    time.sleep(sec)


def _load_dataset(data_path):
    """Read a CSV or Excel file into a DataFrame; print the reason and return None on failure."""
    lowered = data_path.lower()  # accept '.CSV', '.Xlsx', ...
    try:
        if lowered.endswith('.csv'):
            print('Dataset is CSV!')
            return pd.read_csv(data_path, encoding_errors='ignore')
        if lowered.endswith(('.xlsx', '.xls')):
            print('Dataset is Excel file!')
            return pd.read_excel(data_path)
    except Exception as e:  # boundary: report any reader failure to the user
        print(f"Error reading file: {e}")
        return None
    print("Unknown file type!")
    return None


def _handle_missing_values(df):
    """Drop rows lacking a non-numeric value, then mean-fill the numeric columns."""
    numeric_cols = []
    other_cols = []
    for col, dtype in df.dtypes.items():
        # Booleans count as "numeric" for pandas, but a mean makes no sense
        # for them, so they are treated like the other non-numeric columns.
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            numeric_cols.append(col)
        else:
            other_cols.append(col)

    # Drop first so the means below describe the rows that are actually kept,
    # independent of the order in which the columns appear.
    if other_cols:
        df = df.dropna(subset=other_cols)

    fill_values = {}
    for col in numeric_cols:
        if df[col].isna().any():
            mean = df[col].mean()
            if pd.notna(mean):  # an all-missing column has no mean to fill with
                fill_values[col] = mean
    if fill_values:
        df = df.fillna(value=fill_values)
    return df


def data_cleaning_master(data_path, data_name, *, pause=True):
    """Load a dataset, remove duplicates, handle missing values and export it.

    Steps performed:
      1. Read ``data_path`` (``.csv``, ``.xlsx`` or ``.xls``; case-insensitive).
      2. Write duplicated rows to ``{data_name}_duplicates.csv`` (only when
         at least one duplicate exists) and drop them.
      3. Drop rows with a missing value in any non-numeric column, then fill
         missing values in numeric columns with the column mean.
      4. Write the result to ``{data_name}_Clean_data.csv``.

    Args:
        data_path: Path of the dataset file to clean.
        data_name: Prefix used for the two output CSV files.
        pause: When true (the default) each step announces and sleeps for a
            short random number of seconds; pass ``False`` to run without
            delays.

    Returns:
        A tuple ``(duplicate_records, cleaned_data)`` of DataFrames.
        ``duplicate_records`` is an empty DataFrame when the dataset holds no
        duplicates. ``(None, None)`` is returned when the path does not
        exist, the file type is unknown, or the file cannot be read.
    """
    print("Thank you for providing the details!")

    _announce_step("Checking file path", 4, pause)
    if not os.path.exists(data_path):
        print("Incorrect path! Try again with the correct path.")
        return None, None

    data = _load_dataset(data_path)
    if data is None:
        return None, None

    _announce_step("Checking total columns and rows", 4, pause)
    print(f"Dataset contains total rows: {data.shape[0]} \nTotal Columns: {data.shape[1]}")

    _announce_step("Checking for duplicates", 4, pause)
    duplicates = data.duplicated()
    total_duplicate = duplicates.sum()
    print(f"Dataset contains total duplicate records: {total_duplicate}")

    _announce_step("Saving total duplicates rows", 4, pause)
    # Always bind the result (empty when there are no duplicates) so the
    # return statement below never refers to an undefined name.
    duplicate_records = data[duplicates]
    if total_duplicate > 0:
        duplicate_records.to_csv(f'{data_name}_duplicates.csv', index=False)

    df = data.drop_duplicates()

    _announce_step("Checking for missing values", 10, pause)
    missing_value_by_column = df.isnull().sum()
    total_missing_value = missing_value_by_column.sum()
    print(f"Dataset has total missing values: {total_missing_value}")
    print(f"Missing values by columns: \n{missing_value_by_column}")

    _announce_step("Cleaning dataset", 6, pause)
    df = _handle_missing_values(df)

    _announce_step("Exporting cleaned dataset", 5, pause)
    print(f"Congrats! Dataset is cleaned! \nNumber of Rows: {df.shape[0]} Number of Columns: {df.shape[1]}")

    df.to_csv(f'{data_name}_Clean_data.csv', index=False)
    print("Dataset is saved!")

    return duplicate_records, df

if __name__ == "__main__":
    print("Welcome to Data Cleaning Master!")

    # Ask for the input file location and the prefix used for the output files.
    data_path = input("Please enter the dataset path: ")
    data_name = input("Please enter the dataset name: ")

    duplicates, clean_data = data_cleaning_master(data_path, data_name)

    # A None in either slot of the returned pair is reported as a failure.
    if duplicates is None or clean_data is None:
        print("Data cleaning failed due to an error.")
    else:
        print("Data cleaning completed successfully!")


# Output: Welcome to Data Cleaning Master!
