###      GUVI HCL Project 3

# Missing Data Cleaner

In [3]:
import pandas as pd
import os
import sys
from io import StringIO

def load_dataset_from_path():
    """Prompt for a CSV path until a file loads or the user types 'back'.

    The entry is normalised before use: surrounding whitespace is removed and
    one pair of matching quotes is dropped, because the prompt's own example
    shows the path wrapped in quotes.

    Returns:
        A ``(df, message)`` pair: ``(DataFrame, None)`` once a file has been
        read, or ``(None, "Returning to main menu.")`` when the user types
        'back'.
    """
    while True:
        file_path = input("Enter the path to your CSV file (e.g., 'C:/Users/YourName/data.csv') or type 'back' to return to main menu: ").strip()
        # Drop one pair of matching surrounding quotes, e.g. 'data.csv' or "data.csv".
        if len(file_path) >= 2 and file_path[0] == file_path[-1] and file_path[0] in "'\"":
            file_path = file_path[1:-1].strip()
        if file_path.lower() == 'back':
            return None, "Returning to main menu."
        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            print("Error: File not found. Please check the file path and try again.")
        except Exception as e:
            # Any other read/parse failure is reported and the user is asked again.
            print(f"Error loading file: {e}")
        else:
            print()
            print(f"Dataset loaded successfully from {file_path}")
            return df, None

def load_dataset_from_directory():
    """Load the alphabetically first CSV file found in the current directory.

    A file named exactly ``cleaned_data.csv`` is never selected. The
    extension check is case-insensitive (``DATA.CSV`` qualifies) and entries
    that are not regular files are ignored.

    Returns:
        ``(DataFrame, None)`` on success, otherwise ``(None, error_message)``.
    """
    csv_files = [
        f for f in os.listdir('.')
        if f.lower().endswith('.csv') and f != 'cleaned_data.csv' and os.path.isfile(f)
    ]
    if not csv_files:
        # Built from adjacent literals so no source indentation leaks into the message.
        return None, (
            "Error: No CSV files found in the current directory.\n"
            "Place a CSV file like 'sample.csv' in the same folder as this script."
        )

    file_path = min(csv_files)
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        # The file can disappear between the directory listing and the read.
        return None, "Error: File not found."
    except Exception as e:
        return None, f"Error loading file: {e}"
    print()
    print(f"Dataset loaded successfully from {file_path}")
    return df, None

def load_dataset_from_terminal():
    """Read CSV text typed at the terminal and parse it into a DataFrame.

    Rows are collected until the first empty line is entered. The joined
    text is handed to ``pd.read_csv``, so the first row becomes the header.

    Returns:
        ``(DataFrame, None)`` on success, otherwise ``(None, error_message)``.
    """
    instructions = (
        "",
        "",
        "Enter dataset in CSV format (press Enter twice to finish).",
        "",
        "First row will be considered as column names,",
        "You can Enter any number of Column e.g., Name,Age,Salary,....    ",
        "",
        "Then Second and further row will be considered as data rows,",
        "Enter data row in same format as column e.g., 'Ravi,28, ...' or 'Meena,,45000...' or 'Kumar,30,50000...'",
    )
    for text in instructions:
        print(text)

    rows = []
    # The prompt number is always one more than the rows collected so far.
    entry = input(f"Enter row no. {len(rows) + 1}: ")
    while entry != "":
        rows.append(entry)
        entry = input(f"Enter row no. {len(rows) + 1}: ")

    if len(rows) == 0:
        return None, "Error: No data entered. Please provide at least column names and one data row."

    try:
        df = pd.read_csv(StringIO("\n".join(rows)))
        print("Dataset loaded successfully from terminal input.")
        return df, None
    except Exception as err:
        return None, f"Error parsing terminal input: {err}. Ensure correct CSV format."

def display_missing_values(df):
    """Print the per-column count of missing values in *df* and return it.

    Returns:
        A Series indexed by column name holding the number of missing
        entries in each column.
    """
    per_column_missing = df.isna().sum()
    header = "\nMissing values in each column:"
    print(header)
    print(per_column_missing.to_string())
    return per_column_missing

def fill_missing_values(df, method):
    """Fill missing values in the float64/int64 columns of *df*.

    Args:
        df: Source DataFrame. It is not modified; a copy is filled.
        method: One of 'mean', 'median' or 'mode'.

    Returns:
        Always a 3-tuple ``(df_filled, error, fill_values)``.
        On success ``error`` is None and ``fill_values`` maps each processed
        column to the value used for it. For an unknown method ``df_filled``
        is an unmodified copy, ``error`` is a message and ``fill_values`` is
        empty.
    """
    df_filled = df.copy()
    fill_values = {}

    # Validate once, before looking at any column, so the outcome does not
    # depend on whether numeric columns exist, and return the same arity as
    # the success path so callers can always unpack three values.
    if method not in ('mean', 'median', 'mode'):
        return df_filled, f"Invalid method: {method}. Choose 'mean', 'median', or 'mode'.", fill_values

    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for column in numeric_columns:
        series = df[column]
        if method == 'mean':
            fill_value = series.mean()
        elif method == 'median':
            fill_value = series.median()
        else:
            modes = series.mode()
            fill_value = modes.iloc[0] if not modes.empty else None
        # A column with no observed values has no usable statistic (mean and
        # median are NaN, mode is empty); leave it untouched and unreported.
        if fill_value is None or pd.isna(fill_value):
            continue
        df_filled[column] = df_filled[column].fillna(fill_value)
        fill_values[column] = fill_value
    return df_filled, None, fill_values

def save_cleaned_dataset(df):
    """Write *df* to ``cleaned_data.csv`` in the current directory.

    The index is not written. Success or failure is reported on stdout;
    nothing is returned and no exception is propagated.
    """
    target = "cleaned_data.csv"
    try:
        df.to_csv(target, index=False)
        confirmation = f"\nCleaned dataset saved to {target}"
        print(confirmation)
    except Exception as exc:
        print(f"Error saving file: {exc}")

def main():
    """Run the interactive menu loop of the Missing Data Cleaner.

    Each pass shows the menu, loads a dataset through the chosen loader,
    prints it with its missing-value counts, asks for a fill method, prints
    the cleaned result and the fill values, and saves the cleaned dataset.
    The loop ends only when the user chooses option 4.
    """
    # Menu choice -> loader. Every loader returns a (df, message) pair where
    # df is None when nothing was loaded.
    loaders = {
        '1': load_dataset_from_path,
        '2': load_dataset_from_directory,
        '3': load_dataset_from_terminal,
    }
    valid_methods = ('mean', 'median', 'mode')

    while True:
        print()
        print("Welcome to the Missing Data Cleaner!")
        print("This tool cleans missing values in a dataset using mean, median, or mode.")
        print("\nChoose how to provide your dataset:")
        print("1. Enter a CSV file path (e.g., 'sample.csv' or 'C:/Users/YourName/data.csv')")
        print("2. Use a CSV file from the current directory (e.g., 'sample.csv')")
        print("3. Enter data manually in the terminal")
        print("4. Exit")

        # Stray spaces around the answer should not make a valid choice invalid.
        choice = input("Enter choice (1, 2, 3, or 4): ").strip()

        if choice == '4':
            print("Exiting the program. Goodbye!")
            return

        loader = loaders.get(choice)
        if loader is None:
            print("Invalid choice! Please enter 1, 2, 3, or 4. Try again.")
            continue

        df, error = loader()
        if df is None:
            print(error)
            print("Please try again or choose 4 to exit.")
            continue

        # Displaying dataset and missing values
        print("\nOriginal dataset:")
        print(df.to_string())
        display_missing_values(df)

        # Keep asking until a valid method has been applied.
        while True:
            print("\nChoose a method to fill missing values (enter 'mean', 'median', or 'mode'):")
            print("Example: mean")
            method = input("Enter method: ").strip().lower()
            if method not in valid_methods:
                print("Invalid method! Please enter 'mean', 'median', or 'mode'. Try again.")
                continue

            # Unpack defensively: accept a (df, error) pair as well as a
            # (df, error, fill_values) triple so an error result cannot
            # crash the menu with an unpacking failure.
            outcome = fill_missing_values(df, method)
            df_cleaned, fill_error = outcome[0], outcome[1]
            fill_values = outcome[2] if len(outcome) > 2 else {}
            if fill_error:
                print(fill_error)
                print("Please try again or choose 4 to exit from the main menu.")
                continue

            # Displaying cleaned dataset and fill values
            print("\nCleaned dataset:")
            print(df_cleaned.to_string())
            print("\nFill values used:")
            for column, value in fill_values.items():
                print(f"{column}: {value}")

            # Saving cleaned dataset
            save_cleaned_dataset(df_cleaned)
            print("=============================================================================")
            print("\nReturning to main menu.")
            break

# Start the interactive cleaner only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Welcome to the Missing Data Cleaner!
This tool cleans missing values in a dataset using mean, median, or mode.

Choose how to provide your dataset:
1. Enter a CSV file path (e.g., 'sample.csv' or 'C:/Users/YourName/data.csv')
2. Use a CSV file from the current directory (e.g., 'sample.csv')
3. Enter data manually in the terminal
4. Exit
Enter choice (1, 2, 3, or 4): 2

Dataset loaded successfully from MissingData.csv

Original dataset:
      Name   Age   Salary  Height  Weight
0    Aryan  21.0  67000.0     NaN    70.0
1    Ayush  24.0  80000.0     5.5    75.0
2  Ratnesh  25.0      NaN     5.9    76.0
3    Rohit   NaN  90000.0     NaN    64.0
4      Ram   NaN      NaN     6.4    77.0
5   Satyam  27.0  99000.0     5.3     NaN
6   Aditya  20.0  65000.0     5.2    80.0
7   Raunak  21.0      NaN     6.0    55.0

Missing values in each column:
Name      0
Age       2
Salary    3
Height    2
Weight    1

Choose a method to fill missing values (enter 'mean', 'median', or 'mode'):
Example: mea

In [2]:
import pandas as pd
import os
import sys
from io import StringIO

def load_dataset_from_path():
    """Prompt for a CSV path until a file loads or the user types 'back'.

    The entry is normalised before use: surrounding whitespace is removed and
    one pair of matching quotes is dropped, because the prompt's own example
    shows the path wrapped in quotes.

    Returns:
        A ``(df, message)`` pair: ``(DataFrame, None)`` once a file has been
        read, or ``(None, "Returning to main menu.")`` when the user types
        'back'.
    """
    while True:
        file_path = input("Enter the path to your CSV file (e.g., 'C:/Users/YourName/data.csv') or type 'back' to return to main menu: ").strip()
        # Drop one pair of matching surrounding quotes, e.g. 'data.csv' or "data.csv".
        if len(file_path) >= 2 and file_path[0] == file_path[-1] and file_path[0] in "'\"":
            file_path = file_path[1:-1].strip()
        if file_path.lower() == 'back':
            return None, "Returning to main menu."
        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            print("Error: File not found. Please check the file path and try again.")
        except Exception as e:
            # Any other read/parse failure is reported and the user is asked again.
            print(f"Error loading file: {e}")
        else:
            print()
            print(f"Dataset loaded successfully from {file_path}")
            return df, None

def load_dataset_from_directory():
    """Load the alphabetically first CSV file found in the current directory.

    A file named exactly ``cleaned_data.csv`` is never selected. The
    extension check is case-insensitive (``DATA.CSV`` qualifies) and entries
    that are not regular files are ignored.

    Returns:
        ``(DataFrame, None)`` on success, otherwise ``(None, error_message)``.
    """
    csv_files = [
        f for f in os.listdir('.')
        if f.lower().endswith('.csv') and f != 'cleaned_data.csv' and os.path.isfile(f)
    ]
    if not csv_files:
        # Built from adjacent literals so no source indentation leaks into the message.
        return None, (
            "Error: No CSV files found in the current directory.\n"
            "Place a CSV file like 'sample.csv' in the same folder as this script."
        )

    file_path = min(csv_files)
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        # The file can disappear between the directory listing and the read.
        return None, "Error: File not found."
    except Exception as e:
        return None, f"Error loading file: {e}"
    print()
    print(f"Dataset loaded successfully from {file_path}")
    return df, None

def load_dataset_from_terminal():
    """Read CSV text typed at the terminal and parse it into a DataFrame.

    Rows are collected until the first empty line is entered. The joined
    text is handed to ``pd.read_csv``, so the first row becomes the header.

    Returns:
        ``(DataFrame, None)`` on success, otherwise ``(None, error_message)``.
    """
    instructions = (
        "",
        "",
        "Enter dataset in CSV format (press Enter twice to finish).",
        "",
        "First row will be considered as column names,",
        "You can Enter any number of Column e.g., Name,Age,Salary,....    ",
        "",
        "Then Second and further row will be considered as data rows,",
        "Enter data row in same format as column e.g., 'Ravi,28, ...' or 'Meena,,45000...' or 'Kumar,30,50000...'",
    )
    for text in instructions:
        print(text)

    rows = []
    # The prompt number is always one more than the rows collected so far.
    entry = input(f"Enter row no. {len(rows) + 1}: ")
    while entry != "":
        rows.append(entry)
        entry = input(f"Enter row no. {len(rows) + 1}: ")

    if len(rows) == 0:
        return None, "Error: No data entered. Please provide at least column names and one data row."

    try:
        df = pd.read_csv(StringIO("\n".join(rows)))
        print("Dataset loaded successfully from terminal input.")
        return df, None
    except Exception as err:
        return None, f"Error parsing terminal input: {err}. Ensure correct CSV format."

def display_missing_values(df):
    """Print the per-column count of missing values in *df* and return it.

    Returns:
        A Series indexed by column name holding the number of missing
        entries in each column.
    """
    per_column_missing = df.isna().sum()
    header = "\nMissing values in each column:"
    print(header)
    print(per_column_missing.to_string())
    return per_column_missing

def fill_missing_values(df, method):
    """Fill missing values in the float64/int64 columns of *df*.

    Args:
        df: Source DataFrame. It is not modified; a copy is filled.
        method: One of 'mean', 'median' or 'mode'.

    Returns:
        Always a 3-tuple ``(df_filled, error, fill_values)``.
        On success ``error`` is None and ``fill_values`` maps each processed
        column to the value used for it. For an unknown method ``df_filled``
        is an unmodified copy, ``error`` is a message and ``fill_values`` is
        empty.
    """
    df_filled = df.copy()
    fill_values = {}

    # Validate once, before looking at any column, so the outcome does not
    # depend on whether numeric columns exist, and return the same arity as
    # the success path so callers can always unpack three values.
    if method not in ('mean', 'median', 'mode'):
        return df_filled, f"Invalid method: {method}. Choose 'mean', 'median', or 'mode'.", fill_values

    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for column in numeric_columns:
        series = df[column]
        if method == 'mean':
            fill_value = series.mean()
        elif method == 'median':
            fill_value = series.median()
        else:
            modes = series.mode()
            fill_value = modes.iloc[0] if not modes.empty else None
        # A column with no observed values has no usable statistic (mean and
        # median are NaN, mode is empty); leave it untouched and unreported.
        if fill_value is None or pd.isna(fill_value):
            continue
        df_filled[column] = df_filled[column].fillna(fill_value)
        fill_values[column] = fill_value
    return df_filled, None, fill_values

def save_cleaned_dataset(df):
    """Write *df* to ``cleaned_data.csv`` in the current directory.

    The index is not written. Success or failure is reported on stdout;
    nothing is returned and no exception is propagated.
    """
    target = "cleaned_data.csv"
    try:
        df.to_csv(target, index=False)
        confirmation = f"\nCleaned dataset saved to {target}"
        print(confirmation)
    except Exception as exc:
        print(f"Error saving file: {exc}")

def main():
    """Run the interactive menu loop of the Missing Data Cleaner.

    Each pass shows the menu, loads a dataset through the chosen loader,
    prints it with its missing-value counts, asks for a fill method, prints
    the cleaned result and the fill values, and saves the cleaned dataset.
    The loop ends only when the user chooses option 4.
    """
    # Menu choice -> loader. Every loader returns a (df, message) pair where
    # df is None when nothing was loaded.
    loaders = {
        '1': load_dataset_from_path,
        '2': load_dataset_from_directory,
        '3': load_dataset_from_terminal,
    }
    valid_methods = ('mean', 'median', 'mode')

    while True:
        print()
        print("Welcome to the Missing Data Cleaner!")
        print("This tool cleans missing values in a dataset using mean, median, or mode.")
        print("\nChoose how to provide your dataset:")
        print("1. Enter a CSV file path (e.g., 'sample.csv' or 'C:/Users/YourName/data.csv')")
        print("2. Use a CSV file from the current directory (e.g., 'sample.csv')")
        print("3. Enter data manually in the terminal")
        print("4. Exit")

        # Stray spaces around the answer should not make a valid choice invalid.
        choice = input("Enter choice (1, 2, 3, or 4): ").strip()

        if choice == '4':
            print("Exiting the program. Goodbye!")
            return

        loader = loaders.get(choice)
        if loader is None:
            print("Invalid choice! Please enter 1, 2, 3, or 4. Try again.")
            continue

        df, error = loader()
        if df is None:
            print(error)
            print("Please try again or choose 4 to exit.")
            continue

        # Displaying dataset and missing values
        print("\nOriginal dataset:")
        print(df.to_string())
        display_missing_values(df)

        # Keep asking until a valid method has been applied.
        while True:
            print("\nChoose a method to fill missing values (enter 'mean', 'median', or 'mode'):")
            print("Example: mean")
            method = input("Enter method: ").strip().lower()
            if method not in valid_methods:
                print("Invalid method! Please enter 'mean', 'median', or 'mode'. Try again.")
                continue

            # Unpack defensively: accept a (df, error) pair as well as a
            # (df, error, fill_values) triple so an error result cannot
            # crash the menu with an unpacking failure.
            outcome = fill_missing_values(df, method)
            df_cleaned, fill_error = outcome[0], outcome[1]
            fill_values = outcome[2] if len(outcome) > 2 else {}
            if fill_error:
                print(fill_error)
                print("Please try again or choose 4 to exit from the main menu.")
                continue

            # Displaying cleaned dataset and fill values
            print("\nCleaned dataset:")
            print(df_cleaned.to_string())
            print("\nFill values used:")
            for column, value in fill_values.items():
                print(f"{column}: {value}")

            # Saving cleaned dataset
            save_cleaned_dataset(df_cleaned)
            print("=============================================================================")
            print("\nReturning to main menu.")
            break

# Start the interactive cleaner only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Welcome to the Missing Data Cleaner!
This tool cleans missing values in a dataset using mean, median, or mode.

Choose how to provide your dataset:
1. Enter a CSV file path (e.g., 'sample.csv' or 'C:/Users/YourName/data.csv')
2. Use a CSV file from the current directory (e.g., 'sample.csv')
3. Enter data manually in the terminal
4. Exit
Enter choice (1, 2, 3, or 4): 3


Enter dataset in CSV format (press Enter twice to finish).

First row will be considered as column names,
You can Enter any number of Column e.g., Name,Age,Salary,....    

Then Second and further row will be considered as data rows,
Enter data row in same format as column e.g., 'Ravi,28, ...' or 'Meena,,45000...' or 'Kumar,30,50000...'
Enter row no. 1: Name,Age,Salary
Enter row no. 2: Ravi,28,
Enter row no. 3: Meena,,45000
Enter row no. 4: Kumar,30,50000
Enter row no. 5: 
Dataset loaded successfully from terminal input.

Original dataset:
    Name   Age   Salary
0   Ravi  28.0      NaN
1  Meena   NaN  45000.0
2  Kumar