In [None]:
# Task 1: Load a CSV Dataset
# Description: Load a CSV file into a Pandas DataFrame and print the first five rows to understand the structure of the dataset.





In [None]:
# Task 2: Check for Missing Values
# Description: Identify and list the columns with missing values and the number of missing values in each.



In [None]:
# Task 3: Visualize Missing Data
# Description: Use a heatmap to visualize the missing values in the dataset.





In [None]:
# Task 4: Remove Columns with Many Missing Values
# Description: Drop columns that have more than 50% missing values.




In [None]:
# Task 5: Identify Duplicate Rows
# Description: Check for and display any duplicate rows in the dataset.




In [None]:
# Task 6: Remove Duplicate Rows
# Description: Remove duplicate rows from the dataset and verify that they have been removed.




In [None]:
# Task 7: Check Data Inconsistencies
# Description: Identify inconsistencies in categorical columns, such as differing text cases or trailing spaces.




In [None]:
# Task 8: Get Summary of Data Quality
# Description: Generate a summary of data quality including total records, number of duplicate rows, and columns with missing values.




In [None]:
# Task 9: Generate a Data Quality Report
# Description: Create a comprehensive data quality report that includes not only missing values but also basic statistics for numerical columns and the distribution of categorical columns.




In [None]:
# Task 10: Advanced Data Imputation
# Description: Perform advanced data imputation by replacing missing values in numerical columns with the mean and categorical columns with the mode.





In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
#suppress warnings
warnings.filterwarnings('ignore')

def load_dataset(file_path):
    """
    Loads a CSV file into a Pandas DataFrame.

    Parameters:
    file_path (str): The path to the CSV file.

    Returns:
    pandas.DataFrame: The loaded DataFrame, or None if an error occurs.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset successfully loaded from {file_path}")
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred while loading the dataset: {e}")
        return None

def check_missing_values(df):
    """
    Checks for missing values in the DataFrame.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.

    Returns:
    pandas.Series: A Series containing the count of missing values for each column,
                    or None if the input is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input is not a Pandas DataFrame.")
        return None

    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]  # Filter out columns with no missing values
    if missing_values.empty:
        print("No missing values found in the dataset.")
        return None
    else:
        print("Columns with missing values:")
        print(missing_values)
        return missing_values

def visualize_missing_data(df):
    """
    Visualizes missing data using a heatmap.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input is not a Pandas DataFrame. Cannot visualize.")
        return

    if df.isnull().sum().sum() == 0:
        print("No missing data to visualize.")
        return

    plt.figure(figsize=(10, 6))
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
    plt.title('Missing Data Heatmap')
    plt.show()

def remove_columns_with_many_missing_values(df, threshold=0.5):
    """
    Removes columns with a high percentage of missing values.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    threshold (float): The threshold for the percentage of missing values (0 to 1).
                     Columns with missing values above this threshold will be dropped.
                     Defaults to 0.5 (50%).

    Returns:
    pandas.DataFrame: The DataFrame with columns removed, or None if input is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input is not a Pandas DataFrame.")
        return None

    if not 0 <= threshold <= 1:
        print("Error: Threshold must be between 0 and 1.")
        return df  # Return original DataFrame if threshold is invalid

    missing_percentage = df.isnull().sum() / len(df)
    columns_to_drop = missing_percentage[missing_percentage > threshold].index
    if columns_to_drop.empty:
        print("No columns found with missing values exceeding the threshold.")
        return df
    else:
        df_dropped = df.drop(columns=columns_to_drop, axis=1)
        print(f"Dropped columns: {list(columns_to_drop)}")
        print("DataFrame shape after dropping columns:", df_dropped.shape)
        return df_dropped

def check_duplicate_rows(df):
    """
    Checks for duplicate rows in the DataFrame.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.

    Returns:
    pandas.DataFrame: A DataFrame containing the duplicate rows, or None if input is not a DataFrame.
                     Returns an empty DataFrame if no duplicates are found.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input is not a Pandas DataFrame.")
        return None

    duplicate_rows = df[df.duplicated()]
    if duplicate_rows.empty:
        print("No duplicate rows found.")
        return duplicate_rows  # Return empty DataFrame
    else:
        print("Duplicate rows:")
        print(duplicate_rows)
        return duplicate_rows

def remove_duplicate_rows(df):
    """
    Removes duplicate rows from the DataFrame.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.

    Returns:
    pandas.DataFrame: The DataFrame with duplicate rows removed, or None if the input is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input is not a Pandas DataFrame.")
        return None

    df_no_duplicates = df.drop_duplicates()
    if df_no_duplicates.shape[0] < df.shape[0]:
        print(f"Removed {df.shape[0] - df_no_duplicates.shape[0]} duplicate rows.")
        print("DataFrame shape after removing duplicates:", df_no_duplicates.shape)
        return df_no_duplicates
    else:
        print("No duplicate rows to remove.")
        return df_no_duplicates

def check_data_inconsistencies(df):
    """
    Checks for inconsistencies in categorical columns.  Specifically, it checks for
    leading/trailing whitespace and different capitalization.  It prints the unique
    values of each categorical column after stripping whitespace and converting
    to lowercase, if any inconsistencies are found.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input is not a Pandas DataFrame.")
        return

    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if categorical_cols.empty:
        print("No categorical columns found.")
        return

    inconsistencies_found = False
    for col in categorical_cols:
        # Create a temporary version of the column with cleaned data for comparison
        cleaned_values = df[col].astype(str).str.strip().str.lower()
        unique_values = cleaned_values.unique()

        # Check if the number of unique values in the cleaned version is less than
        # the number of unique values in the original column.  If they are different,
        # then there were inconsistencies.
        if len(unique_values) < len(df[col].unique()):
            inconsistencies_found = True
            print(f"Inconsistencies found in column '{col}':")
            print(f"  Unique values after cleaning: {unique_values}")

    if not inconsistencies_found:
        print("No inconsistencies found in categorical columns.")



def get_summary_of_data_quality(df):
    """
    Generates a summary of data quality.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input is not a Pandas DataFrame.")
        return

    total_records = len(df)
    duplicate_rows = df[df.duplicated()].shape[0]
    missing_values = df.isnull().sum()
    cols_with_missing_values = missing_values[missing_values > 0].index.tolist()

    print("Data Quality Summary:")
    print(f"  Total number of records: {total_records}")
    print(f"  Number of duplicate rows: {duplicate_rows}")
    if cols_with_missing_values:
        print("  Columns with missing values:")
        for col in cols_with_missing_values:
            print(f"    {col}: {missing_values[col]} missing values")
    else:
        print("  No columns with missing values.")

def generate_data_quality_report(df):
    """
    Generates a comprehensive data quality report.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input is not a Pandas DataFrame.")
        return

    print("Data Quality Report")
    print("-----------------------")

    # Basic information
    print(f"Total number of records: {len(df)}")
    print(f"Number of columns: {df.shape[1]}")
    print(f"Column names: {df.columns.tolist()}")
    print(f"Data types:\n{df.dtypes}")
    print("\n")

    # Missing values
    missing_values = df.isnull().sum()
    total_missing = missing_values.sum()
    print(f"Total missing values: {total_missing}")
    if total_missing > 0:
        print("Missing values per column:")
        print(missing_values[missing_values > 0])
        print("\n")

    # Duplicate rows
    duplicate_count = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicate_count}")
    if duplicate_count > 0:
        print("First 5 duplicate rows:")
        print(df[df.duplicated()].head().to_markdown(index=False, numalign="left", stralign="left"))
        print("\n")

    # Statistics for numerical columns
    numerical_cols = df.select_dtypes(include=['number']).columns
    if not numerical_cols.empty:
        print("Statistics for numerical columns:")
        print(df[numerical_cols].describe().to_markdown(numalign="left", stralign="left"))
        print("\n")

    # Distribution of categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if not categorical_cols.empty:
        print("Distribution of categorical columns:")
        for col in categorical_cols:
            print(f"\nColumn: {col}")
            print(df[col].value_counts().to_markdown(numalign="left", stralign="left"))

def advanced_data_imputation(df):
    """
    Performs advanced data imputation by replacing missing values in numerical
    columns with the mean and categorical columns with the mode.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.

    Returns:
    pandas.DataFrame: The DataFrame with imputed values, or None if the input is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print("Error: Input is not a Pandas DataFrame.")
        return None

    df_imputed = df.copy() # Create a copy to avoid modifying the original DataFrame in place.

    for col in df_imputed.columns:
        if pd.api.types.is_numeric_dtype(df_imputed[col]):
            # Impute numerical columns with the mean
            df_imputed[col] = df_imputed[col].fillna(df_imputed[col].mean())
            print(f"Imputed missing values in numerical column '{col}' with the mean.")
        elif pd.api.types.is_categorical_dtype(df_imputed[col]) or pd.api.types.is_object_dtype(df_imputed[col]):
            # Impute categorical/object columns with the mode
            mode_val = df_imputed[col].mode()[0]  # Get the first mode in case of ties
            df_imputed[col] = df_imputed[col].fillna(mode_val)
            print(f"Imputed missing values in categorical/object column '{col}' with the mode: {mode_val}")

    print("Advanced data imputation complete.")
    return df_imputed

def main():
    """
    Main function to execute the data quality analysis tasks.
    """
    # Load the dataset
    file_path = 'swiggy.csv'  # Use the correct file path
    df = load_dataset(file_path)

    if df is None:
        return  # Exit if the dataset failed to load

    # Perform data quality checks and transformations
    check_missing_values(df)
    visualize_missing_data(df)
    df = remove_columns_with_many_missing_values(df)
    check_duplicate_rows(df)
    df = remove_duplicate_rows(df)
    check_data_inconsistencies(df)
    get_summary_of_data_quality(df)
    generate_data_quality_report(df)
    df = advanced_data_imputation(df) # Impute missing values

    # Print the first 5 rows of the cleaned and imputed DataFrame
    print("\nFirst 5 rows of the cleaned and imputed DataFrame:")
    print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

if __name__ == "__main__":
    main()


Dataset successfully loaded from swiggy.csv
No missing values found in the dataset.
No missing data to visualize.
No columns found with missing values exceeding the threshold.
No duplicate rows found.
No duplicate rows to remove.
Inconsistencies found in column 'Area':
  Unique values after cleaning: ['koramangala' 'jogupalya' 'indiranagar' 'domlur' 'cooke town'
 'pulikeshi nagar' 'sivanchetti gardens' 'kodihalli' 'jayanagar'
 'film nagar' 'banashankari' 'masab tank' 'banjara hills' 'andheri east'
 'powai' 'punjagutta' 'aundh' 'baner' 'powai area' 'ramgopalpet'
 'kalasiguda' 'adarsh nagar' 'himayatnagar' 'ashok nagar'
 'commercial street' 'richmond town' 'vasanth nagar' 'bhowanipore'
 'ballygunge' 'gariahat' 'kalighat' 'sion' 'mumbai' 'rajajinagar'
 'golpark' 'shivajinagar' 'koregaon park' 'deccan gymkhana' 'karkhana'
 'kothrud' 'erandwane' 'koti' 'dilsukhnagar' 'nagole' 'chandrapuri colony'
 'kothapet' 'narayanguda' 'fc road' 'park street area' 'beniapukur'
 'bidhannagar' 'new nallaku

ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.