In [None]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Iris Dataset
# - Load the dataset using Pandas.
# - Check for missing values in the entire dataset.








# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Inconsistent Entries in a Sample Dataset
# - Assume you have a dataset with a 'Gender' column. Identify inconsistent entries like 'M', 'Male', or 'male'.









# Part 3: Generate a Data Quality Report

# Task 3: Iris Dataset Summary
# - Generate basic descriptive statistics for the Iris dataset.






In [1]:
import pandas as pd
import numpy as np

def analyze_iris_data(file_path="iris-data.csv"):
    """
    Analyzes the Iris dataset for missing values, identifies
    inconsistent 'Gender' entries (in a sample dataset), and
    generates a basic data quality report.

    Args:
        file_path (str, optional): Path to the Iris dataset CSV file.
            Defaults to "iris.csv". If the file is not found, it will
            attempt to use a placeholder DataFrame.

    Returns:
        pandas.DataFrame: Returns the original Iris dataframe. Prints a data quality report
        to the console. Returns None on error.
    """
    try:
        # Task 1: Load the Iris dataset using Pandas
        df_iris = pd.read_csv(file_path)
        print(f"Loaded Iris dataset from: {file_path}")
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Using placeholder data.")
        # Create placeholder Iris data.  Important to have some data for demonstration.
        df_iris = pd.DataFrame({
            'SepalLengthCm': [5.1, 4.9, 7.0, 6.4, 5.7],
            'SepalWidthCm': [3.5, 3.0, 3.2, 3.2, 3.8],
            'PetalLengthCm': [1.4, 1.4, 4.7, 4.5, 1.7],
            'PetalWidthCm': [0.2, 0.2, 1.4, 1.5, 0.4],
            'Species': ['Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa'],
            'Gender': ['Male', 'Female', 'M', 'F', 'Male'],  # Added for Task 2
            'Age': [20, 21, 22, 23, 24],
            'Score': [80, 90, 75, 85, 92],
        })

    # Check if the dataframe is empty
    if df_iris.empty:
        print("Error: The Iris DataFrame is empty.  Please check the data source.")
        return None

    # --- Data Quality Analysis for Iris Dataset ---
    print("\n--- Data Quality Report for Iris Dataset ---")

    # Task 1: Check for missing values in the entire Iris dataset.
    print("\nMissing Values in Iris Dataset:")
    print(df_iris.isnull().sum())

    # Task 2: Identify Inconsistent Entries
    print("\nInconsistent Entries in 'Gender' Column (Sample):")
    inconsistent_gender_entries = ['M', 'Male', 'male', 'F', 'Female', 'female']
    if 'Gender' in df_iris.columns:  # Check if 'Gender' column exists
        unique_gender_values = df_iris['Gender'].unique()
        inconsistent_entries = [entry for entry in unique_gender_values if entry in inconsistent_gender_entries] # corrected logic
        if inconsistent_entries:
            print(f"Inconsistent entries found: {inconsistent_entries}")
        else:
            print("No inconsistent gender entries found.")
    else:
        print("'Gender' column not found. Skipping inconsistency check.")

    # Task 3: Generate basic descriptive statistics for the Iris dataset.
    print("\nBasic Descriptive Statistics for Iris Dataset:")
    #  Only describe numeric columns
    numeric_df_iris = df_iris.select_dtypes(include=np.number)
    print(numeric_df_iris.describe())

    # Print the first few rows of the dataframe
    print("\nFirst 5 rows of the Iris dataframe:")
    print(df_iris.head().to_markdown(index=False, numalign="left", stralign="left"))
    return df_iris

if __name__ == "__main__":
    # You can specify the file path if it's not the default "iris.csv"
    iris_df = analyze_iris_data() # Uses default
    # iris_df = analyze_iris_data("path/to/your/iris.csv")
    if iris_df is not None:
        print("\nAnalysis Complete.")


Loaded Iris dataset from: iris-data.csv

--- Data Quality Report for Iris Dataset ---

Missing Values in Iris Dataset:
sepal length    0
sepal width     0
petal length    0
petal width     0
class           0
dtype: int64

Inconsistent Entries in 'Gender' Column (Sample):
'Gender' column not found. Skipping inconsistency check.

Basic Descriptive Statistics for Iris Dataset:
       sepal length  sepal width  petal length  petal width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

First 5 rows of the Iris dataframe:


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.