In [None]:
# Part 1: Load a Dataset & Check Missing Values

# Task 1: Titanic Dataset
# - Load the dataset using Pandas.
# - Check for missing values in the 'Age' column.







# Part 2: Identify Duplicates & Inconsistencies

# Task 2: Duplicate Rows in Titanic Dataset
# - Identify any duplicate rows in the dataset.










# Part 3: Generate a Data Quality Report

# Task 3: Titanic Dataset Overview
# - Create a simple report of missing values, duplicates, and some basic statistics for the Titanic dataset.







In [1]:
import pandas as pd

def analyze_titanic_data(file_path="titanic.csv"):
    """
    Analyzes the Titanic dataset for missing values, duplicates,
    and generates a basic data quality report.

    Args:
        file_path (str, optional): Path to the Titanic dataset CSV file.
            Defaults to "titanic.csv".  If the file is not found,
            it will attempt to use a placeholder DataFrame.

    Returns:
        pandas.DataFrame: Returns the original dataframe. Prints a data quality report
        to the console.  Returns None if a critical error occurs.
    """
    try:
        # Task 1: Load the dataset using Pandas
        df = pd.read_csv(file_path)
        print(f"Loaded dataset from: {file_path}")  # Inform about the file loaded
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}.  Using placeholder data.")
        # Create a placeholder DataFrame with the specified columns
        df = pd.DataFrame({
            'PassengerId': [1, 2, 3, 4, 5],
            'Survived': [0, 1, 1, 0, 1],
            'Pclass': [3, 1, 3, 1, 3],
            'Name': ['Owen Harris Braund', 'Florence Briggs Thayer', 'John Edward Brown', 'William Henry Allen', 'James Moran'],
            'Sex': ['male', 'female', 'female', 'male', 'male'],
            'Age': [22.0, 38.0, 26.0, 35.0, None],
            'SibSp': [1, 1, 0, 0, 0],
            'Parch': [0, 0, 0, 0, 0],
            'Ticket': ['A/5 21151', 'PC 17599', 'STON/O2. 3101282', '373450', '330877'],
            'Fare': [7.25, 71.2833, 7.925, 8.05, 8.4583],
            'Cabin': [None, 'C85', None, None, None],
            'Embarked': ['S', 'C', 'S', 'S', 'Q']
        })

    # Check if the DataFrame is empty
    if df.empty:
        print("Error: The DataFrame is empty.  Please check the data source.")
        return None

    # --- Data Quality Analysis ---
    print("\n--- Data Quality Report for Titanic Dataset ---")

    # Task 1: Check for missing values in the 'Age' column.
    print("\nMissing Values in 'Age' column:")
    print(df['Age'].isnull().sum())

    # Task 2: Identify any duplicate rows in the dataset.
    print("\nDuplicate Rows:")
    duplicate_rows = df.duplicated().sum()
    print(duplicate_rows)

    # Task 3: Generate a basic statistics for the Titanic dataset.
    print("\nBasic Statistics:")
    print(df.describe())

    # Print the first few rows of the dataframe
    print("\nFirst 5 rows of the dataframe:")
    print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

    return df  # Return the DataFrame for further use if needed.

if __name__ == "__main__":
    # You can specify the file path if it's not the default "titanic.csv"
    titanic_df = analyze_titanic_data() # Uses default
    # titanic_df = analyze_titanic_data("path/to/your/titanic.csv")
    if titanic_df is not None:
        print("\nAnalysis Complete.")


Loaded dataset from: titanic.csv

--- Data Quality Report for Titanic Dataset ---

Missing Values in 'Age' column:
177

Duplicate Rows:
0

Basic Statistics:
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   

ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.