### Garbage In, Garbage Out (GIGO): Cleaning Missing Data
**Description**: Load a dataset (e.g., Titanic dataset) and identify missing values. Use
appropriate techniques to handle these missing values.

In [None]:
# Write your code from here
# Link: https://github.com/amruthareddy-zenV/AI_DATA_ANALYSIS_

In [1]:
import pandas as pd
import numpy as np

# URL of the raw Titanic train.csv dataset (a publicly available copy on GitHub).
titanic_url = "https://raw.githubusercontent.com/dsindy/kaggle-titanic/master/data/train.csv"

# Fraction of missing values above which 'Cabin' is dropped rather than imputed.
CABIN_DROP_THRESHOLD = 0.7


def missing_summary(frame):
    """Summarise the missing values of ``frame`` column by column.

    Parameters
    ----------
    frame : pandas.DataFrame
        The data to inspect. It is not modified.

    Returns
    -------
    tuple
        ``(counts, percentages, report)``: ``counts`` is the number of nulls
        per column, ``percentages`` is that count as a percentage of
        ``len(frame)``, and ``report`` is a DataFrame holding both, restricted
        to columns with at least one null and sorted by count, descending.
    """
    counts = frame.isnull().sum()
    percentages = (counts / len(frame)) * 100
    report = pd.DataFrame({
        'Missing Count': counts,
        'Missing Percentage (%)': percentages
    })
    report = report[report['Missing Count'] > 0].sort_values(
        by='Missing Count', ascending=False
    )
    return counts, percentages, report


def fill_missing(frame, column, value):
    """Return a copy of ``frame`` with nulls in ``column`` replaced by ``value``.

    The filled column is assigned back explicitly instead of calling
    ``frame[column].fillna(value, inplace=True)``: that chained in-place form
    operates on an intermediate object and, under pandas Copy-on-Write, leaves
    ``frame`` unchanged.
    """
    filled = frame.copy()
    filled[column] = filled[column].fillna(value)
    return filled


print("Loading the Titanic dataset...")
try:
    # Only the load is guarded: network/file problems surface as OSError and
    # parsing/decoding problems as ValueError subclasses. Errors in the
    # cleaning logic below are deliberately NOT swallowed.
    df = pd.read_csv(titanic_url)
except (OSError, ValueError) as e:
    print(f"An error occurred: {e}")
else:
    print("Dataset loaded successfully.")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())

    print("\n--- Identifying Missing Values ---")
    missing_values, missing_percentage, missing_info = missing_summary(df)

    if not missing_info.empty:
        print("Columns with missing values:")
        print(missing_info)
    else:
        print("No missing values found in the dataset.")

    print("\n--- Handling Missing Values ---")

    # Strategy 1: impute 'Age' with the median.
    # 'Age' is numerical; the median is less sensitive to skew than the mean.
    if 'Age' in df.columns and df['Age'].isnull().any():
        median_age = df['Age'].median()
        df = fill_missing(df, 'Age', median_age)
        print(f"Missing 'Age' values imputed with the median: {median_age:.2f}")
    else:
        print("'Age' column not found or has no missing values.")

    # Strategy 2: impute 'Embarked' with the mode (most frequent value).
    # 'Embarked' is categorical, so the mode is a suitable fill value.
    if 'Embarked' in df.columns and df['Embarked'].isnull().any():
        embarked_modes = df['Embarked'].mode()
        if embarked_modes.empty:
            # Every value is null, so there is no mode to impute with.
            print("'Embarked' has no non-missing values; skipping mode imputation.")
        else:
            # .mode() can return several values on ties; take the first.
            mode_embarked = embarked_modes.iloc[0]
            df = fill_missing(df, 'Embarked', mode_embarked)
            print(f"Missing 'Embarked' values imputed with the mode: '{mode_embarked}'")
    else:
        print("'Embarked' column not found or has no missing values.")

    # Strategy 3: drop 'Cabin' when too much of it is missing to impute meaningfully.
    if 'Cabin' in df.columns:
        # Mean of the boolean null mask == fraction of missing values.
        if df['Cabin'].isnull().mean() > CABIN_DROP_THRESHOLD:
            df = df.drop(columns='Cabin')
            print("Dropped 'Cabin' column due to a high percentage of missing values.")
        else:
            print("'Cabin' column exists but does not meet the threshold for dropping based on missing values.")
    else:
        print("'Cabin' column not found.")

    print("\n--- Verifying Missing Values After Cleaning ---")
    (
        missing_values_after_cleaning,
        missing_percentage_after_cleaning,
        missing_info_after_cleaning,
    ) = missing_summary(df)

    if not missing_info_after_cleaning.empty:
        print("Columns with remaining missing values:")
        print(missing_info_after_cleaning)
    else:
        print("All specified missing values have been handled.")

    print("\nCleaned dataset info:")
    df.info()


Loading the Titanic dataset...
Dataset loaded successfully.

First 5 rows of the dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0          