In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Set display options for better visibility
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

In [7]:
import pandas as pd

# Original Google Drive sharing link
gdrive_url = 'https://drive.google.com/file/d/1MhjFdqO1ukPBjRZztBVbCjFB-VwSLC3H/view?usp=sharing'

# Extract the File ID from the URL
# The file ID is the part right after '/d/' and before '/view'
file_id = gdrive_url.split('/')[-2]

# Construct the direct download URL
download_url = f'https://drive.google.com/uc?export=download&id={file_id}'

# Load the data using the direct download URL
# Note: Ensure the file is set to 'Anyone with the link' can access it on Google Drive.
try:
    df = pd.read_csv(download_url)
    print("DataFrame loaded successfully!")
    print(df.head())
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    print("Please ensure your Google Drive file is set to 'Anyone with the link' access.")

DataFrame loaded successfully!
   _code        _sampling_date       state city_town_village_area  \
0    NaN  2014-07-01T12:48:04Z   Meghalaya                  Dawki   
1    NaN  2014-01-01T12:25:16Z         Goa                 Panaji   
2    NaN  2010-01-01T04:37:47Z  Chandigarh             Chandigarh   
3    NaN                   NaN  Chandigarh             Chandigarh   
4    NaN  2004-02-01T04:41:29Z       Assam               Guwahati   

                      location_of_monitoring_station  \
0  Terrace building, Dawki, Jaintia Hills Distric...   
1  Infront of Old GSPCB premises, Patto, Panaji, Goa   
2          Modern Foods, Industrial Area, Chandigarh   
3                                                NaN   
4                Head Office, Bamunimaidan, Guwahati   

                                    agency  \
0  Meghalaya State Pollution Control Board   
1        Goa State Pollution Control Board   
2   Chandigarh Pollution Control Committee   
3          Central Pollution Cont

In [8]:
# Display basic information about the dataset
print("=" * 80)
print("INITIAL DATA OVERVIEW")
print("=" * 80)
print(f"\nDataset Shape: {df.shape}")
print(f"Total Records: {df.shape[0]:,}")
print(f"Total Columns: {df.shape[1]}")

print("\n" + "=" * 80)
print("COLUMN NAMES AND DATA TYPES")
print("=" * 80)
print(df.dtypes)

print("\n" + "=" * 80)
print("FIRST 5 ROWS")
print("=" * 80)
print(df.head())

print("\n" + "=" * 80)
print("DATA CLEANING PROCESS")
print("=" * 80)


INITIAL DATA OVERVIEW

Dataset Shape: (435972, 11)
Total Records: 435,972
Total Columns: 11

COLUMN NAMES AND DATA TYPES
_code                             float64
_sampling_date                     object
state                              object
city_town_village_area             object
location_of_monitoring_station     object
agency                             object
type_of_location                   object
so2                               float64
no2                               float64
rspm_pm10                         float64
pm_2_5                            float64
dtype: object

FIRST 5 ROWS
   _code        _sampling_date       state city_town_village_area  \
0    NaN  2014-07-01T12:48:04Z   Meghalaya                  Dawki   
1    NaN  2014-01-01T12:25:16Z         Goa                 Panaji   
2    NaN  2010-01-01T04:37:47Z  Chandigarh             Chandigarh   
3    NaN                   NaN  Chandigarh             Chandigarh   
4    NaN  2004-02-01T04:41:29Z       Assam  

In [9]:
# Step 1: Handle 'NA' strings and convert to proper NaN
print("\n1. Converting 'NA' strings to NaN...")
df = df.replace('NA', np.nan)


1. Converting 'NA' strings to NaN...


In [12]:
# Step 2: Parse and clean _sampling_date column
print("\n2. Parsing and cleaning _sampling_date column...")

def parse_date(date_str):
    """Parse various date formats"""
    if pd.isna(date_str):
        return np.nan
    try:
        # Try ISO format with timezone
        return pd.to_datetime(date_str, format='%Y-%m-%dT%H:%M:%SZ')
    except:
        try:
            # Try without timezone
            return pd.to_datetime(date_str)
        except:
            return np.nan

df['_sampling_date'] = df['_sampling_date'].apply(parse_date)
print(f"   - Valid dates: {df['_sampling_date'].notna().sum():,}")
print(f"   - Missing dates: {df['_sampling_date'].isna().sum():,}")

# Extract date components into separate columns
print("\n3. Extracting date components...")
# Convert to Int64 (nullable integer) to handle NaN values properly
df['year'] = df['_sampling_date'].dt.year.astype('Int64')
df['month'] = df['_sampling_date'].dt.month.astype('Int64')  
df['day'] = df['_sampling_date'].dt.day.astype('Int64')
df['time'] = df['_sampling_date'].dt.time

# Optional: Also extract hour, minute, second if needed
df['hour'] = df['_sampling_date'].dt.hour.astype('Int64')
df['minute'] = df['_sampling_date'].dt.minute.astype('Int64')
df['second'] = df['_sampling_date'].dt.second.astype('Int64')

print(f"   - Date components extracted successfully")
print(f"   - Sample of extracted components:")
print(df[['_sampling_date', 'year', 'month', 'day', 'time', 'hour', 'minute', 'second']].head())

# Check data types
print(f"\n   - Data types:")
print(df[['year', 'month', 'day', 'time', 'hour', 'minute', 'second']].dtypes)


2. Parsing and cleaning _sampling_date column...
   - Valid dates: 386,992
   - Missing dates: 48,980

3. Extracting date components...
   - Date components extracted successfully
   - Sample of extracted components:
       _sampling_date  year  month   day      time  hour  minute  second
0 2014-07-01 12:48:04  2014      7     1  12:48:04    12      48       4
1 2014-01-01 12:25:16  2014      1     1  12:25:16    12      25      16
2 2010-01-01 04:37:47  2010      1     1  04:37:47     4      37      47
3                 NaT  <NA>   <NA>  <NA>       NaT  <NA>    <NA>    <NA>
4 2004-02-01 04:41:29  2004      2     1  04:41:29     4      41      29

   - Data types:
year       Int64
month      Int64
day        Int64
time      object
hour       Int64
minute     Int64
second     Int64
dtype: object


In [None]:
# Step 3: Convert pollutant columns to numeric
print("\n3. Converting pollutant columns to numeric...")
pollutant_cols = ['so2', 'no2', 'rspm_pm10', 'pm_2_5']
for col in pollutant_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        print(f"   - {col}: {df[col].notna().sum():,} valid values")


3. Converting pollutant columns to numeric...
   - so2: 401,305 valid values
   - no2: 418,605 valid values
   - rspm_pm10: 395,521 valid values
   - pm_2_5: 9,314 valid values


In [14]:
# Step 4: Clean text columns
print("\n4. Cleaning text columns...")
text_cols = ['state', 'city_town_village_area', 'location_of_monitoring_station', 
             'agency', 'type_of_location']
for col in text_cols:
    if col in df.columns:
        # Strip whitespace and convert to title case
        df[col] = df[col].str.strip()
        # Replace empty strings with NaN
        df[col] = df[col].replace('', np.nan)


4. Cleaning text columns...


In [15]:
# Step 5: Missing Values Analysis
print("\n" + "=" * 80)
print("MISSING VALUES ANALYSIS")
print("=" * 80)
missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isna().sum(),
    'Missing_Percentage': (df.isna().sum() / len(df) * 100).round(2)
})
missing_df = missing_df.sort_values('Missing_Percentage', ascending=False)
print(missing_df.to_string(index=False))


MISSING VALUES ANALYSIS
                        Column  Missing_Count  Missing_Percentage
                         _code         435972              100.00
                        pm_2_5         426658               97.86
                        agency         149507               34.29
                        minute          48980               11.23
                          hour          48980               11.23
                          time          48980               11.23
                           day          48980               11.23
                         month          48980               11.23
                          year          48980               11.23
                        second          48980               11.23
                _sampling_date          48980               11.23
                     rspm_pm10          40451                9.28
                           so2          34667                7.95
location_of_monitoring_station          28330      

In [None]:
# Drop columns with high missing values
print("\n6. Dropping columns with excessive missing values...")

columns_to_drop = ['_code', 'agency']
print(f"   - Dropping columns: {columns_to_drop}")

# Check if columns exist before dropping
existing_columns = [col for col in columns_to_drop if col in df.columns]
missing_columns = [col for col in columns_to_drop if col not in df.columns]

if missing_columns:
    print(f"   - Warning: These columns were not found: {missing_columns}")

if existing_columns:
    # Store original shape
    original_shape = df.shape
    
    # Drop the columns
    df = df.drop(columns=existing_columns)
    
    print(f"   - Successfully dropped: {existing_columns}")
    print(f"   - Dataset shape: {original_shape} → {df.shape}")
    print(f"   - Columns removed: {len(existing_columns)}")
else:
    print("   - No columns to drop")

# Display remaining columns
print(f"\n   - Remaining columns ({len(df.columns)}): {list(df.columns)}")


4. Dropping columns with excessive missing values...
   - Dropping columns: ['_code', 'agency']
   - Successfully dropped: ['_code', 'agency']
   - Dataset shape: (435972, 18) → (435972, 16)
   - Columns removed: 2

   - Remaining columns (16): ['_sampling_date', 'state', 'city_town_village_area', 'location_of_monitoring_station', 'type_of_location', 'so2', 'no2', 'rspm_pm10', 'pm_2_5', 'year', 'month', 'day', 'time', 'hour', 'minute', 'second']


In [18]:
# Step 7: Data Quality Checks for Pollutants
print("\n" + "=" * 80)
print("POLLUTANT DATA QUALITY CHECKS")
print("=" * 80)

for col in pollutant_cols:
    if col in df.columns and df[col].notna().any():
        print(f"\n{col.upper()}:")
        print(f"  Count: {df[col].notna().sum():,}")
        print(f"  Mean: {df[col].mean():.2f}")
        print(f"  Median: {df[col].median():.2f}")
        print(f"  Min: {df[col].min():.2f}")
        print(f"  Max: {df[col].max():.2f}")
        print(f"  Std: {df[col].std():.2f}")
        
        # Check for negative values (invalid for pollutants)
        negative_count = (df[col] < 0).sum()
        if negative_count > 0:
            print(f"  ⚠️ Warning: {negative_count} negative values found!")
            df.loc[df[col] < 0, col] = 0
            print(f"     - Set negative values to 0")
        
        # Check for outliers using IQR method
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 3 * IQR
        upper_bound = Q3 + 3 * IQR
        outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
        if outliers > 0:
            print(f"  ⚠️ Potential extreme outliers: {outliers} values")
            print(f"     - Outside range [{lower_bound:.2f}, {upper_bound:.2f}]")


POLLUTANT DATA QUALITY CHECKS

SO2:
  Count: 401,304
  Mean: 10.84
  Median: 8.00
  Min: 0.00
  Max: 909.00
  Std: 11.19
  ⚠️ Potential extreme outliers: 7234 values
     - Outside range [-21.10, 39.80]

NO2:
  Count: 418,604
  Mean: 25.82
  Median: 22.00
  Min: 0.00
  Max: 876.00
  Std: 18.53
  ⚠️ Potential extreme outliers: 5556 values
     - Outside range [-40.60, 86.80]

RSPM_PM10:
  Count: 395,521
  Mean: 108.83
  Median: 90.00
  Min: 0.00
  Max: 6307.03
  Std: 74.87
  ⚠️ Potential extreme outliers: 2304 values
     - Outside range [-202.00, 400.00]

PM_2_5:
  Count: 9,314
  Mean: 40.79
  Median: 32.00
  Min: 3.00
  Max: 504.00
  Std: 30.83
  ⚠️ Potential extreme outliers: 295 values
     - Outside range [-42.00, 112.00]


In [19]:
# Step 8: Date Range Analysis
print("\n" + "=" * 80)
print("DATE RANGE ANALYSIS")
print("=" * 80)
if df['_sampling_date'].notna().any():
    print(f"Earliest date: {df['_sampling_date'].min()}")
    print(f"Latest date: {df['_sampling_date'].max()}")
    print(f"Date range: {(df['_sampling_date'].max() - df['_sampling_date'].min()).days} days")
    
    # Extract year, month for temporal analysis
    df['year'] = df['_sampling_date'].dt.year
    df['month'] = df['_sampling_date'].dt.month
    df['day'] = df['_sampling_date'].dt.day
    
    print(f"\nYearly distribution:")
    year_counts = df['year'].value_counts().sort_index()
    for year, count in year_counts.items():
        if not pd.isna(year):
            print(f"  {int(year)}: {count:,} records")



DATE RANGE ANALYSIS
Earliest date: 1987-01-01 05:03:06
Latest date: 2015-12-31 12:24:25
Date range: 10591 days

Yearly distribution:
  1987: 428 records
  1988: 641 records
  2003: 2,556 records
  2004: 16,119 records
  2005: 19,336 records
  2006: 28,862 records
  2007: 34,376 records
  2008: 32,751 records
  2009: 28,749 records
  2010: 34,975 records
  2011: 37,641 records
  2012: 32,607 records
  2013: 45,803 records
  2014: 33,110 records
  2015: 39,038 records


In [21]:
# Step 9: Location Data Analysis
print("\n" + "=" * 80)
print("LOCATION DATA ANALYSIS")
print("=" * 80)
print(f"\nUnique States: {df['state'].nunique()}")
print(f"Unique Cities: {df['city_town_village_area'].nunique()}")
print(f"Unique Monitoring Stations: {df['location_of_monitoring_station'].nunique()}")
print(f"Unique Location Types: {df['type_of_location'].nunique()}")

print("\nTop 5 States by record count:")
state_counts = df['state'].value_counts().head()
for state, count in state_counts.items():
    print(f"  {state}: {count:,} records")


LOCATION DATA ANALYSIS

Unique States: 42
Unique Cities: 304
Unique Monitoring Stations: 989
Unique Location Types: 10

Top 5 States by record count:
  Maharashtra: 60,420 records
  Uttar Pradesh: 42,857 records
  Andhra Pradesh: 26,386 records
  Punjab: 25,649 records
  Rajasthan: 25,629 records


In [22]:
# Step 10: Create Cleaned Dataset
print("\n" + "=" * 80)
print("CREATING CLEANED DATASET")
print("=" * 80)


CREATING CLEANED DATASET


In [24]:
# Remove rows with missing critical fields
df_cleaned = df.copy()
critical_fields = ['_sampling_date', 'state', 'city_town_village_area']
for field in critical_fields:
    if field in df_cleaned.columns:
        missing_critical = df_cleaned[field].isna().sum()
        if missing_critical > 0:
            print(f"Removing {missing_critical:,} rows with missing {field}...")
            df_cleaned = df_cleaned[df_cleaned[field].notna()]

print(f"\nFinal cleaned dataset shape: {df_cleaned.shape}")
print(f"Rows removed: {len(df) - len(df_cleaned):,}")
print(f"Retention rate: {(len(df_cleaned) / len(df) * 100):.2f}%")

Removing 48,980 rows with missing _sampling_date...
Removing 1 rows with missing state...

Final cleaned dataset shape: (386991, 16)
Rows removed: 48,981
Retention rate: 88.77%


In [25]:
# Step 11: Save Cleaned Data
print("\n" + "=" * 80)
print("SAVING CLEANED DATA")
print("=" * 80)
output_file = 'air_quality_cleaned.csv'
df_cleaned.to_csv(output_file, index=False)
print(f"Cleaned data saved to: {output_file}")


SAVING CLEANED DATA
Cleaned data saved to: air_quality_cleaned.csv


In [26]:
# Create a summary report
print("\n" + "=" * 80)
print("CLEANING SUMMARY REPORT")
print("=" * 80)
print(f"""
Data Cleaning Summary:
----------------------
Original Records: {len(df):,}
Cleaned Records: {len(df_cleaned):,}
Records Removed: {len(df) - len(df_cleaned):,}

Data Quality Improvements:
- Converted 'NA' strings to proper NaN values
- Parsed and standardized date formats
- Converted pollutant columns to numeric type
- Removed negative pollutant values
- Removed duplicate records
- Removed rows with all missing pollutant data
- Cleaned text fields (trimmed whitespace)
- Added temporal columns (year, month, day)

Next Steps:
1. Review outliers in pollutant data
2. Consider imputation strategies for missing values
3. Validate location data consistency
4. Calculate AQI based on cleaned pollutant values
""")


CLEANING SUMMARY REPORT

Data Cleaning Summary:
----------------------
Original Records: 435,972
Cleaned Records: 386,991
Records Removed: 48,981

Data Quality Improvements:
- Converted 'NA' strings to proper NaN values
- Parsed and standardized date formats
- Converted pollutant columns to numeric type
- Removed negative pollutant values
- Removed duplicate records
- Removed rows with all missing pollutant data
- Cleaned text fields (trimmed whitespace)
- Added temporal columns (year, month, day)

Next Steps:
1. Review outliers in pollutant data
2. Consider imputation strategies for missing values
3. Validate location data consistency
4. Calculate AQI based on cleaned pollutant values

