In [2]:
import pandas as pd
import numpy as np

Identify Missing Values using Pandas

In [3]:
# Load the weather dataset, then count the null entries in each column.
df = pd.read_csv("GlobalWeatherRepository.csv")
df.isna().sum()

country                         0
location_name                   0
latitude                        0
longitude                       0
timezone                        0
last_updated_epoch              0
last_updated                    0
temperature_celsius             0
temperature_fahrenheit          0
condition_text                  0
wind_mph                        0
wind_kph                        0
wind_degree                     0
wind_direction                  0
pressure_mb                     0
pressure_in                     0
precip_mm                       0
precip_in                       0
humidity                        0
cloud                           0
feels_like_celsius              0
feels_like_fahrenheit           0
visibility_km                   0
visibility_miles                0
uv_index                        0
gust_mph                        0
gust_kph                        0
air_quality_Carbon_Monoxide     0
air_quality_Ozone               0
air_quality_Ni

Percentage of missing values

In [None]:
# Null count per column divided by the row count, scaled to a percentage.
df.isna().sum().div(len(df)).mul(100)

country                         0.0
location_name                   0.0
latitude                        0.0
longitude                       0.0
timezone                        0.0
last_updated_epoch              0.0
last_updated                    0.0
temperature_celsius             0.0
temperature_fahrenheit          0.0
condition_text                  0.0
wind_mph                        0.0
wind_kph                        0.0
wind_degree                     0.0
wind_direction                  0.0
pressure_mb                     0.0
pressure_in                     0.0
precip_mm                       0.0
precip_in                       0.0
humidity                        0.0
cloud                           0.0
feels_like_celsius              0.0
feels_like_fahrenheit           0.0
visibility_km                   0.0
visibility_miles                0.0
uv_index                        0.0
gust_mph                        0.0
gust_kph                        0.0
air_quality_Carbon_Monoxide 

In [None]:
# Keep only the rows in which at least one column is null.
df.loc[df.isna().any(axis="columns")]

Anomaly Detection (IQR rule on numeric columns)

In [17]:

def detect_numeric_anomalies(df, multiplier=1.5):
    """Flag outlier rows in every numeric column using the IQR fence rule.

    For each numeric column the quartiles Q1 and Q3 are computed, and a row
    is flagged when its value lies strictly below ``Q1 - multiplier * IQR``
    or strictly above ``Q3 + multiplier * IQR``, where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. It is only read, never modified.
    multiplier : float, default 1.5
        Width of the fences in units of IQR.

    Returns
    -------
    dict
        Maps each numeric column name to the sub-frame of ``df`` (all
        columns kept) holding the rows flagged for that column. Columns
        without outliers map to an empty frame.

    Notes
    -----
    When a column's IQR is 0 both fences collapse onto the quartile value,
    so every value different from it is flagged.
    """
    # 'number' matches every numeric dtype (int32, float32, nullable Int64, ...)
    # rather than only int64/float64, so such columns are no longer skipped.
    numeric_cols = df.select_dtypes(include='number').columns
    anomalies = {}

    for col in numeric_cols:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - multiplier * iqr
        upper = q3 + multiplier * iqr

        # Missing values compare as False on both sides and are never flagged.
        outside = (values < lower) | (values > upper)
        anomalies[col] = df[outside]

    return anomalies

# Run the IQR-based detector over the loaded frame.
anomaly_results = detect_numeric_anomalies(df)

# For every scanned column: its name, how many rows were flagged,
# a preview of the first flagged rows, and a separator line.
for col, rows in anomaly_results.items():
    print(
        f"Column: {col}",
        f"Anomaly count: {len(rows)}",
        rows.head(),
        "-" * 50,
        sep="\n",
    )


Column: latitude
Anomaly count: 0
Empty DataFrame
Columns: [country, location_name, latitude, longitude, timezone, last_updated_epoch, last_updated, temperature_celsius, temperature_fahrenheit, condition_text, wind_mph, wind_kph, wind_degree, wind_direction, pressure_mb, pressure_in, precip_mm, precip_in, humidity, cloud, feels_like_celsius, feels_like_fahrenheit, visibility_km, visibility_miles, uv_index, gust_mph, gust_kph, air_quality_Carbon_Monoxide, air_quality_Ozone, air_quality_Nitrogen_dioxide, air_quality_Sulphur_dioxide, air_quality_PM2.5, air_quality_PM10, air_quality_us-epa-index, air_quality_gb-defra-index, sunrise, sunset, moonrise, moonset, moon_phase, moon_illumination]
Index: []

[0 rows x 41 columns]
--------------------------------------------------
Column: longitude
Anomaly count: 8229
              country location_name  latitude  longitude          timezone  \
8           Australia      Canberra    -35.28     149.22  Australia/Sydney   
58       Fiji Islands      

DATA COVERAGE ANALYSIS

Total Rows & Columns

In [18]:
# df.shape is (rows, columns); report each dimension on its own line.
print("Total Number of Rows", df.shape[0])
print("Total Number of Columns", df.shape[1])

Total Number of Rows 106988
Total Number of Columns 41


Region Coverage (Country / City / Location)

In [22]:
# Number of distinct country names present in the dataset.
print("Unique Countries", df["country"].nunique())

Unique Countries 211


List of countries

In [23]:
# Distinct country names, in order of first appearance in the frame.
print(df["country"].unique())

['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso'
 'Burundi' 'Madagascar' 'Cape Verde' 'Cambodia' 'Cameroon' 'Canada'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Comoros' 'Congo'
 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic'
 'Democratic Republic of Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Swaziland' 'Ethiopia' 'Fiji Islands' 'Finland'
 'France' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'Vatican City'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazak

Records per location/city

In [29]:
# Number of records per location name, most frequent first.
print(df["location_name"].value_counts())

location_name
Kabul                    550
Tirana                   550
Andorra La Vella         550
Luanda                   550
Manama                   550
                        ... 
Kiyabo                     1
Morocco City               1
Moldova                    1
Mexico (Grupo Mexico)      1
#NAME?                     1
Name: count, Length: 254, dtype: int64


Time Coverage

In [4]:
# Parse the 'last_updated' strings into datetimes. The result goes through a
# temporary Series so that values which fail to parse (and are coerced to NaT)
# can be counted and reported instead of vanishing silently: min(), max() and
# value_counts() all skip NaT.
# NOTE(review): the recorded outputs of the following cells show a single
# timestamp for every row, which looks suspicious for a multi-record-per-city
# dataset -- verify the dayfirst setting against the raw CSV format and
# against 'last_updated_epoch'.
_parsed_last_updated = pd.to_datetime(df['last_updated'], dayfirst=True, errors='coerce')
_n_unparsed = int(_parsed_last_updated.isna().sum() - df['last_updated'].isna().sum())
if _n_unparsed:
    print(f"Warning: {_n_unparsed} 'last_updated' values could not be parsed and were set to NaT")
df['last_updated'] = _parsed_last_updated

In [43]:
# Smallest and largest values found in the 'last_updated' column.
print("Earliest Day", df["last_updated"].min())
print("Latest Day", df["last_updated"].max())

Earliest Day 2024-05-16 01:45:00
Latest Day 2024-05-16 01:45:00


In [40]:
# Count rows per distinct 'last_updated' value, ordered by that value.
date_counts = df["last_updated"].value_counts(sort=False).sort_index()
print(date_counts)


last_updated
2024-05-16 01:45:00    106988
Name: count, dtype: int64


Geographic Coverage

In [41]:
# Display every column label of the frame.
df.columns

Index(['country', 'location_name', 'latitude', 'longitude', 'timezone',
       'last_updated_epoch', 'last_updated', 'temperature_celsius',
       'temperature_fahrenheit', 'condition_text', 'wind_mph', 'wind_kph',
       'wind_degree', 'wind_direction', 'pressure_mb', 'pressure_in',
       'precip_mm', 'precip_in', 'humidity', 'cloud', 'feels_like_celsius',
       'feels_like_fahrenheit', 'visibility_km', 'visibility_miles',
       'uv_index', 'gust_mph', 'gust_kph', 'air_quality_Carbon_Monoxide',
       'air_quality_Ozone', 'air_quality_Nitrogen_dioxide',
       'air_quality_Sulphur_dioxide', 'air_quality_PM2.5', 'air_quality_PM10',
       'air_quality_us-epa-index', 'air_quality_gb-defra-index', 'sunrise',
       'sunset', 'moonrise', 'moonset', 'moon_phase', 'moon_illumination'],
      dtype='object')

In [45]:
# Null counts restricted to the two coordinate columns.
print(df[["longitude", "latitude"]].isna().sum())

longitude    0
latitude     0
dtype: int64


Range of Latitude & Longitude

In [46]:
# Minimum and maximum latitude present in the data.
print(
    "Range Of Latitude",
    df["latitude"].min(),
    "To",
    df["latitude"].max(),
)

Range Of Latitude -41.3 To 64.15


In [47]:
# Minimum and maximum longitude present in the data.
print(
    "Range of Longitude",
    df["longitude"].min(),
    "To",
    df["longitude"].max(),
)

Range of Longitude -175.2 To 179.22


In [4]:
# Per-column null counts for the frame in its current state.
df.isna().sum()

country                         0
location_name                   0
latitude                        0
longitude                       0
timezone                        0
last_updated_epoch              0
last_updated                    0
temperature_celsius             0
temperature_fahrenheit          0
condition_text                  0
wind_mph                        0
wind_kph                        0
wind_degree                     0
wind_direction                  0
pressure_mb                     0
pressure_in                     0
precip_mm                       0
precip_in                       0
humidity                        0
cloud                           0
feels_like_celsius              0
feels_like_fahrenheit           0
visibility_km                   0
visibility_miles                0
uv_index                        0
gust_mph                        0
gust_kph                        0
air_quality_Carbon_Monoxide     0
air_quality_Ozone               0
air_quality_Ni