In [None]:
# Show the column labels of the DataFrame.
df.columns

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Per-column dtype and non-null summary.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Rows flagged by duplicated(), i.e. full-row repeats of a row seen earlier.
df[df.duplicated()]

In [None]:
# Summary statistics produced by describe().
df.describe()

In [None]:
# Display the location_name column.
df['location_name']

In [None]:
# Distinct values found in the location_name column.
locations=df['location_name'].unique()
# locations.shape  (uncomment to see how many distinct locations there are)

In [None]:
# Only numeric columns are screened for outliers.
numerical_cols = df.select_dtypes(include=[np.number]).columns

# Per-column quartiles and interquartile range.
Q1 = df[numerical_cols].quantile(0.25)
Q3 = df[numerical_cols].quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is flagged when ANY numeric column falls outside its fence.
# NaN compares False on both sides, so a missing value never flags a row.
is_outlier_row = (
    (df[numerical_cols] < lower_bound) | (df[numerical_cols] > upper_bound)
).any(axis=1)

# .copy() makes the filtered frame independent of the original, so the column
# assignments performed by later cells do not raise SettingWithCopyWarning.
df = df[~is_outlier_row].copy()


In [None]:
# Number of rows recorded for every (country, location_name) pair.
pair_sizes = df.groupby(['country', 'location_name']).size()
country_location_count = pair_sizes.reset_index(name='count')

# Count the distinct countries that appear among those pairs.
unique_countries = country_location_count['country'].nunique()

summary_line = f"Total number of unique countries along with their location names present in the dataset are: {unique_countries}"
print(summary_line)

In [None]:
# Find the row label of the maximum temperature, then pull the columns that identify it.
hottest_row_label = df['temperature_celsius'].idxmax()
identifying_columns = ['country', 'location_name', 'temperature_celsius']
highest_temp_celsius = df.loc[hottest_row_label, identifying_columns]

print("Country and Location with the Highest Temperature in Celsius:")
print(highest_temp_celsius)


In [None]:
# Find the row label of the minimum temperature, then pull the columns that identify it.
coldest_row_label = df['temperature_celsius'].idxmin()
identifying_columns = ['country', 'location_name', 'temperature_celsius']
lowest_temp_celsius = df.loc[coldest_row_label, identifying_columns]

print("\nCountry and Location with the Lowest Temperature in Celsius:")
print(lowest_temp_celsius)



In [None]:
# Distinct values found in the condition_text column.
df['condition_text'].unique()

In [None]:
# Min-max scale every numeric column into [0, 1].
# NOTE: the raw values in `df` are overwritten, so every later cell that reads
# these columns sees scaled numbers rather than physical units.
df = df.copy()  # work on an owned frame so the assignment below never targets a slice

col_min = df[numerical_cols].min()
col_range = df[numerical_cols].max() - col_min

# A constant column has zero range; dividing by it would turn the whole column
# into NaN (0/0). Dividing by 1 instead maps such a column to 0.
col_range = col_range.replace(0, 1)

df[numerical_cols] = (df[numerical_cols] - col_min) / col_range

In [None]:

# Set style
sns.set_style("whitegrid")

# Parse the timestamps before plotting. If `last_updated` still holds strings,
# seaborn treats the x-axis as categorical instead of as a time axis.
plot_df = df.assign(last_updated=pd.to_datetime(df['last_updated']))

# NOTE(review): numeric columns are min-max scaled by an earlier cell, so the
# plotted values lie in [0, 1] even though the labels name physical units.
# Confirm this is intended.


def plot_time_trend(data, column, color, label, title):
    """Draw one line plot of `column` against `last_updated`.

    Args:
        data: DataFrame holding `last_updated` and `column`.
        column: Name of the column plotted on the y-axis.
        color: Line colour passed to seaborn.
        label: Text used for both the legend entry and the y-axis label.
        title: Figure title.
    """
    plt.figure(figsize=(12, 5))
    sns.lineplot(data=data, x='last_updated', y=column, color=color, label=label)
    plt.xlabel("Date")
    plt.ylabel(label)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.legend()
    plt.show()


# Line plot for Temperature over Time
plot_time_trend(plot_df, 'temperature_celsius', 'red',
                "Temperature (°C)", "Temperature Trends Over Time")

# Line plot for Precipitation over Time
plot_time_trend(plot_df, 'precip_mm', 'blue',
                "Precipitation (mm)", "Precipitation Trends Over Time")

# Scatter plot to explore relationship between temperature and precipitation
plt.figure(figsize=(8, 5))
sns.scatterplot(data=plot_df, x='temperature_celsius', y='precip_mm', alpha=0.5)
plt.xlabel("Temperature (°C)")
plt.ylabel("Precipitation (mm)")
plt.title("Temperature vs. Precipitation")
plt.show()

In [None]:
# Ensure datetime dtype and chronological order; the time-based rolling window
# below requires a monotonic datetime index.
df = df.assign(last_updated=pd.to_datetime(df['last_updated'])).sort_values(by='last_updated')

# Trailing 7-DAY window keyed on the timestamp. An integer window
# (rolling(window=7)) averages the previous 7 ROWS, which only equals 7 days
# when there is exactly one row per day.
# Offset windows use min_periods=1, so the first rows are not NaN.
by_time = df.set_index('last_updated')
# to_numpy() assigns by position; `by_time` has the same row order as `df`.
df['temp_c_MA7'] = by_time['temperature_celsius'].rolling('7D').mean().to_numpy()
df['precip_mm_MA7'] = by_time['precip_mm'].rolling('7D').mean().to_numpy()

plt.figure(figsize=(12,5))
sns.lineplot(x=df['last_updated'], y=df['temp_c_MA7'], label='Temperature (7-day MA)', color='red')
sns.lineplot(x=df['last_updated'], y=df['precip_mm_MA7'], label='Precipitation (7-day MA)', color='blue')
plt.xlabel("Date")
plt.ylabel("Value")
plt.title("Temperature & Precipitation Trends (7-day Moving Average)")
plt.xticks(rotation=45)
plt.legend()
plt.show()


In [None]:
# !pip install statsmodels
from statsmodels.tsa.seasonal import seasonal_decompose

# seasonal_decompose interprets `period` as a number of observations.
# Aggregate to one value per day first so that period=30 really means 30 days
# rather than 30 consecutive rows.
daily_temperature = (
    pd.Series(df['temperature_celsius'].to_numpy(),
              index=pd.to_datetime(df['last_updated']))
    .sort_index()
    .resample('D')
    .mean()
    .interpolate()  # fill calendar days that have no reading
)

# The decomposition needs at least two full cycles; shrink the period if the
# daily series is shorter than 60 days.
period = min(30, len(daily_temperature) // 2)

result = seasonal_decompose(daily_temperature, model='additive', period=period)
result.plot()
plt.show()


In [None]:
from scipy import stats

# Standard score of each temperature reading relative to the whole column.
temperature_z = stats.zscore(df['temperature_celsius'])
df['temp_zscore'] = temperature_z

# Readings whose |Z| exceeds 3 are treated as anomalies.
beyond_three_sigma = df['temp_zscore'].abs() > 3
anomalies = df[beyond_three_sigma]

# Draw every reading first, then overlay the anomalous subset in red.
plt.figure(figsize=(12,5))
sns.scatterplot(
    x=df['last_updated'],
    y=df['temperature_celsius'],
    label="Normal",
    alpha=0.6,
)
sns.scatterplot(
    x=anomalies['last_updated'],
    y=anomalies['temperature_celsius'],
    color='red',
    label="Anomalies",
)
plt.xlabel("Date")
plt.ylabel("Temperature (°C)")
plt.title("Temperature Anomalies Over Time")
plt.legend()
plt.show()


In [None]:
from sklearn.cluster import KMeans

# Select key features for clustering; rows with a missing feature cannot be clustered.
X = df[['temperature_celsius', 'humidity', 'wind_kph', 'precip_mm']].dropna()

# Perform clustering.
# n_init is set explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Attach labels by row label rather than by position. If dropna() removed rows,
# a positional assignment would fail with a length mismatch; here those rows
# simply get NaN.
df['weather_cluster'] = pd.Series(cluster_labels, index=X.index)

# Visualize clusters on the temperature/precipitation plane.
plt.figure(figsize=(8,6))
sns.scatterplot(data=df, x='temperature_celsius', y='precip_mm', hue='weather_cluster', palette='coolwarm')
plt.xlabel("Temperature (°C)")
plt.ylabel("Precipitation (mm)")
# The title now names the plotted axes; the previous text mentioned humidity, which is not shown.
plt.title("Weather Clusters: Temperature vs. Precipitation")
plt.show()


In [None]:
# Keep only the numeric columns; corr() is computed on these.
numeric_df = df.select_dtypes(include='number')

# Pairwise correlation between every pair of numeric columns.
correlation_matrix = numeric_df.corr()

# Annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap of Weather Variables")
plt.show()


In [None]:

# Load dataset (this replaces any `df` left over from earlier cells)
df = pd.read_csv("GlobalWeatherRepository.csv")

# Data preprocessing
# Convert 'last_updated' to datetime and set as index
df['last_updated'] = pd.to_datetime(df['last_updated'])
df = df.set_index('last_updated').sort_index()

# Handle missing values in 'temperature_celsius'.
# Series.ffill() replaces the deprecated fillna(method='ffill').
df['temperature_celsius'] = df['temperature_celsius'].ffill()

# Resample to daily data (adjust frequency as needed)
daily_df = df[['temperature_celsius']].resample('D').mean().reset_index()

# Days without any observation come out of resample() as NaN. Fill them so the
# test split contains no NaN targets, which the evaluation metrics cannot handle.
daily_df['temperature_celsius'] = daily_df['temperature_celsius'].ffill()
daily_df = daily_df.replace([np.inf, -np.inf], np.nan).dropna()

# Prepare data for Prophet
# Prophet requires columns 'ds' (datetime) and 'y' (target variable)
prophet_df = daily_df.rename(columns={'last_updated': 'ds', 'temperature_celsius': 'y'})

# Split data into train and test
split_date = prophet_df['ds'].iloc[-30]  # Last 30 days for testing
train = prophet_df[prophet_df['ds'] < split_date]
test = prophet_df[prophet_df['ds'] >= split_date]

# Initialize and fit Prophet model
model = Prophet(
    yearly_seasonality=True,  # Enable yearly seasonality
    weekly_seasonality=True,  # Enable weekly seasonality
    daily_seasonality=False   # Disable daily seasonality (since we're using daily data)
)
model.fit(train)

# Create future dataframe for predictions:
# the training dates plus len(test) further days
future = model.make_future_dataframe(periods=len(test))

# Make predictions
forecast = model.predict(future)

# Extract predictions for the test period
forecast_df = forecast[['ds', 'yhat']].rename(columns={'ds': 'date', 'yhat': 'prediction'})
forecast_df = forecast_df.set_index('date')
forecast_df = forecast_df[forecast_df.index >= split_date]
# Evaluate performance
def evaluate_forecast(y_true, y_pred):
    """Compute regression error metrics for a forecast.

    Args:
        y_true: Actual values (array-like or pandas Series).
        y_pred: Predicted values, in the same order and of the same length as `y_true`.

    Returns:
        dict with keys 'MAE', 'MSE', 'RMSE', 'MAPE', 'R-squared' and 'Accuracy'
        (defined as 100 - MAPE), each rounded to 2 decimals. 'MAPE' and
        'Accuracy' are NaN when every actual value is 0.
    """
    # Compare by position. Subtracting two pandas Series aligns them on index
    # labels, so inputs with different indexes would produce an all-NaN MAPE.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # Percentage error is undefined where the actual value is 0; skip those points
    # instead of letting a division by zero turn the whole metric into inf/NaN.
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = np.nan

    r2 = r2_score(y_true, y_pred)  # R-squared

    return {
        'MAE': round(mae, 2),
        'MSE': round(mse, 2),
        'RMSE': round(rmse, 2),
        'MAPE': round(mape, 2),
        'R-squared': round(r2, 2),
        'Accuracy': round(100 - mape, 2)  # Accuracy based on MAPE
    }

# Pass positional arrays: test['y'] keeps its integer row index while
# forecast_df is indexed by date, so Series arithmetic between them would align
# on labels that never match.
evaluation = evaluate_forecast(test['y'].values, forecast_df['prediction'].values)
print("\nEvaluation Metrics:")
for metric, value in evaluation.items():
    print(f"{metric}: {value}")

# Visualization
plt.figure(figsize=(12, 6))
plt.plot(train['ds'], train['y'], label='Training Data')
plt.plot(test['ds'], test['y'], label='Actual Values')
plt.plot(forecast_df.index, forecast_df['prediction'], label='Predictions')
plt.title('Temperature Forecast Evaluation (Prophet)')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.show()

# Plot Prophet's components (trend, seasonality, etc.)
model.plot_components(forecast)
plt.show()

In [None]:

# Load dataset (this replaces any `df` left over from earlier cells)
df = pd.read_csv("GlobalWeatherRepository.csv")

# Data preprocessing
df['last_updated'] = pd.to_datetime(df['last_updated'])
df = df.set_index('last_updated').sort_index()

# Handle missing values.
# Series.ffill() replaces the deprecated fillna(method='ffill').
df['temperature_celsius'] = df['temperature_celsius'].ffill()

# Resample to daily data
daily_df = df[['temperature_celsius']].resample('D').mean().reset_index()

# --- Critical Fix: Ensure no NaN/Inf in target after resampling ---
daily_df['temperature_celsius'] = daily_df['temperature_celsius'].ffill()
daily_df = daily_df.replace([np.inf, -np.inf], np.nan).dropna()

# Feature engineering for XGBoost: calendar features derived from the date
daily_df['day_of_week'] = daily_df['last_updated'].dt.dayofweek
daily_df['month'] = daily_df['last_updated'].dt.month
daily_df['year'] = daily_df['last_updated'].dt.year

# Split data: the last 30 daily rows form the test set
split_date = daily_df['last_updated'].iloc[-30]
train = daily_df[daily_df['last_updated'] < split_date]
test = daily_df[daily_df['last_updated'] >= split_date]

# --------------------------
# Model 1: Prophet
# --------------------------
prophet_df = train.rename(columns={'last_updated': 'ds', 'temperature_celsius': 'y'})
prophet_test = test.rename(columns={'last_updated': 'ds', 'temperature_celsius': 'y'})

model_prophet = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
model_prophet.fit(prophet_df)

future = model_prophet.make_future_dataframe(periods=len(test))
forecast_prophet = model_prophet.predict(future)
prophet_preds = forecast_prophet[forecast_prophet['ds'].isin(test['last_updated'])]['yhat'].values

# The ensemble below averages the two prediction arrays element-wise, so every
# test date must have exactly one Prophet prediction.
assert len(prophet_preds) == len(test), "Prophet predictions do not cover every test date!"

# --------------------------
# Model 2: XGBRegressor (with data validation)
# --------------------------
X_train = train[['day_of_week', 'month', 'year']]
y_train = train['temperature_celsius'].astype(np.float32)  # Ensure float32 dtype
X_test = test[['day_of_week', 'month', 'year']]
y_test = test['temperature_celsius'].astype(np.float32)

# Check for NaN/inf in features and labels
assert not X_train.isnull().values.any(), "NaN in X_train!"
assert not X_test.isnull().values.any(), "NaN in X_test!"
assert not y_train.isnull().any(), "NaN in y_train!"
assert not y_test.isnull().any(), "NaN in y_test!"
assert not np.isinf(y_train).any(), "Inf in y_train!"
assert not np.isinf(y_test).any(), "Inf in y_test!"

# Train XGBoost
model_xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    random_state=42
)
model_xgb.fit(X_train, y_train)
xgb_preds = model_xgb.predict(X_test)

# --------------------------
# Ensemble Predictions
# --------------------------
# Unweighted mean of the two models' predictions
ensemble_preds = (prophet_preds + xgb_preds) / 2

# Evaluate
def evaluate_forecast(y_true, y_pred):
    """Compute regression error metrics for a forecast.

    Args:
        y_true: Actual values (array-like or pandas Series).
        y_pred: Predicted values, in the same order and of the same length as `y_true`.

    Returns:
        dict with keys 'MAE', 'MSE', 'RMSE', 'MAPE', 'R-squared' and 'Accuracy'
        (defined as 100 - MAPE), each rounded to 2 decimals. 'MAPE' and
        'Accuracy' are NaN when every actual value is 0.
    """
    # Compare by position. Subtracting two pandas Series aligns them on index
    # labels, so inputs with different indexes would produce an all-NaN MAPE.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # Percentage error is undefined where the actual value is 0; skip those points
    # instead of letting a division by zero turn the whole metric into inf/NaN.
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = np.nan

    r2 = r2_score(y_true, y_pred)

    return {
        'MAE': round(mae, 2),
        'MSE': round(mse, 2),
        'RMSE': round(rmse, 2),
        'MAPE': round(mape, 2),
        'R-squared': round(r2, 2),
        'Accuracy': round(100 - mape, 2)
    }

# Score each set of predictions against the held-out targets.
predictions_by_model = {
    "Prophet": prophet_preds,
    "XGBoost": xgb_preds,
    "Ensemble": ensemble_preds,
}
for model_name, model_preds in predictions_by_model.items():
    print(f"{model_name} Metrics:", evaluate_forecast(y_test, model_preds))

# Visualization: actual values plus one line per model over the test dates.
test_dates = test['last_updated']
line_styles = {
    "Prophet": {"linestyle": '--'},
    "XGBoost": {"linestyle": '--'},
    "Ensemble": {"linewidth": 2},
}

plt.figure(figsize=(14, 7))
plt.plot(test_dates, y_test, label='Actual', color='black')
for model_name, model_preds in predictions_by_model.items():
    plt.plot(test_dates, model_preds, label=model_name, **line_styles[model_name])
plt.title('Ensemble Forecast Comparison')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.inspection import permutation_importance
from xgboost import XGBRegressor
import geopandas as gpd

# Read the CSV and replace `last_updated` with parsed timestamps in one chain.
df = pd.read_csv("GlobalWeatherRepository.csv").assign(
    last_updated=lambda frame: pd.to_datetime(frame['last_updated'])
)

# 1. Climate Analysis: Long-term patterns
def climate_analysis(df):
    """Show yearly climate aggregates as two Plotly figures.

    The first figure is a line chart of the yearly mean temperature over all
    rows. The second is a box plot, per country, of that country's yearly mean
    temperatures.

    Args:
        df: DataFrame with a datetime `last_updated` column and the columns
            `country`, `temperature_celsius`, `precip_mm`, `humidity`, `cloud`.

    Returns:
        None. Both figures are displayed with `fig.show()`.
    """
    # How each variable is collapsed within a calendar year.
    yearly_rules = {
        'temperature_celsius': 'mean',
        'precip_mm': 'sum',
        'humidity': 'mean',
        'cloud': 'mean'
    }
    indexed_by_time = df.set_index('last_updated')
    climate_df = indexed_by_time.resample('Y').agg(yearly_rules).reset_index()

    # Overall temperature trend, one point per year.
    axis_labels = {'temperature_celsius': 'Temperature (°C)', 'last_updated': 'Year'}
    trend_fig = px.line(
        climate_df,
        x='last_updated',
        y='temperature_celsius',
        title='Long-term Temperature Trends',
        labels=axis_labels,
    )
    trend_fig.show()

    # Same yearly bucketing, but kept separate for each country.
    country_year_keys = ['country', pd.Grouper(key='last_updated', freq='Y')]
    regional_rules = {
        'temperature_celsius': 'mean',
        'precip_mm': 'sum'
    }
    regional_df = df.groupby(country_year_keys).agg(regional_rules).reset_index()

    box_fig = px.box(
        regional_df,
        x='country',
        y='temperature_celsius',
        title='Temperature Distribution by Country',
    )
    box_fig.update_layout(xaxis_tickangle=-45)
    box_fig.show()

# 2. Environmental Impact Analysis
def environmental_impact(df):
    """Show how weather variables relate to air-quality measurements.

    Displays a correlation heatmap over four weather columns and four
    air-quality columns, then a scatter plot of PM2.5 against wind speed
    coloured by humidity.

    Args:
        df: DataFrame containing `temperature_celsius`, `humidity`, `wind_kph`,
            `pressure_mb`, `country`, `last_updated` and the four
            `air_quality_*` columns named below.

    Returns:
        None. Both figures are displayed with `fig.show()`.
    """
    weather_cols = ['temperature_celsius', 'humidity', 'wind_kph', 'pressure_mb']
    air_quality_cols = [
        'air_quality_PM2.5',
        'air_quality_PM10',
        'air_quality_Nitrogen_dioxide',
        'air_quality_Ozone',
    ]

    # Pairwise correlations across weather and air-quality columns together.
    corr_matrix = df[weather_cols + air_quality_cols].corr()
    feature_names = corr_matrix.columns

    heatmap_fig = px.imshow(
        corr_matrix,
        labels=dict(x="Features", y="Features", color="Correlation"),
        x=feature_names,
        y=feature_names,
        title='Weather Parameters vs Air Quality Correlations',
    )
    heatmap_fig.show()

    # PM2.5 against wind speed, with humidity encoded as colour.
    scatter_labels = {'wind_kph': 'Wind Speed (kph)', 'air_quality_PM2.5': 'PM2.5'}
    scatter_fig = px.scatter(
        df,
        x='wind_kph',
        y='air_quality_PM2.5',
        color='humidity',
        title='PM2.5 Concentration vs Wind Speed and Humidity',
        labels=scatter_labels,
        hover_data=['country', 'last_updated'],
    )
    scatter_fig.show()

# 3. Feature Importance Analysis
def feature_importance(df):
    """Rank predictors of temperature with XGBoost and permutation importance.

    Fits an XGBRegressor that predicts `temperature_celsius` from seven
    location/weather columns, then shows a bar chart of the model's built-in
    feature importances and horizontal box plots of permutation importances.
    Permutation importance is computed on the same rows the model was fitted on.

    Args:
        df: DataFrame containing `latitude`, `longitude`, `humidity`,
            `wind_kph`, `pressure_mb`, `cloud`, `uv_index` and
            `temperature_celsius`.

    Returns:
        None. Both figures are displayed with `fig.show()`.
    """
    features = ['latitude', 'longitude', 'humidity', 'wind_kph',
               'pressure_mb', 'cloud', 'uv_index']
    target = 'temperature_celsius'

    # Rows without a target value cannot be used as training labels.
    labelled = df.dropna(subset=[target])
    X = labelled[features]
    y = labelled[target]

    # Fixed seed, matching the random_state used for XGBRegressor elsewhere in this notebook.
    model = XGBRegressor(random_state=42)
    model.fit(X, y)

    # XGBoost feature importance
    importance_df = pd.DataFrame({
        'Feature': features,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)

    fig = px.bar(importance_df, x='Importance', y='Feature',
                title='XGBoost Feature Importance',
                orientation='h')
    fig.show()

    # Permutation importance. The shuffles are random, so seed them to make
    # the figure reproducible between runs.
    result = permutation_importance(model, X, y, n_repeats=10, random_state=42)
    sorted_idx = result.importances_mean.argsort()

    fig = go.Figure()
    for i in sorted_idx:
        # Passing the scores as x draws horizontal boxes: features run along
        # the y-axis and scores along the x-axis, matching the axis titles below.
        fig.add_trace(go.Box(
            x=result.importances[i],
            name=features[i],
            boxpoints=False
        ))
    fig.update_layout(title="Permutation Importance",
                     yaxis_title="Features",
                     xaxis_title="Importance Score")
    fig.show()

# 4. Spatial Analysis with Plotly
def spatial_analysis(df):
    """Show two world maps: temperature per point and a PM2.5 density layer.

    Args:
        df: DataFrame containing `latitude`, `longitude`,
            `temperature_celsius`, `country` and `air_quality_PM2.5`.

    Returns:
        None. Both figures are displayed with `fig.show()`.
    """
    # One marker per row, coloured by temperature.
    temperature_map = px.scatter_geo(
        df,
        lat='latitude',
        lon='longitude',
        color='temperature_celsius',
        hover_name='country',
        scope='world',
        color_continuous_scale='Viridis',
        title='Global Temperature Distribution',
    )
    temperature_map.update_geos(showcountries=True, showcoastlines=True)
    temperature_map.show()

    # Density layer weighted by PM2.5, centred on (0, 0) at the widest zoom.
    map_center = dict(lat=0, lon=0)
    pm25_map = px.density_mapbox(
        df,
        lat='latitude',
        lon='longitude',
        z='air_quality_PM2.5',
        radius=10,
        center=map_center,
        zoom=0,
        mapbox_style="open-street-map",
        title='Global PM2.5 Distribution',
        color_continuous_scale='Hot',
    )
    pm25_map.show()

# 5. Geographical Patterns
def geographical_patterns(df):
    """Show a choropleth of the mean temperature of each country.

    Args:
        df: DataFrame containing `country`, `temperature_celsius`, `humidity`,
            `wind_kph` and `air_quality_PM2.5`.

    Returns:
        None. The figure is displayed with `fig.show()`.
    """
    # Per-country means; only the temperature column is used by the map below.
    averaged_columns = ['temperature_celsius', 'humidity', 'wind_kph', 'air_quality_PM2.5']
    mean_rules = {column: 'mean' for column in averaged_columns}
    country_means = df.groupby('country').agg(mean_rules).reset_index()

    # Countries are matched to map regions by their names.
    choropleth_fig = px.choropleth(
        country_means,
        locations="country",
        locationmode='country names',
        color="temperature_celsius",
        hover_name="country",
        color_continuous_scale=px.colors.sequential.Plasma,
        title="Average Temperature by Country",
    )
    choropleth_fig.show()

# Execute analyses
# Yearly climate plots; the function receives its own copy of the frame.
climate_analysis(df.copy())





In [None]:
# Weather vs. air-quality correlation heatmap and PM2.5 scatter, run on a copy of the frame.
environmental_impact(df.copy())

In [None]:
# XGBoost and permutation feature-importance plots, run on a copy of the frame.
feature_importance(df.copy())

In [None]:
# Temperature and PM2.5 world maps, run on a copy of the frame.
spatial_analysis(df.copy())

In [None]:
# Per-country mean-temperature choropleth, run on a copy of the frame.
geographical_patterns(df.copy())

In [None]:
# Shell escape: long listing of the working directory with human-readable sizes.
!ls -lh

In [1]:
!pip list

Package                   Version
------------------------- --------------
anyio                     4.8.0
appnope                   0.1.4
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 3.0.0
async-lru                 2.0.4
attrs                     25.1.0
babel                     2.17.0
beautifulsoup4            4.13.3
bleach                    6.2.0
certifi                   2025.1.31
cffi                      1.17.1
charset-normalizer        3.4.1
cmdstanpy                 1.2.5
comm                      0.2.2
contourpy                 1.3.1
cycler                    0.12.1
debugpy                   1.8.12
decorator                 5.2.1
defusedxml                0.7.1
executing                 2.2.0
fastjsonschema            2.21.1
filelock                  3.17.0
fonttools                 4.56.0
fqdn                      1.5.1
fsspec                    2025.2.0
geopandas                 1.0.1
h11        