In [None]:
# Colab-only step: open the file-upload dialog for this runtime.
# The next cell reads 'GlobalWeatherRepository.csv' by relative path, so the
# uploaded file is expected to carry exactly that name.
from google.colab import files
files.upload()

# Basic Assessment




In [None]:
import pandas as pd

# Read the uploaded weather dataset from the working directory.
csv_path = 'GlobalWeatherRepository.csv'
df = pd.read_csv(csv_path)

# Report (rows, columns), then show the first records as the cell output.
print("Dataset shape:", df.shape)
df.head()


 Data Cleaning & Preprocessing

In [None]:
# Count nulls per column and list only the affected columns, worst first.
missing = df.isna().sum()
missing.loc[missing.gt(0)].sort_values(ascending=False)


In [None]:
# Drop sparse columns: a column survives only if it has at least
# (1 - threshold) * len(df) non-null values, i.e. columns with more than
# roughly 30% missing entries are removed.
threshold = 0.3
min_non_null = int((1 - threshold) * len(df))

# .copy() makes df_cleaned an independent frame. Without it, the column
# assignments made on df_cleaned in later cells can raise pandas'
# SettingWithCopyWarning when dropna() actually removed columns.
df_cleaned = df.dropna(thresh=min_non_null, axis=1).copy()


In [None]:
# Parse the timestamp column; entries that cannot be parsed become NaT
# (errors='coerce') instead of raising.
parsed_timestamps = pd.to_datetime(df_cleaned['last_updated'], errors='coerce')
df_cleaned['last_updated'] = parsed_timestamps


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale temperature and PM2.5.
# The scaled values go into separate "*_scaled" columns instead of overwriting
# the originals: the cells below plot, forecast and report
# 'temperature_celsius' and 'air_quality_PM2.5' with physical-unit labels
# (°C, PM2.5), so those columns must keep their raw values.
scale_cols = ['temperature_celsius', 'air_quality_PM2.5']
scaled_cols = [f'{col}_scaled' for col in scale_cols]

scaler = MinMaxScaler()
df_cleaned[scaled_cols] = scaler.fit_transform(df_cleaned[scale_cols])


Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (30 bins) with a KDE overlay of the temperature column.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df_cleaned['temperature_celsius'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title('Global Temperature Distribution (°C)')
ax.set_xlabel('Temperature (°C)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Histogram (30 bins) with a KDE overlay of the PM2.5 column.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df_cleaned['air_quality_PM2.5'], bins=30, kde=True, color='salmon', ax=ax)
ax.set_title('Air Quality (PM2.5) Distribution')
ax.set_xlabel('PM2.5')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Mean temperature per country, keeping the ten highest averages.
country_means = df_cleaned.groupby('country')['temperature_celsius'].mean()
avg_temp_by_country = country_means.sort_values(ascending=False).head(10)

# Horizontal bar chart: one bar per country, hottest at the top.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=avg_temp_by_country.values,
    y=avg_temp_by_country.index,
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Top 10 Hottest Countries (Avg Temp °C)')
ax.set_xlabel('Average Temperature (°C)')
ax.set_ylabel('Country')
ax.grid(True)
plt.show()


In [None]:
# Pairwise correlations between temperature, PM2.5 and the coordinates.
corr_columns = ['temperature_celsius', 'air_quality_PM2.5', 'latitude', 'longitude']
corr_matrix = df_cleaned[corr_columns].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='YlGnBu', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()


In [None]:
# Ten locations with the most rows in the cleaned dataset.
location_counts = df_cleaned['location_name'].value_counts()
location_counts.iloc[:10]


In [None]:
# Chronological records for a single location. The variable is called
# df_country, but the filter is on location_name (a city).
is_andorra = df_cleaned['location_name'] == 'Andorra La Vella'
df_country = df_cleaned.loc[is_andorra].sort_values('last_updated')

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(
    df_country['last_updated'],
    df_country['temperature_celsius'],
    marker='o', linestyle='-', color='green',
)
ax.set_title('Temperature Over Time – Andorra La Vella')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature (°C)')
ax.grid(True)
plt.xticks(rotation=45)  # rotate date labels so they do not overlap
plt.tight_layout()
plt.show()


 Forecasting Model 1

In [None]:
# Tirana observations in chronological order, indexed by their timestamp.
tirana_df = (
    df_cleaned.loc[df_cleaned['location_name'] == 'Tirana']
    .copy()
    .sort_values('last_updated')
    .set_index('last_updated')
)

# Quick look at the series before modelling.
tirana_df[['temperature_celsius']].plot(figsize=(12, 5), title='Temperature in Tirana')


In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the temperature series; the first two
# entries of the result are the test statistic and the p-value.
result = adfuller(tirana_df['temperature_celsius'])
adf_stat, adf_pvalue = result[0], result[1]
print(f"ADF Statistic: {adf_stat}")
print(f"p-value: {adf_pvalue}")


In [None]:
# First-order difference of the temperature series, stored next to the original.
tirana_df['temp_diff'] = tirana_df['temperature_celsius'].diff()

# The first difference is undefined (NaN), so drop it before testing.
tirana_diff = tirana_df['temp_diff'].dropna()

# ADF test on the differenced series.
result_diff = adfuller(tirana_diff)
diff_stat, diff_pvalue = result_diff[0], result_diff[1]
print(f"Differenced ADF Statistic: {diff_stat}")
print(f"Differenced p-value: {diff_pvalue}")


In [None]:
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import pandas as pd

# Fit ARIMA(1, 1, 1) on the temperature series; d=1 lets the model apply the
# first-order differencing itself, so the undifferenced column is passed in.
model = ARIMA(tirana_df['temperature_celsius'], order=(1, 1, 1))
model_fit = model.fit()

# Forecast the next 30 steps.
forecast = model_fit.forecast(steps=30)
arima_forecast = forecast  # kept under a second name for the ensemble cell

# The first forecast step lies one day AFTER the last observation, so the
# future axis must start at last date + 1 day. Starting at the last observed
# date would draw every forecast point one day too early and make the first
# one overlap the final actual value.
# NOTE(review): daily spacing is assumed for the x-axis — confirm the series
# really has one observation per day.
forecast_dates = pd.date_range(
    start=tirana_df.index[-1] + pd.Timedelta(days=1), periods=30, freq='D'
)

# Plot original + forecast
plt.figure(figsize=(12, 5))
plt.plot(tirana_df.index, tirana_df['temperature_celsius'], label='Actual')
plt.plot(forecast_dates, forecast, color='red', label='Forecast')
plt.title("Temperature Forecast – Tirana (ARIMA)")
plt.xlabel("Date")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.show()


Forecasting Model 2


In [None]:
# Kabul observations in chronological order, indexed by their timestamp.
kabul_df = (
    df_cleaned.loc[df_cleaned['location_name'] == 'Kabul']
    .copy()
    .sort_values('last_updated')
    .set_index('last_updated')
)

# Visual check of the series before modelling.
kabul_df[['temperature_celsius']].plot(figsize=(12, 5), title='Temperature in Kabul')


In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Exponential smoothing with an additive trend and no seasonal component;
# initial states are estimated during fitting.
ets_spec = ExponentialSmoothing(
    kabul_df['temperature_celsius'],
    trend='add',
    seasonal=None,
    initialization_method='estimated',
)
ets_model = ets_spec.fit()

# 30-step-ahead forecast.
ets_forecast = ets_model.forecast(30)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Future axis: 30 consecutive days starting the day after the last observation.
first_future_day = kabul_df.index[-1] + pd.Timedelta(days=1)
future_dates = pd.date_range(start=first_future_day, periods=30)

# Observed series followed by the ETS forecast.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(kabul_df.index, kabul_df['temperature_celsius'], label='Actual')
ax.plot(future_dates, ets_forecast, label='Forecast (ETS)', color='green')
ax.set_title('Temperature Forecast – Kabul (ETS)')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature (°C)')
ax.legend()
ax.grid(True)
plt.show()


Model Evaluation (ARIMA vs ETS)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

def evaluate_forecast(y_true, y_pred, model_name="Model"):
    """Print the MAE and RMSE of a forecast, each prefixed with ``model_name``.

    Args:
        y_true: Observed values.
        y_pred: Predicted values paired with ``y_true``.
        model_name: Label placed in front of each printed metric.

    Returns:
        None. The two metrics are only printed, formatted to four decimals.
    """
    # RMSE is the square root of sklearn's mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    print(f"{model_name} MAE: {mae:.4f}")
    print(f"{model_name} RMSE: {rmse:.4f}")


In [None]:
# Hold-out evaluation for ARIMA on Tirana.
from statsmodels.tsa.arima.model import ARIMA

# The last 30 observations form the test set; everything before is training data.
tirana_temps = tirana_df['temperature_celsius']
train_tirana = tirana_temps.iloc[:-30]
test_tirana = tirana_temps.iloc[-30:]

# Refit the same ARIMA(1, 1, 1) specification on the training part only.
model_tirana = ARIMA(train_tirana, order=(1, 1, 1)).fit()

# Forecast exactly as many steps as the test set holds, then score them.
preds_tirana = model_tirana.forecast(30)
evaluate_forecast(test_tirana, preds_tirana, model_name="ARIMA (Tirana)")


In [None]:
# Hold-out evaluation for ETS on Kabul.
# The last 30 observations form the test set; everything before is training data.
kabul_temps = kabul_df['temperature_celsius']
train_kabul = kabul_temps.iloc[:-30]
test_kabul = kabul_temps.iloc[-30:]

# Refit the additive-trend, non-seasonal ETS model on the training part only.
ets_eval_spec = ExponentialSmoothing(
    train_kabul,
    trend='add',
    seasonal=None,
    initialization_method='estimated',
)
ets_model_kabul = ets_eval_spec.fit()

# Forecast exactly as many steps as the test set holds, then score them.
preds_kabul = ets_model_kabul.forecast(30)
evaluate_forecast(test_kabul, preds_kabul, model_name="ETS (Kabul)")


Forecasting Model 3

In [None]:
# Prophet Forecasting for Yerevan
from prophet import Prophet

# Step 0: Build the Yerevan frame in this cell. It is taken from the raw `df`
# and keeps only the two columns Prophet needs. Defining it here lets the cell
# run top-to-bottom on a fresh kernel instead of relying on a `yerevan_df`
# created by a cell further down the notebook.
yerevan_df = (
    df.loc[df['location_name'] == 'Yerevan', ['temperature_celsius', 'last_updated_epoch']]
    .dropna()
    .reset_index(drop=True)
)

# Step 1: Prepare the data — Prophet expects a 'ds' timestamp column and a
# 'y' value column.
prophet_df = pd.DataFrame({
    'ds': pd.to_datetime(yerevan_df['last_updated_epoch'], unit='s'),
    'y': yerevan_df['temperature_celsius'],
})

# Step 2: Fit the model with Prophet's default settings
prophet_model = Prophet()
prophet_model.fit(prophet_df)

# Step 3: Forecast — the future frame holds the history plus 30 extra periods,
# so the last 30 rows of the prediction are the out-of-sample forecast.
future = prophet_model.make_future_dataframe(periods=30)
forecast_prophet = prophet_model.predict(future)

# Step 4: Plot
prophet_model.plot(forecast_prophet)
plt.title("Prophet Forecast for Yerevan")
plt.show()

In [None]:
# Datetime column derived from the UNIX epoch seconds.
yerevan_dates = pd.to_datetime(yerevan_df['last_updated_epoch'], unit='s')
yerevan_df['date'] = yerevan_dates


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Hold-out evaluation for Prophet, set up like the ARIMA and ETS evaluations:
# fit on everything except the last 30 observations, then score on those 30.
# Each prediction is made at the timestamp of the observation it is compared
# with. Taking the first 30 rows of the full forecast instead would pair
# fitted values for the earliest dates with the latest observations.
HOLDOUT = 30

# Set datetime as index. Done without inplace=True and derived from the epoch
# column, so re-running the cell does not fail on a missing 'date' column.
yerevan_df = (
    yerevan_df
    .assign(date=pd.to_datetime(yerevan_df['last_updated_epoch'], unit='s'))
    .set_index('date')
    .sort_index()  # a chronological split needs chronological order
)

# Chronological split: training history vs. the last 30 actual values
yerevan_temps = yerevan_df['temperature_celsius']
train_yerevan = yerevan_temps.iloc[:-HOLDOUT]
actual_yerevan = yerevan_temps.iloc[-HOLDOUT:]

# Fit a fresh Prophet model on the training part only
eval_model = Prophet()
eval_model.fit(pd.DataFrame({'ds': train_yerevan.index, 'y': train_yerevan.to_numpy()}))

# Predict at exactly the held-out timestamps and carry their index over
eval_forecast = eval_model.predict(pd.DataFrame({'ds': actual_yerevan.index}))
predicted_yerevan = pd.Series(
    eval_forecast['yhat'].to_numpy(), index=actual_yerevan.index, name='yhat'
)

# Compute metrics
mae_yerevan = mean_absolute_error(actual_yerevan, predicted_yerevan)
rmse_yerevan = np.sqrt(mean_squared_error(actual_yerevan, predicted_yerevan))

print(f"Prophet (Yerevan) MAE: {mae_yerevan:.4f}")
print(f"Prophet (Yerevan) RMSE: {rmse_yerevan:.4f}")


In [None]:
# Ensemble Forecast: Prophet + ETS + ARIMA
# NOTE(review): the three members are forecasts for three different cities,
# and the ETS/ARIMA inputs come from df_cleaned while the Prophet input comes
# from a separate Yerevan frame — confirm all three series are in the same
# units before interpreting the average.

# Step 1: Bring every forecast to 30 rows with a fresh 0..29 index so the
# columns line up by position.
prophet_forecast_30 = forecast_prophet[['ds', 'yhat']].tail(30).reset_index(drop=True)
ets_forecast_30 = pd.Series(ets_forecast[:30]).reset_index(drop=True)
arima_forecast_30 = pd.Series(arima_forecast[:30]).reset_index(drop=True)

# Step 2: Combine them; the dates on the x-axis are Prophet's future dates.
member_columns = ['Prophet_Yerevan', 'ETS_Kabul', 'ARIMA_Tirana']
ensemble_df = pd.DataFrame({
    'date': prophet_forecast_30['ds'],
    'Prophet_Yerevan': prophet_forecast_30['yhat'],
    'ETS_Kabul': ets_forecast_30,
    'ARIMA_Tirana': arima_forecast_30,
})

# Step 3: Row-wise mean of the three member forecasts.
ensemble_df['Ensemble_Average'] = ensemble_df[member_columns].mean(axis=1)

# Step 4: Dashed lines for the members, a thick black line for the average.
member_labels = ['Prophet (Yerevan)', 'ETS (Kabul)', 'ARIMA (Tirana)']
fig, ax = plt.subplots(figsize=(12, 6))
for column, label in zip(member_columns, member_labels):
    ax.plot(ensemble_df['date'], ensemble_df[column], label=label, linestyle='--')
ax.plot(
    ensemble_df['date'], ensemble_df['Ensemble_Average'],
    label='Ensemble Average', linewidth=2, color='black',
)
ax.set_title('30-Day Forecast Ensemble (Yerevan, Kabul, Tirana)')
ax.set_xlabel('Date')
ax.set_ylabel('Forecasted Temperature (°C)')
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()


1. ARIMA Model – Tirana
Model Used: ARIMA (AutoRegressive Integrated Moving Average)

Preprocessing: Required stationarity; ADF test p-value ≈ 0.703 → differencing made it stationary (p-value ≈ 2.8e-19)

Forecast: 30-day temperature forecast

Evaluation:

MAE: 0.0487

RMSE: 0.0555

Insights: Captured short-term variations well, but less robust to seasonal or non-linear patterns.

2. ETS Model – Kabul
Model Used: ETS (Exponential Smoothing – Holt-Winters)

Preprocessing: No need for differencing or stationarity

Forecast: 30-day forecast on raw data

Evaluation:

MAE: 0.0474

RMSE: 0.0503

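Insights: Slightly lower error than ARIMA, although the two models were evaluated on different cities (Kabul vs. Tirana), so the comparison is only indicative. ETS modeled the trend and smoother fluctuations effectively.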

3. Prophet Model – Yerevan
Model Used: Prophet (Facebook’s additive time series model)

Preprocessing: Converted epoch timestamps to datetime

Forecast: 30-day forecast using trend + seasonality modeling

Evaluation:

MAE: 6.0674

RMSE: 6.7243

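Insights: Prophet appears to struggle in this case, but its errors are not directly comparable with the ARIMA and ETS figures above. Those were computed on the min-max-scaled (0–1) temperature column of `df_cleaned`, whereas the Yerevan series was taken from the unscaled `df` and is therefore in °C. In addition, the reported Prophet error was obtained by comparing the first 30 fitted values with the last 30 observations, so predictions and actual values did not refer to the same dates. Noise, outliers, or lack of seasonality in Yerevan’s data may also contribute; the model may benefit from more tuning or external regressors.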

# Advanced Assessment

Advanced Analysis

In [None]:
# Build the Yerevan time series for anomaly detection from the raw `df`:
# keep only the temperature and the epoch timestamp, drop incomplete rows,
# and renumber the remaining rows from 0.
yerevan_df = (
    df.loc[df['location_name'] == 'Yerevan']
    .copy()[['temperature_celsius', 'last_updated_epoch']]
    .dropna()
    .reset_index(drop=True)
)

# Convert the UNIX epoch seconds to datetimes and use them as the index so
# the rolling statistics and plots below run along a time axis.
yerevan_df = yerevan_df.assign(
    date=pd.to_datetime(yerevan_df['last_updated_epoch'], unit='s')
).set_index('date')

# Preview of the prepared data.
yerevan_df.head()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Rolling window length. The window counts rows, so it spans 7 days only if
# there is exactly one observation per day.
rolling_window = 7

# Rolling mean and standard deviation of the temperature.
yerevan_temps = yerevan_df['temperature_celsius']
temp_rolling = yerevan_temps.rolling(window=rolling_window)
yerevan_df['rolling_mean'] = temp_rolling.mean()
yerevan_df['rolling_std'] = temp_rolling.std()

# A point is an anomaly when it lies more than 2 rolling standard deviations
# above or below the rolling mean. Rows where the rolling statistics are still
# NaN compare as False and are therefore never flagged.
upper_band = yerevan_df['rolling_mean'] + 2 * yerevan_df['rolling_std']
lower_band = yerevan_df['rolling_mean'] - 2 * yerevan_df['rolling_std']
yerevan_df['anomaly'] = np.where(
    (yerevan_temps > upper_band) | (yerevan_temps < lower_band),
    True,
    False
)

# Temperature, its rolling mean, and the flagged points on one figure.
anomalies = yerevan_df[yerevan_df['anomaly']]
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(yerevan_df.index, yerevan_df['temperature_celsius'], label='Temperature', color='blue')
ax.plot(yerevan_df.index, yerevan_df['rolling_mean'], label='7-Day Rolling Mean', color='orange')
ax.scatter(anomalies.index, anomalies['temperature_celsius'],
           color='red', label='Anomalies', s=50)
ax.set_title('Temperature Anomalies in Yerevan')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature (°C)')
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Weather and air-quality columns used for the environmental-impact analysis;
# rows missing any of them are dropped.
env_columns = [
    'temperature_celsius', 'humidity',
    'air_quality_PM2.5', 'air_quality_Ozone', 'air_quality_Carbon_Monoxide',
]
env_impact_df = df_cleaned[env_columns].dropna()

# Pairwise correlations between the weather and air-quality variables.
correlation_matrix = env_impact_df.corr()

# Annotated heatmap, values shown with two decimals.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation between Weather and Air Quality Metrics")
plt.show()


In [None]:
# One row of three scatter plots for a side-by-side comparison.
# Each entry: (x column, y column, title, x label, y label, extra scatter kwargs).
# The first panel passes no color and so uses matplotlib's default.
scatter_panels = [
    ('temperature_celsius', 'air_quality_PM2.5',
     'Temperature vs PM2.5', 'Temperature (°C)', 'PM2.5', {}),
    ('temperature_celsius', 'air_quality_Ozone',
     'Temperature vs Ozone', 'Temperature (°C)', 'Ozone', {'color': 'orange'}),
    ('humidity', 'air_quality_Carbon_Monoxide',
     'Humidity vs CO', 'Humidity (%)', 'Carbon Monoxide', {'color': 'green'}),
]

fig, axs = plt.subplots(1, 3, figsize=(18, 5))
for panel_ax, (x_col, y_col, panel_title, x_label, y_label, extra) in zip(axs, scatter_panels):
    panel_ax.scatter(env_impact_df[x_col], env_impact_df[y_col], alpha=0.5, **extra)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)

plt.suptitle("Weather vs Air Quality: Scatter Plot Visualizations", fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sns

# Step 1: PM2.5 is the target; every other column of env_impact_df is a feature.
pm25_col = 'air_quality_PM2.5'
X = env_impact_df.drop(columns=[pm25_col])
y = env_impact_df[pm25_col]

# Step 2: 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Fit a 100-tree random forest with a fixed seed.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 4: R² on the held-out 20%.
y_pred = rf_model.predict(X_test)
print("Random Forest R² Score:", r2_score(y_test, y_pred))

# Step 5: Feature importances, most important first.
importances = rf_model.feature_importances_
features = X.columns
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Step 6: Bar chart of the (up to) ten most important features.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(10), ax=ax)
ax.set_title("Top 10 Important Features Affecting PM2.5 Levels")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.tight_layout()
plt.show()


In [None]:
import plotly.express as px

# Columns needed for the map, taken from the raw frame; rows without
# coordinates or a temperature are dropped.
map_columns = ['location_name', 'country', 'latitude', 'longitude',
               'temperature_celsius', 'air_quality_PM2.5']
map_df = df[map_columns].dropna(subset=['latitude', 'longitude', 'temperature_celsius'])

# One marker per row of map_df, coloured by temperature. No aggregation
# happens in this cell, so a city appears once per observation.
# NOTE(review): size_max is passed without a `size` column — confirm it is intended.
geo_options = dict(
    lat='latitude',
    lon='longitude',
    text='location_name',
    hover_name='country',
    color='temperature_celsius',
    color_continuous_scale='RdYlBu_r',
    title=' Global City Temperatures',
    size_max=15,
    template='plotly_dark',
    projection='natural earth',
)
fig = px.scatter_geo(map_df, **geo_options)

# White land masses and a centred title.
fig.update_layout(geo=dict(showland=True, landcolor="white"), title_x=0.5)
fig.show()


In [None]:
# Per-country means of temperature, humidity and PM2.5, computed on the raw
# frame; 'country' is turned back into a regular column.
stat_columns = ['temperature_celsius', 'humidity', 'air_quality_PM2.5']
country_stats = df.groupby('country')[stat_columns].mean().reset_index()

# Preview of the summary table.
country_stats.head()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One point per country: average temperature against average PM2.5.
# Points are coloured by country, with the legend suppressed.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=country_stats,
    x='temperature_celsius',
    y='air_quality_PM2.5',
    hue='country',
    palette='tab20',
    legend=False,
    ax=ax,
)
ax.set_title('Average Temperature vs PM2.5 by Country')
ax.set_xlabel('Avg Temperature (°C)')
ax.set_ylabel('Avg PM2.5 (Air Pollution)')
ax.grid(True)
plt.show()


In [None]:
# Ten countries with the highest average PM2.5, highest first.
pm25_ranking = country_stats.sort_values(by='air_quality_PM2.5', ascending=False)
top_polluted = pm25_ranking.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_polluted, x='air_quality_PM2.5', y='country', palette='Reds_r', ax=ax)
ax.set_title('Top 10 Countries with Highest PM2.5 Levels')
ax.set_xlabel('Avg PM2.5')
ax.set_ylabel('Country')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Keep only the numeric columns of the raw frame and drop incomplete rows.
numeric_df = df.select_dtypes(include='number').dropna()

# Temperature is the target; every other numeric column is a feature.
# NOTE(review): if the dataset also holds unit-converted or closely derived
# versions of the target (e.g. a Fahrenheit column), they would leak it into
# the features — confirm against the column list.
temp_col = 'temperature_celsius'
y = numeric_df[temp_col]
X = numeric_df.drop(columns=[temp_col])

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Fit a 100-tree random forest on the training split (fixed seed).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Pair each feature name with its importance.
importances = rf.feature_importances_
features = X.columns

# Rank the features and show the ten most important as the cell output.
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df.head(10)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the (up to) ten highest-ranked features in importance_df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x='Importance',
    y='Feature',
    data=importance_df.head(10),
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 10 Important Features for Predicting Temperature')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
plt.tight_layout()
plt.show()
