In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score
)

# Location of the weather-sensor CSV export (point this at your own copy of the dataset)
DATASET_PATH = '/content/weather-automated-sensors-dataset.csv'

# Read the whole file into a DataFrame
data = pd.read_csv(DATASET_PATH)

# Preview the first few rows as a quick sanity check of columns and values
preview = data.head()
print(preview)

# Column the model learns to predict (adjust if predicting something else)
TARGET_COLUMN = 'Air Temperature'

# Keep only numeric columns (drops timestamps, station names, and other non-numeric data)
numeric_columns = data.select_dtypes(include=[np.number]).columns
if TARGET_COLUMN not in numeric_columns:
    raise KeyError(f"Target column {TARGET_COLUMN!r} is missing from the data or is not numeric")

# Discard rows whose target is missing instead of imputing it: filling the target
# with its median would train and score the model on fabricated labels.
data_numeric = data[numeric_columns].dropna(subset=[TARGET_COLUMN])

# Define features (X) and target (y)
X = data_numeric.drop(columns=TARGET_COLUMN)
y = data_numeric[TARGET_COLUMN]

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill missing feature values with medians learned from the training split only,
# so no statistics from the test rows leak into training or evaluation.
feature_medians = X_train.median()
X_train = X_train.fillna(feature_medians)
X_test = X_test.fillna(feature_medians)

# Create a Decision Tree Regressor (fixed seed for reproducible results)
dt_model = DecisionTreeRegressor(random_state=42)

# Train the model on the training data
dt_model.fit(X_train, y_train)

# Make predictions on the held-out test data
y_pred = dt_model.predict(X_test)

# Score the predictions against the held-out targets.
# NOTE: scikit-learn reports MAPE as a fraction (0.05 means 5%), and it grows very
# large when true values are close to zero.
explained_variance = explained_variance_score(y_test, y_pred)  # Explained Variance Score
mape = mean_absolute_percentage_error(y_test, y_pred)  # Mean Absolute Percentage Error
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
rmse = np.sqrt(mse)  # Root Mean Squared Error (same units as the target)
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
r2 = r2_score(y_test, y_pred)  # R-squared

# Express R-squared on a 0-100 scale for readability
r2_percentage = r2 * 100

# Assemble the report first, then emit it one metric per line
report_lines = [
    f"Model R-squared (in percentage): {r2_percentage:.2f}%",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"Mean Absolute Percentage Error (MAPE): {mape:.4f}",
    f"Explained Variance Score: {explained_variance:.4f}",
]
print(*report_lines, sep="\n")

# Optional: rank the input features by the importances reported by the fitted tree
features = X.columns
importances = dt_model.feature_importances_

# Build the table and sort it (most important feature first) in one expression
feature_importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(feature_importance_df)


                 Station Name   Measurement Timestamp  Air Temperature  \
0  Oak Street Weather Station  05/22/2015 03:00:00 PM              NaN   
1  Oak Street Weather Station  05/22/2015 05:00:00 PM              NaN   
2  Oak Street Weather Station  05/22/2015 06:00:00 PM              NaN   
3  Oak Street Weather Station  05/22/2015 07:00:00 PM              NaN   
4  Oak Street Weather Station  05/22/2015 08:00:00 PM              NaN   

   Wet Bulb Temperature  Humidity  Rain Intensity  Interval Rain  Total Rain  \
0                   7.0      55.0             0.0            0.0         1.4   
1                   6.3      56.0             0.0            0.0         1.4   
2                   6.5      54.0             0.0            0.0         1.4   
3                   6.3      53.0             0.0            0.0         1.4   
4                   6.4      52.0             0.0            0.0         1.4   

   Precipitation Type  Wind Direction  Wind Speed  Maximum Wind Speed  \
0