In [2]:
# Step 1: Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Step 2: Load the dataset
# The CSV is read straight from the URL below, so this step needs network access.
url = 'https://github.com/dsrscientist/Dataset2/blob/main/temperature.csv?raw=true'
data = pd.read_csv(url)

# Step 3: Inspect the dataset
print("Dataset Information:")
# DataFrame.info() writes its report to stdout and returns None; wrapping it
# in print() would append a stray "None" line after the report.
data.info()

print("\nFirst few rows of the dataset:")
print(data.head())

# Step 4: Check for missing values
# Per-column count of null entries.
print("\nMissing values in the dataset:")
print(data.isnull().sum())

# Step 5: Handle missing values
# Rows without a ground-truth target cannot be used for supervised training or
# evaluation. Mean-filling them would invent labels, so drop them instead.
target_columns = ['Next_Tmax', 'Next_Tmin']
data = data.dropna(subset=target_columns).reset_index(drop=True)

# Fill the remaining gaps in numerical columns with each column's mean.
# Non-numeric columns (e.g. 'Date') are left untouched, as before.
# NOTE(review): these means are computed over all remaining rows, i.e. before
# the train/test split further down; move imputation after the split if a
# strictly leak-free evaluation is required.
numeric_means = data.select_dtypes(include=[np.number]).mean()
data = data.fillna(numeric_means)

# Step 6: Select Features and Targets
# Predictor columns, grouped by kind; the order matches the column order of X.
features = [
    # Observed temperatures for the current day
    'Present_Tmax', 'Present_Tmin',
    # LDAPS columns: humidity, lapse-rate temperatures, wind speed, latent heat
    'LDAPS_RHmin', 'LDAPS_RHmax',
    'LDAPS_Tmax_lapse', 'LDAPS_Tmin_lapse',
    'LDAPS_WS', 'LDAPS_LH',
    # LDAPS cloud-cover columns
    'LDAPS_CC1', 'LDAPS_CC2', 'LDAPS_CC3', 'LDAPS_CC4',
    # LDAPS precipitation columns
    'LDAPS_PPT1', 'LDAPS_PPT2', 'LDAPS_PPT3', 'LDAPS_PPT4',
    # Location / terrain columns and solar radiation
    'lat', 'lon', 'DEM', 'Slope', 'Solar radiation',
]

# Feature matrix: every row, only the columns listed above.
X = data.loc[:, features]

# Targets: next-day maximum (y_max) and next-day minimum (y_min) temperature.
y_max, y_min = data['Next_Tmax'], data['Next_Tmin']

# Step 7: Split the dataset into training and testing sets
# A single call splits the features and both targets together, so the
# Next_Tmax and Next_Tmin models are trained and evaluated on the same rows.
(X_train_raw, X_test_raw,
 y_train_max, y_test_max,
 y_train_min, y_test_min) = train_test_split(
    X, y_max, y_min, test_size=0.2, random_state=42
)

# Step 8: Data Preprocessing - Scaling the features
# The scaler is fitted on the training rows only. Fitting it on the full
# matrix before splitting would let test-set means/variances leak into the
# preprocessing applied to the training data.
scaler = StandardScaler()
X_train_max = scaler.fit_transform(X_train_raw)
X_test_max = scaler.transform(X_test_raw)

# Both targets share the same feature rows, so the Tmin names simply alias
# the arrays prepared above.
X_train_min, X_test_min = X_train_max, X_test_max

# Whole feature matrix under the train-fitted scaling, kept under its original
# name for any later code that refers to it.
X_scaled = scaler.transform(X)

def _fit_and_report(X_fit, y_fit, X_eval, y_eval, heading):
    """Train a random forest, predict on the evaluation rows and print the MSE.

    Parameters
    ----------
    X_fit, y_fit : training features and target.
    X_eval, y_eval : held-out features and target used for scoring.
    heading : line printed immediately before the MSE value.

    Returns
    -------
    (model, predictions, mse) : the fitted RandomForestRegressor, its
        predictions for X_eval, and the mean squared error against y_eval.
    """
    forest = RandomForestRegressor(random_state=42)
    forest.fit(X_fit, y_fit)
    predictions = forest.predict(X_eval)
    mse = mean_squared_error(y_eval, predictions)
    print(heading)
    print(mse)
    return forest, predictions, mse


# Steps 9-10: Train, predict and evaluate the Maximum Temperature model (Next_Tmax)
reg_max, y_pred_max, mse_max = _fit_and_report(
    X_train_max, y_train_max, X_test_max, y_test_max,
    "\nMean Squared Error for Next_Tmax (Maximum Temperature Prediction):",
)

# Steps 11-12: Train, predict and evaluate the Minimum Temperature model (Next_Tmin)
reg_min, y_pred_min, mse_min = _fit_and_report(
    X_train_min, y_train_min, X_test_min, y_test_min,
    "\nMean Squared Error for Next_Tmin (Minimum Temperature Prediction):",
)

# Step 13: Results Summary
# Reuses the scores computed above instead of recomputing them.
print("\nResults Summary:")
print(f"Mean Squared Error (Next_Tmax Prediction): {mse_max}")
print(f"Mean Squared Error (Next_Tmin Prediction): {mse_min}")


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7752 entries, 0 to 7751
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   station           7750 non-null   float64
 1   Date              7750 non-null   object 
 2   Present_Tmax      7682 non-null   float64
 3   Present_Tmin      7682 non-null   float64
 4   LDAPS_RHmin       7677 non-null   float64
 5   LDAPS_RHmax       7677 non-null   float64
 6   LDAPS_Tmax_lapse  7677 non-null   float64
 7   LDAPS_Tmin_lapse  7677 non-null   float64
 8   LDAPS_WS          7677 non-null   float64
 9   LDAPS_LH          7677 non-null   float64
 10  LDAPS_CC1         7677 non-null   float64
 11  LDAPS_CC2         7677 non-null   float64
 12  LDAPS_CC3         7677 non-null   float64
 13  LDAPS_CC4         7677 non-null   float64
 14  LDAPS_PPT1        7677 non-null   float64
 15  LDAPS_PPT2        7677 non-null   float64
 16  LDAPS_PPT3        767