In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Steps 1-3: Load the temperature records, make Tmax/Tmin numeric, discard
# rows where either value is unusable, and derive Tmean.
# Update the path below if the CSV file lives somewhere else.
data = (
    pd.read_csv('data/Tmax_Tmin_Data__Central_Region.csv')
    .assign(
        # errors='coerce' turns non-numeric entries into NaN instead of raising
        Tmax=lambda frame: pd.to_numeric(frame['Tmax'], errors='coerce'),
        Tmin=lambda frame: pd.to_numeric(frame['Tmin'], errors='coerce'),
    )
    # Remove rows whose Tmax or Tmin is missing or became NaN during coercion
    .dropna(subset=['Tmax', 'Tmin'])
    # Tmean is the midpoint of Tmax and Tmin
    .assign(Tmean=lambda frame: (frame['Tmax'] + frame['Tmin']) / 2)
)

# Step 4: Build the feature matrix and the target.
# .assign() returns a new frame, so `data` itself is not modified when the
# province names are replaced by the encoder's integer codes.
label_encoder = LabelEncoder()
X = data[['ProvinceName', 'Month']].assign(
    ProvinceName=lambda features: label_encoder.fit_transform(features['ProvinceName'])
)
y = data['Tmean']

# Step 5: Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Step 6: Fit a 100-tree random forest on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Step 7: Score the held-out split with mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae}")

# Persist the trained model and the fitted LabelEncoder with pickle. The
# encoder is saved alongside the model because it holds the mapping from
# province names to the integer codes the model was trained on.
# Create the output directory first: open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError after training finished.
Path('models').mkdir(parents=True, exist_ok=True)

with open('models/tmean_model.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('models/label_encoder_tmean.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)


Mean Absolute Error: 0.8428370653334936
