In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor

Load the processed CSV file

In [2]:
# Location of the preprocessed dataset, relative to the notebook's working directory.
input_file = "./data/processed_data.csv"

In [3]:
# Read the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=input_file)

Remove rows with NaN values

In [4]:
# Drop every row that contains at least one NaN. Rebinding the name (instead of
# inplace=True) keeps the step chainable and avoids mutating the loaded frame.
df = df.dropna()

Define the threshold (in standard deviations from the mean) for removing extreme Y values

In [5]:
# Multiplier applied to the standard deviation of Y when building the
# acceptable range. Adjust this value based on your dataset and requirements.
threshold = 3.5

Calculate the mean and standard deviation of Y

In [6]:
# Summary statistics of the target column, used below to build the outlier bounds.
temperature = df['Temperature']
mean_Y = np.mean(temperature)
std_Y = np.std(temperature)

Define the range of acceptable Y values

In [7]:
# Symmetric band around the mean: mean_Y +/- threshold standard deviations.
half_width = threshold * std_Y
lower_bound, upper_bound = mean_Y - half_width, mean_Y + half_width

Filter out rows with Y values outside the acceptable range

In [8]:
# Keep only rows whose Temperature lies inside [lower_bound, upper_bound];
# Series.between includes both endpoints by default.
within_range = df['Temperature'].between(lower_bound, upper_bound)
df = df[within_range]

Extract the feature columns (X) and the target column (Y)

In [9]:
# Model inputs, in the column order the model will be trained on.
feature_columns = [
    'Month',
    'Date',
    'Time',
    'Previous Day Average',
    'Two Days Before Average',
    'Three Days Before Average',
    'Last 7 Days Average',
]

# Plain arrays: X holds one row per sample, Y holds the matching Temperature values.
X = df[feature_columns].values
Y = df['Temperature'].values

Normalize X values (min-max scaling of each feature to the range [0, 1])

In [10]:
# Rescale every feature column with a min-max scaler learned from X itself.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows contribute to
# the min/max used for scaling.
scaler = MinMaxScaler()
X = scaler.fit(X).transform(X)

Split the data into training and testing sets

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

Define the model

In [12]:
# Gradient-boosted regression trees with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so that re-running
# the notebook trains the same model and reproduces the reported metrics.
model = GradientBoostingRegressor(random_state=42)

Train the model

In [13]:
# Fit the regressor on the training portion only.
model.fit(X=X_train, y=Y_train)

Make predictions

In [14]:
# Predicted Temperature for every held-out row, in the same order as Y_test.
predictions = model.predict(X=X_test)

Evaluate the model

In [15]:
# Mean squared error between the held-out targets and the model's predictions.
mse = mean_squared_error(y_true=Y_test, y_pred=predictions)

In [16]:
# Report the mean squared error computed from Y_test and the predictions.
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 1.2769310083587393


Print some example predictions

In [17]:
# Show up to ten predicted/actual pairs. Slicing plus zip keeps the cell from
# raising IndexError when the test set holds fewer than ten rows.
for predicted, actual in zip(predictions[:10], Y_test[:10]):
    print("Predicted:", predicted)
    print("Actual:", actual)
    print()

Predicted: 21.060182913238005
Actual: 21.6

Predicted: 21.577951229334786
Actual: 21.4

Predicted: 10.747709740608677
Actual: 8.7

Predicted: 21.300332146883534
Actual: 21.4

Predicted: 3.829762135320026
Actual: 3.3

Predicted: 11.6414864786417
Actual: 11.4

Predicted: 23.397289816359315
Actual: 23.8

Predicted: 6.498313810397577
Actual: 8.2

Predicted: 19.5006263559769
Actual: 19.6

Predicted: 22.40237924315549
Actual: 22.6



Save the trained model

In [18]:
import joblib

# The model was trained on features transformed by `scaler`, so the fitted
# scaler is persisted next to it; without it, new raw inputs cannot be put on
# the scale the model expects.
joblib.dump(scaler, "./scaler.pkl")

# Kept as the last expression so the cell still displays ['./model.pkl'].
joblib.dump(model, "./model.pkl")

['./model.pkl']