In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tempfile
import os

In [2]:
# load the data
def load_data(path : str) -> pd.DataFrame:
    current_dir = os.path.join(os.getcwd())
    parent_dir = os.path.dirname(current_dir)
    data_path = os.path.join(parent_dir, path)

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data file not found at {data_path}")
    
    df = pd.read_csv(data_path)
    return df

    

In [3]:
# data loading
df = load_data('data/raw/weather_data.csv')

In [5]:
# weather description count
weather_code_map = {
    0: "Clear sky",
    1: "Mainly clear",
    2: "Partly cloudy",
    3: "Overcast",
    45: "Fog",
    48: "Depositing rime fog",
    51: "Light drizzle",
    53: "Moderate drizzle",
    55: "Dense drizzle",
    56: "Light freezing drizzle",
    57: "Dense freezing drizzle",
    61: "Slight rain",
    63: "Moderate rain",
    65: "Heavy rain",
    66: "Light freezing rain",
    67: "Heavy freezing rain",
    71: "Slight snowfall",
    73: "Moderate snowfall",
    75: "Heavy snowfall",
    77: "Snow grains",
    80: "Slight rain showers",
    81: "Moderate rain showers",
    82: "Violent rain showers",
    85: "Slight snow showers",
    86: "Heavy snow showers",
    95: "Slight or moderate thunderstorm",
    96: "Thunderstorm with slight hail",
    99: "Thunderstorm with heavy hail"
}

In [6]:
# select relevant features for temprature classification
features = [
    "time",
    'temperature_2m',
    "relative_humidity_2m",
    "dew_point_2m",
    "apparent_temperature",
    "precipitation",
    "pressure_msl",
    "cloudcover",
    "cloudcover_low",
    "cloudcover_mid",
    "cloudcover_high",
    "windspeed_10m",
    "windgusts_10m",
    "winddirection_10m",
    "sunshine_duration",
    "shortwave_radiation",
    "diffuse_radiation",
    "direct_radiation",
    "terrestrial_radiation",
]

target = df['weather_code'].map(weather_code_map).values

main_df = df[features]

main_df.insert(1, 'weather_code_map', target)

In [7]:
# precipitation drop
main_df.drop('precipitation', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df.drop('precipitation', axis=1, inplace=True)


In [8]:
main_df['weather_code_binary'] = main_df['weather_code_map'].map({
    'Clear sky': 0,
    'Overcast': 1,
    'Slight rain': 2,
    'Moderate rain': 2,
    'Heavy rain': 2

})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df['weather_code_binary'] = main_df['weather_code_map'].map({


In [None]:
df_binary = main_df[main_df['weather_code_map'].isin(['Clear sky', 'Overcast', 'Slight rain', 'Moderate rain', 'Heavy rain'])].copy()
df_binary['weather_code_binary'] = df_binary['weather_code_map'].map({
    'Clear sky': 0,
    'Overcast': 1,
    'Slight rain': 2,
    'Moderate rain': 2,
    'Heavy rain': 2

})



In [10]:
# rem terrestrial_radiation, sunshine_duration, shortwave_radiation
df_binary.drop(columns=['terrestrial_radiation', 'sunshine_duration', 'shortwave_radiation'], inplace=True)

In [11]:
# remove the 'weather_code_map' column
df_binary.drop(columns=['weather_code_map'], inplace=True)

In [12]:
# convert time to datetime
df_binary['time'] = pd.to_datetime(df_binary['time'])

In [13]:
# extract date, time, month, year, hour, minute
df_binary['weekday'] = df_binary['time'].dt.weekday
df_binary['times'] = df_binary['time'].dt.time
df_binary['month'] = df_binary['time'].dt.month
df_binary['year'] = df_binary['time'].dt.year
df_binary['hour'] = df_binary['time'].dt.hour
df_binary['minute'] = df_binary['time'].dt.minute

In [14]:
df_binary.head()

Unnamed: 0,time,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,pressure_msl,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,...,winddirection_10m,diffuse_radiation,direct_radiation,weather_code_binary,weekday,times,month,year,hour,minute
0,2025-01-01 00:00:00+00:00,13.4,99.0,13.2,13.3,1014.8,0.0,0.0,0.0,0.0,...,301.0,0.0,0.0,0,2,00:00:00,1,2025,0,0
1,2025-01-01 01:00:00+00:00,13.3,99.0,13.1,13.4,1016.0,93.0,93.0,0.0,0.0,...,304.0,1.0,0.0,1,2,01:00:00,1,2025,1,0
2,2025-01-01 02:00:00+00:00,14.4,96.0,13.8,14.3,1016.9,100.0,100.0,0.0,0.0,...,313.0,43.0,10.0,1,2,02:00:00,1,2025,2,0
3,2025-01-01 03:00:00+00:00,15.9,88.0,13.9,15.5,1017.8,90.0,90.0,0.0,0.0,...,319.0,124.0,24.0,1,2,03:00:00,1,2025,3,0
5,2025-01-01 05:00:00+00:00,19.5,75.0,15.0,20.1,1017.2,4.0,4.0,0.0,0.0,...,324.0,164.0,304.0,0,2,05:00:00,1,2025,5,0


In [15]:
df_binary.drop('time', axis=1, inplace=True)

In [16]:
df_binary.drop('times', axis=1, inplace=True)

In [17]:
X = df_binary.drop('weather_code_binary', axis=1)
y = df_binary['weather_code_binary']

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  StandardScaler

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=3, stratify=y)

In [20]:
scale = StandardScaler()
X_train_scale = scale.fit_transform(X_train)
X_test_scale = scale.transform(X_test)

In [32]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models import infer_signature
import os
import mlflow

In [33]:
mlflow.set_tracking_uri("http://34.227.105.107:5000/")
mlflow.set_experiment("Experiment with different models")


# Define algorithms
algorithms = {
    'LogisticRegression': LogisticRegression(),
    'XGBoost': XGBClassifier(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier()
}

In [36]:
# Start the parent run
with mlflow.start_run(run_name="All Experiments") as parent_run:
    # Loop through algorithms and feature extraction methods (Child Runs)
    for algo_name, algorithm in algorithms.items():
        with mlflow.start_run(run_name=f"{algo_name}", nested=True) as child_run:
            
            X_train, X_test, y_train, y_test = X_train_scale, X_test_scale, y_train, y_test
            

            # Log preprocessing parameters
            mlflow.log_param("algorithm", algo_name)
            mlflow.log_param("test_size", 0.2)
            
            # Model training
            model = algorithm
            model.fit(X_train, y_train)
        
            # Log model parameters
            if algo_name == 'LogisticRegression':
                mlflow.log_param("C", model.C)
            elif algo_name == 'MultinomialNB':
                mlflow.log_param("alpha", model.alpha)
            elif algo_name == 'XGBoost':
                mlflow.log_param("n_estimators", model.n_estimators)
                mlflow.log_param("learning_rate", model.learning_rate)
            elif algo_name == 'RandomForest':
                mlflow.log_param("n_estimators", model.n_estimators)
                mlflow.log_param("max_depth", model.max_depth)
            elif algo_name == 'GradientBoosting':
                mlflow.log_param("n_estimators", model.n_estimators)
                mlflow.log_param("learning_rate", model.learning_rate)
                mlflow.log_param("max_depth", model.max_depth)
            
            # Model evaluation
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')
            
            # Log evaluation metrics
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("f1_score", f1)
        
            # Save and log the notebook
            mlflow.log_artifact('experiments3.ipynb')

            # Save the model
            # mlflow.sklearn.save_model(model, "model")

            with tempfile.TemporaryDirectory() as tmp_dir:
                model_path = os.path.join(tmp_dir, "model")
                mlflow.sklearn.save_model(model, model_path)
                mlflow.log_artifacts(model_path, "model")
            
            # Print the results for verification
            print(f"Algorithm: {algo_name}")
            print(f"Accuracy: {accuracy}")
            print(f"Precision: {precision}")
            print(f"Recall: {recall}")
            print(f"F1 Score: {f1}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Algorithm: LogisticRegression
Accuracy: 0.9542963535445795
Precision: 0.8965139491051085
Recall: 0.8504851077527255
F1 Score: 0.868762168720644
🏃 View run LogisticRegression at: http://34.227.105.107:5000/#/experiments/995874154406033238/runs/b09ac7555e174a10b2a41842e99896eb
🧪 View experiment at: http://34.227.105.107:5000/#/experiments/995874154406033238
Algorithm: XGBoost
Accuracy: 0.9735122602402867
Precision: 0.9381062072692016
Recall: 0.9210629718983268
F1 Score: 0.9290666125959365
🏃 View run XGBoost at: http://34.227.105.107:5000/#/experiments/995874154406033238/runs/95bf2d469c8f470db78871938d6ce465
🧪 View experiment at: http://34.227.105.107:5000/#/experiments/995874154406033238
Algorithm: RandomForest
Accuracy: 0.9720016862221598
Precision: 0.9434056780883404
Recall: 0.9059346388235582
F1 Score: 0.9221988227783154
🏃 View run RandomForest at: http://34.227.105.107:5000/#/experiments/995874154406033238/runs/6ff0466a1c8c4f8ca0b51976d3192752
🧪 View experiment at: http://34.227.105.