In [28]:
import hopsworks
from hsfs.client.exceptions import RestAPIError

# Establish connection to Hopsworks
project = hopsworks.login(
    api_key_value="dYTVrTVvbj6Qw82i.YGKHdS9snQYFgOADJIvLdvZ2n2S5BxIAOtvPUEmAyd56bvaG6xhhGyNM3nYbexaP"
)

# Access the Feature Store
fs = project.get_feature_store()

# Specify the feature group and its version
feature_group_name = "final_df_feature_group"
feature_group_version = 1

try:
    # Retrieve the 'final_df_feature_group' feature group by its name and version
    final_df_fg = fs.get_feature_group(feature_group_name, version=feature_group_version)
    
    # Read the feature group as a Pandas DataFrame
    final_df = final_df_fg.read()

    # Print success message and display the first few rows of the DataFrame
    print(f"Downloaded feature group: {feature_group_name} (version {feature_group_version})")
    print(final_df.head())  # Optionally display the first few rows of the DataFrame

except RestAPIError as e:
    print(f"Error downloading feature group: {feature_group_name} (version {feature_group_version})")
    print(e)

# Now you have the `final_df` DataFrame containing data from 'final_df_feature_group'


Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1044630
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.67s) 
Downloaded feature group: final_df_feature_group (version 1)
   truck_id    route_id            departure_date         estimated_arrival  \
0  16965717  R-0e4bf1f5 2019-02-03 06:00:00+00:00 2019-02-03 12:00:00+00:00   
1  17709480  R-05e62825 2019-01-01 06:00:00+00:00 2019-01-06 12:00:00+00:00   
2  20339561  R-3d1310d5 2019-01-04 06:00:00+00:00 2019-01-05 00:00:00+00:00   
3  19424012  R-f56dfb6f 2019-02-02 06:00:00+00:00 2019-02-04 12:00:00+00:00   
4  17260588  R-405658e2 2019-01-25 06:00:00+00:00 2019-01-26 06:00:00+00:00   

   route_avg_temp  route_avg_wind_speed  route_avg_precip  route_avg_humidity  \
0       65.500000              8.500000               0.0           95.0

In [29]:
final_df.shape

(10290, 49)

In [30]:
final_df.dtypes

truck_id                                        int64
route_id                                       object
departure_date                    datetime64[us, UTC]
estimated_arrival                 datetime64[us, UTC]
route_avg_temp                                float64
route_avg_wind_speed                          float64
route_avg_precip                              float64
route_avg_humidity                            float64
route_avg_visibility                          float64
route_avg_pressure                            float64
route_description                              object
estimated_arrival_nearest_hour    datetime64[us, UTC]
departure_date_nearest_hour       datetime64[us, UTC]
origin_id                                      object
destination_id                                 object
distance                                      float64
average_hours                                 float64
temp_origin                                   float64
wind_speed_origin           

In [31]:
final_df.isnull().sum()

truck_id                          0
route_id                          0
departure_date                    0
estimated_arrival                 0
route_avg_temp                    0
route_avg_wind_speed              0
route_avg_precip                  0
route_avg_humidity                0
route_avg_visibility              0
route_avg_pressure                0
route_description                 0
estimated_arrival_nearest_hour    0
departure_date_nearest_hour       0
origin_id                         0
destination_id                    0
distance                          0
average_hours                     0
temp_origin                       0
wind_speed_origin                 0
description_origin                0
precip_origin                     0
humidity_origin                   0
visibility_origin                 0
pressure_origin                   0
temp_destination                  0
wind_speed_destination            0
description_destination           0
precip_destination          

In [32]:
cts_cols = ['route_avg_temp', 'route_avg_wind_speed', 'route_avg_precip', 'route_avg_humidity', 
                'route_avg_visibility', 'route_avg_pressure', 'distance', 'average_hours',
                'temp_origin', 'wind_speed_origin', 'precip_origin', 'humidity_origin', 
                'visibility_origin', 'pressure_origin', 'temp_destination', 'wind_speed_destination',
                'precip_destination', 'humidity_destination', 'visibility_destination',
                'pressure_destination', 'avg_no_of_vehicles', 'truck_age', 'load_capacity_pounds', 
                'mileage_mpg', 'age', 'experience', 'average_speed_mph']

cat_cols = ['route_description', 'description_origin', 'description_destination', 'accident', 
                'fuel_type', 'gender', 'driving_style', 'ratings', 'is_midnight']

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split

class Phase2Pipeline:
    
    # Define cts_cols, cat_cols, and target at the class level
    cts_cols = ['route_avg_temp', 'route_avg_wind_speed', 'route_avg_precip', 'route_avg_humidity', 
                'route_avg_visibility', 'route_avg_pressure', 'distance', 'average_hours',
                'temp_origin', 'wind_speed_origin', 'precip_origin', 'humidity_origin', 
                'visibility_origin', 'pressure_origin', 'temp_destination', 'wind_speed_destination',
                'precip_destination', 'humidity_destination', 'visibility_destination',
                'pressure_destination', 'avg_no_of_vehicles', 'truck_age', 'load_capacity_pounds', 
                'mileage_mpg', 'age', 'experience', 'average_speed_mph']

    cat_cols = ['route_description', 'description_origin', 'description_destination', 'accident', 
                'fuel_type', 'gender', 'driving_style', 'ratings', 'is_midnight']

    target = ['delay']
    
    def __init__(self, feature_store, feature_group_name):
        self.feature_store = feature_store
        self.feature_group_name = feature_group_name

    def load_final_df(self):
        # Load final DataFrame from feature store
        fg = self.feature_store.get_feature_group(name=self.feature_group_name, version=1)
        final_df = fg.read()
        return final_df

    def handle_null_values(self, df):
        # Fill null values in continuous columns with the median
        df[self.cts_cols] = df[self.cts_cols].fillna(df[self.cts_cols].median())

        # Fill nulls in categorical columns
        df[self.cat_cols] = df[self.cat_cols].fillna(df[self.cat_cols].mode().iloc[0])

        return df

    def train_validation_test_split(self, df):
        # Remove ID columns if any
        df = df.drop(columns=['unique_id', 'truck_id', 'route_id'])

        # Split features and target
        X = df[self.cts_cols + self.cat_cols]
        y = df[self.target]

        # Split data into train, validation, and test sets
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

        return X_train, X_val, X_test, y_train, y_val, y_test

    def execute_pipeline(self):
        # Load the final DataFrame
        final_df = self.load_final_df()

        # Handle null values
        final_df_clean = self.handle_null_values(final_df)

        # Split the data
        X_train, X_val, X_test, y_train, y_val, y_test = self.train_validation_test_split(final_df_clean)

        return X_train, X_val, X_test, y_train, y_val, y_test

# Example usage
project = hopsworks.login(
    api_key_value="dYTVrTVvbj6Qw82i.YGKHdS9snQYFgOADJIvLdvZ2n2S5BxIAOtvPUEmAyd56bvaG6xhhGyNM3nYbexaP"
)
fs = project.get_feature_store()

pipeline = Phase2Pipeline(fs, "final_df_feature_group")
X_train, X_val, X_test, y_train, y_val, y_test = pipeline.execute_pipeline()


Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1044630
Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.73s) 


In [34]:
X_train

Unnamed: 0,route_avg_temp,route_avg_wind_speed,route_avg_precip,route_avg_humidity,route_avg_visibility,route_avg_pressure,distance,average_hours,temp_origin,wind_speed_origin,...,average_speed_mph,route_description,description_origin,description_destination,accident,fuel_type,gender,driving_style,ratings,is_midnight
10054,82.000000,4.000000,0.125,84.500000,3.000000,1011.250000,1745.83,34.92,28.0,6.0,...,64.82,Moderate or heavy rain shower,Partly cloudy,Clear,1,diesel,male,proactive,5,1
2755,62.666667,4.666667,0.000,63.000000,6.000000,1014.666667,296.62,5.93,23.0,4.0,...,40.50,Sunny,Cloudy,Partly cloudy,1,diesel,male,conservative,4,0
5809,37.000000,7.000000,0.000,79.000000,6.000000,1022.000000,400.78,8.02,32.0,7.0,...,49.30,Clear,Sunny,Clear,1,diesel,male,conservative,8,0
7497,31.333333,13.666667,0.000,65.666667,5.333333,1017.666667,433.01,8.66,18.0,11.0,...,43.37,Patchy light snow,Overcast,Sunny,0,diesel,male,conservative,6,0
1334,52.500000,9.250000,0.000,68.250000,6.000000,1016.500000,775.88,15.52,-8.0,6.0,...,49.68,Sunny,Partly cloudy,Light snow,0,diesel,male,conservative,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,56.000000,9.250000,0.000,79.000000,6.000000,1013.500000,691.51,13.83,50.0,2.0,...,65.04,Overcast,Overcast,Overcast,1,diesel,male,proactive,8,1
5191,59.000000,11.333333,0.000,66.333333,6.000000,1014.000000,370.26,7.41,27.0,7.0,...,62.66,Clear,Partly cloudy,Clear,0,diesel,male,proactive,2,0
5390,48.000000,11.500000,0.000,85.166667,3.166667,1009.333333,1243.05,24.86,21.0,13.0,...,59.12,Light rain shower,Cloudy,Sunny,1,diesel,male,proactive,8,1
860,56.500000,11.750000,0.000,65.500000,6.000000,1008.000000,830.87,16.62,36.0,1.0,...,44.32,Cloudy,Sunny,Partly cloudy,1,diesel,male,conservative,5,1


In [35]:
X_val

Unnamed: 0,route_avg_temp,route_avg_wind_speed,route_avg_precip,route_avg_humidity,route_avg_visibility,route_avg_pressure,distance,average_hours,temp_origin,wind_speed_origin,...,average_speed_mph,route_description,description_origin,description_destination,accident,fuel_type,gender,driving_style,ratings,is_midnight
1897,56.666667,2.500000,0.000000,82.833333,6.000000,1019.333333,1323.45,26.47,37.0,8.0,...,62.89,Clear,Partly cloudy,Sunny,1,diesel,male,proactive,4,1
7213,71.500000,7.000000,0.050000,82.250000,3.750000,1012.500000,587.51,11.75,23.0,9.0,...,64.00,Overcast,Sunny,Partly cloudy,1,gas,male,proactive,3,1
10006,70.903226,7.741935,0.003226,86.193548,5.774194,1012.806452,955.69,19.11,36.0,1.0,...,63.74,Patchy rain possible,Sunny,Overcast,1,diesel,male,proactive,4,1
5317,72.200000,8.200000,0.000000,60.800000,6.000000,1020.200000,945.40,18.91,34.0,4.0,...,45.94,Sunny,Sunny,Light rain shower,1,diesel,male,conservative,7,1
8486,57.666667,8.000000,0.000000,89.666667,6.000000,1014.666667,439.84,8.80,25.0,9.0,...,57.72,Light rain,Light snow,Overcast,0,gas,male,proactive,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,71.250000,7.250000,0.000000,46.500000,6.000000,1011.250000,755.32,15.11,34.0,12.0,...,45.73,Clear,Sunny,Clear,1,gas,male,conservative,8,1
637,71.000000,6.666667,0.000000,77.333333,6.000000,1018.666667,429.54,8.59,34.0,4.0,...,49.45,Sunny,Partly cloudy,Clear,0,diesel,male,conservative,9,0
3132,70.600000,11.400000,0.000000,55.200000,4.800000,1010.800000,926.26,18.53,0.0,19.0,...,57.85,Patchy light drizzle,Overcast,Partly cloudy,1,diesel,male,proactive,6,1
5852,55.500000,4.500000,0.000000,71.500000,6.000000,1022.000000,439.00,8.78,72.0,13.0,...,57.78,Patchy rain possible,Partly cloudy,Clear,0,diesel,male,proactive,4,0


In [36]:
X_test

Unnamed: 0,route_avg_temp,route_avg_wind_speed,route_avg_precip,route_avg_humidity,route_avg_visibility,route_avg_pressure,distance,average_hours,temp_origin,wind_speed_origin,...,average_speed_mph,route_description,description_origin,description_destination,accident,fuel_type,gender,driving_style,ratings,is_midnight
3381,67.000000,12.000000,0.0,55.500000,6.000000,1014.000000,167.88,3.36,14.0,9.0,...,39.53,Light rain,Overcast,Sunny,1,gas,male,conservative,6,0
4639,47.285714,6.428571,0.0,80.000000,4.428571,1021.000000,1668.64,33.37,66.0,5.0,...,46.38,Patchy heavy snow,Partly cloudy,Clear,1,diesel,male,conservative,7,1
379,41.000000,7.500000,0.0,93.000000,5.000000,1014.000000,36.05,0.72,54.0,12.0,...,46.12,Light rain,Patchy rain possible,Sunny,0,gas,male,conservative,4,0
9683,66.000000,7.250000,0.0,75.250000,5.500000,1011.750000,809.66,16.19,32.0,7.0,...,43.75,Patchy rain possible,Sunny,Clear,1,diesel,male,conservative,7,1
9482,77.000000,3.500000,0.0,76.500000,6.000000,1015.500000,132.59,2.65,30.0,4.0,...,59.33,Patchy rain possible,Overcast,Sunny,0,gas,female,proactive,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6884,50.250000,11.250000,0.0,54.750000,6.000000,1013.875000,2039.82,40.80,41.0,6.0,...,51.98,Sunny,Sunny,Clear,1,gas,male,conservative,4,1
6076,70.000000,4.750000,0.0,53.500000,6.000000,1011.750000,97.81,1.96,30.0,4.0,...,59.91,Patchy rain possible,Sunny,Cloudy,1,diesel,male,proactive,9,0
4772,45.400000,5.400000,0.0,86.000000,2.800000,1019.600000,988.10,19.76,30.0,11.0,...,42.49,Fog,Overcast,Sunny,1,gas,male,conservative,8,1
586,65.500000,5.250000,0.0,91.000000,6.000000,1011.500000,675.30,13.51,30.0,12.0,...,58.67,Clear,Overcast,Partly cloudy,1,diesel,male,proactive,2,1


In [37]:
y_train.head()


Unnamed: 0,delay
10054,0
2755,0
5809,0
7497,0
1334,0


In [38]:
y_test.head()


Unnamed: 0,delay
3381,0
4639,0
379,0
9683,0
9482,0


In [39]:
y_val.head()

Unnamed: 0,delay
1897,0
7213,0
10006,0
5317,0
8486,0


In [40]:
# Assuming final_merge is already defined and contains the data
min_date = final_df['estimated_arrival'].min()
max_date = final_df['estimated_arrival'].max()

print("Estimated Arrival Date Range:")
print(f"Min: {min_date}")
print(f"Max: {max_date}")


Estimated Arrival Date Range:
Min: 2019-01-01 12:00:00+00:00
Max: 2019-02-14 18:00:00+00:00


In [41]:
cat_cols

['route_description',
 'description_origin',
 'description_destination',
 'accident',
 'fuel_type',
 'gender',
 'driving_style',
 'ratings',
 'is_midnight']

In [42]:
cts_cols

['route_avg_temp',
 'route_avg_wind_speed',
 'route_avg_precip',
 'route_avg_humidity',
 'route_avg_visibility',
 'route_avg_pressure',
 'distance',
 'average_hours',
 'temp_origin',
 'wind_speed_origin',
 'precip_origin',
 'humidity_origin',
 'visibility_origin',
 'pressure_origin',
 'temp_destination',
 'wind_speed_destination',
 'precip_destination',
 'humidity_destination',
 'visibility_destination',
 'pressure_destination',
 'avg_no_of_vehicles',
 'truck_age',
 'load_capacity_pounds',
 'mileage_mpg',
 'age',
 'experience',
 'average_speed_mph']

In [43]:
# Convert 'estimated_arrival' to timezone-naive if it has a timezone (remove the timezone info)
final_df['estimated_arrival'] = final_df['estimated_arrival'].dt.tz_localize(None)

# Splitting the data based on 'estimated_arrival' date
train_df = final_df[final_df['estimated_arrival'] <= pd.to_datetime('2019-01-30')]

validation_df = final_df[(final_df['estimated_arrival'] > pd.to_datetime('2019-01-30')) & 
                            (final_df['estimated_arrival'] <= pd.to_datetime('2019-02-07'))]

test_df = final_df[final_df['estimated_arrival'] > pd.to_datetime('2019-02-07')]

# Defining features and target variables
X_train = train_df[cts_cols + cat_cols]
y_train = train_df['delay']

X_valid = validation_df[cts_cols + cat_cols]
y_valid = validation_df['delay']

X_test = test_df[cts_cols + cat_cols]
y_test = test_df['delay']


In [44]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to be encoded
encode_columns = ['route_description', 'description_origin', 'description_destination', 
                  'fuel_type', 'gender', 'driving_style']

# Initializing the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

# Fitting the encoder on the train set and transforming the train, validation, and test sets
encoder.fit(X_train[encode_columns])

# Generate the names for the new encoded features
encoded_features = list(encoder.get_feature_names_out(encode_columns))

# Transforming and creating the new encoded columns for train, validation, and test sets
X_train_encoded = pd.DataFrame(encoder.transform(X_train[encode_columns]), columns=encoded_features, index=X_train.index)
X_valid_encoded = pd.DataFrame(encoder.transform(X_valid[encode_columns]), columns=encoded_features, index=X_valid.index)
X_test_encoded = pd.DataFrame(encoder.transform(X_test[encode_columns]), columns=encoded_features, index=X_test.index)

# Concatenating the new one-hot encoded features with the original datasets
X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_valid = pd.concat([X_valid, X_valid_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

# Dropping the original categorical columns
X_train = X_train.drop(encode_columns, axis=1)
X_valid = X_valid.drop(encode_columns, axis=1)
X_test = X_test.drop(encode_columns, axis=1)

# Print a quick summary of the transformation
print(f"X_train shape after encoding: {X_train.shape}")
print(f"X_valid shape after encoding: {X_valid.shape}")
print(f"X_test shape after encoding: {X_test.shape}")




X_train shape after encoding: (6859, 99)
X_valid shape after encoding: (1953, 99)
X_test shape after encoding: (1478, 99)


In [45]:
from sklearn.preprocessing import StandardScaler

# List of continuous numerical columns to scale
numerical_columns = [
    'route_avg_temp', 'route_avg_wind_speed', 'route_avg_precip', 'route_avg_humidity', 
    'route_avg_visibility', 'route_avg_pressure', 'distance', 'average_hours',
    'temp_origin', 'wind_speed_origin', 'precip_origin', 'humidity_origin', 
    'visibility_origin', 'pressure_origin', 'temp_destination', 'wind_speed_destination',
    'precip_destination', 'humidity_destination', 'visibility_destination', 
    'pressure_destination', 'avg_no_of_vehicles', 'truck_age', 'load_capacity_pounds', 
    'mileage_mpg', 'age', 'experience', 'average_speed_mph'
]

In [46]:
# Initialize the StandardScaler
scaler = StandardScaler()

In [47]:
# Fit the scaler on the training set and then transform the train, validation, and test sets
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_valid[numerical_columns] = scaler.transform(X_valid[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])


In [48]:
# Print the shapes of the datasets after scaling
print(f"X_train shape after scaling: {X_train.shape}")
print(f"X_valid shape after scaling: {X_valid.shape}")
print(f"X_test shape after scaling: {X_test.shape}")

X_train shape after scaling: (6859, 99)
X_valid shape after scaling: (1953, 99)
X_test shape after scaling: (1478, 99)


In [49]:
X_train

Unnamed: 0,route_avg_temp,route_avg_wind_speed,route_avg_precip,route_avg_humidity,route_avg_visibility,route_avg_pressure,distance,average_hours,temp_origin,wind_speed_origin,...,description_destination_Partly cloudy,description_destination_Patchy light rain,description_destination_Patchy light snow,description_destination_Patchy rain possible,description_destination_Patchy snow possible,description_destination_Sunny,description_destination_Thundery outbreaks possible,fuel_type_gas,gender_male,driving_style_proactive
1,1.021632,0.905185,-0.296268,0.310013,0.616734,-0.075490,-0.745866,-0.746022,-0.925722,-1.389234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0.800060,1.453299,-0.296268,0.082827,0.616734,0.346483,-0.393383,-0.393405,-0.037754,0.288955,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.167528,1.767151,-0.296268,-1.340765,0.616734,-0.355433,0.190450,0.190763,-2.202176,-1.389234,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
6,0.087622,0.297003,-0.296268,0.754984,-1.902036,-1.012064,-0.190811,-0.190964,-1.813690,-0.430269,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
9,1.037539,0.599843,-0.296268,-0.187184,0.616734,0.308746,-0.495410,-0.495287,1.294198,1.967144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10282,0.366169,-1.225060,-0.296268,1.011043,-0.390774,-1.578125,1.742460,1.742148,-0.148750,-0.190528,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
10283,0.312602,-0.941885,-0.296268,1.513314,0.616734,-0.672427,-0.138375,-0.138700,-2.035682,1.008179,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
10284,0.875054,-0.115960,-0.296268,0.410288,0.616734,-0.559215,-0.651209,-0.651418,-0.925722,1.008179,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
10287,0.747565,0.247448,-0.296268,-0.430769,0.616734,0.187986,0.288812,0.288676,-0.315244,0.049214,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [50]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Split the data
X = X_train.copy()
y = y_train.copy()

# Perform a train-test split for additional testing
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Define a function to log experiments with MLflow
def log_model_with_mlflow(model_name, model, params, X_train, y_train, X_valid, y_valid):
    with mlflow.start_run(run_name=model_name):
        # Fit the model
        model.fit(X_train, y_train)

        # Predict on the validation set
        y_pred_valid = model.predict(X_valid)

        # Evaluate metrics
        accuracy = accuracy_score(y_valid, y_pred_valid)
        precision = precision_score(y_valid, y_pred_valid, average='weighted')
        recall = recall_score(y_valid, y_pred_valid, average='weighted')
        f1 = f1_score(y_valid, y_pred_valid, average='weighted')

        # Log parameters, metrics, and model
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.sklearn.log_model(model, model_name)

        # Print performance
        print(f"{model_name} Performance:")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 Score: {f1}")
        print(classification_report(y_valid, y_pred_valid))

# Example usage:
# from sklearn.ensemble import RandomForestClassifier
# params = {'n_estimators': 100, 'max_depth': 10}
# model = RandomForestClassifier(**params)
# log_model_with_mlflow("Random Forest", model, params, X_train, y_train, X_valid, y_valid)


In [56]:
# Hyperparameter tuning for each model using GridSearchCV
def tune_and_train_model(model_name, model, param_grid, X_train, y_train, X_valid, y_valid):
    # Use GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1_weighted')
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    # Log the best model with MLflow
    log_model_with_mlflow(model_name, best_model, best_params, X_train, y_train, X_valid, y_valid)

# Logistic Regression
lr_param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1.0, 10.0],
    'solver': ['liblinear']
}

lr_model = LogisticRegression()
tune_and_train_model("Logistic Regression", lr_model, lr_param_grid, X_train_split, y_train_split, X_test_split, y_test_split)

# Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_model = RandomForestClassifier()
tune_and_train_model("Random Forest", rf_model, rf_param_grid, X_train_split, y_train_split, X_test_split, y_test_split)

# XGBoost
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'subsample': [0.8, 1.0]
}

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
tune_and_train_model("XGBoost", xgb_model, xgb_param_grid, X_train_split, y_train_split, X_test_split, y_test_split)



Logistic Regression Performance:
Accuracy: 0.706268221574344
Precision: 0.6906550711305344
Recall: 0.706268221574344
F1 Score: 0.6553543310826062
              precision    recall  f1-score   support

           0       0.71      0.94      0.81       925
           1       0.64      0.22      0.33       447

    accuracy                           0.71      1372
   macro avg       0.68      0.58      0.57      1372
weighted avg       0.69      0.71      0.66      1372





Random Forest Performance:
Accuracy: 0.7704081632653061
Precision: 0.7679321258794528
Recall: 0.7704081632653061
F1 Score: 0.7502738075184787
              precision    recall  f1-score   support

           0       0.77      0.93      0.85       925
           1       0.76      0.44      0.55       447

    accuracy                           0.77      1372
   macro avg       0.76      0.68      0.70      1372
weighted avg       0.77      0.77      0.75      1372



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Parameters: { "use_label_encoder" } are not used.


Parameters: { "use_label_encoder" } are not used.






XGBoost Performance:
Accuracy: 0.7674927113702624
Precision: 0.761179367999715
Recall: 0.7674927113702624
F1 Score: 0.750983700489618
              precision    recall  f1-score   support

           0       0.78      0.92      0.84       925
           1       0.73      0.46      0.56       447

    accuracy                           0.77      1372
   macro avg       0.75      0.69      0.70      1372
weighted avg       0.76      0.77      0.75      1372

