## RandomForestRegressor

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error


In [25]:
# Inspect the dtype of every column in `df` before modelling.
# The result is kept in `data_types` and printed as plain text.
data_types = df.dtypes
print(data_types)


STATISTIC Label       object
Year                   int64
Age Group             object
Sex                   object
UNIT                  object
VALUE                float64
Population Change    float64
dtype: object


In [26]:
# Discard every row whose target ('VALUE') is missing.
# The frame is modified in place, so `df` itself shrinks.
df.dropna(axis=0, subset=['VALUE'], inplace=True)

In [27]:
# One-hot encode the listed categorical columns; drop_first=True omits the
# first level of each so the dummy columns are not perfectly collinear.
df_encoded = pd.get_dummies(
    data=df,
    columns=['Age Group', 'Sex', 'UNIT'],
    drop_first=True,
)


In [28]:
# Separate the target column ('VALUE') from the predictor columns.
y = df_encoded['VALUE']
X = df_encoded.drop(columns=['VALUE'])

In [29]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [30]:
# Drop the non-numeric column(s) from both feature sets.
# 'STATISTIC Label' is not among the one-hot encoded columns, so it is still
# present in the features at this point and has to be removed explicitly.
non_numeric_columns = ['STATISTIC Label']

# X_train / X_test are overwritten here, so a second run of this cell would
# find the column already gone. errors='ignore' makes the cell safe to re-run
# instead of raising KeyError.
X_train = X_train.drop(columns=non_numeric_columns, errors='ignore')
X_test = X_test.drop(columns=non_numeric_columns, errors='ignore')


In [31]:
# Untuned Random Forest regressor; the fixed seed makes its fits repeatable.
rf_model = RandomForestRegressor(
    random_state=42,
)

In [32]:
# Hyperparameter grid for GridSearchCV: 3 x 4 x 3 = 36 candidate combinations.
# max_depth=None lets trees grow until their leaves cannot be split further.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

In [33]:
# Exhaustive search over param_grid with 5-fold cross-validation.
# Scoring is negated MAE, so the candidate with the highest score wins.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
)
grid_search.fit(X_train, y_train)


In [34]:
# Retrieve the winning hyperparameter combination from the fitted grid search.
# `best_params_` is a dict keyed by the names used in the search grid.
best_params = grid_search.best_params_

In [35]:
# Build a fresh Random Forest configured with the tuned hyperparameters.
# Only the three searched settings are taken from best_params; the seed stays fixed.
tuned_kwargs = {
    name: best_params[name]
    for name in ('n_estimators', 'max_depth', 'min_samples_split')
}
best_rf_model = RandomForestRegressor(random_state=42, **tuned_kwargs)

In [36]:
# Fit the tuned model on the training split.
best_rf_model.fit(X=X_train, y=y_train)

In [37]:
# Predict the target for the held-out test rows.
predictions = best_rf_model.predict(X=X_test)


In [38]:
# Score the test predictions with Mean Absolute Error (lower is better).
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'Mean Absolute Error (MAE): {mae}')

Mean Absolute Error (MAE): 6.685982142857157


In [39]:
# Cell-local import so this cell can be run on its own.
from sklearn.metrics import mean_absolute_error

# Mean Absolute Error of the test predictions, in the units of the target.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'Mean Absolute Error (MAE): {mae}')


Mean Absolute Error (MAE): 6.685982142857157


In [40]:
# Cell-local import so this cell can be run on its own.
from sklearn.metrics import mean_squared_error

# Mean Squared Error of the test predictions (squared units of the target).
mse = mean_squared_error(y_test, predictions)
# The label now reads MSE; it previously said "(MAE)", mislabelling the metric.
print(f'Mean Squared Error (MSE): {mse}')


Mean squared Error (MAE): 1312.6636004999968


In [41]:
# Cell-local import so this cell can be run on its own.
import numpy as np

# Root Mean Squared Error: the square root of MSE, in the units of the target.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
# The label now reads RMSE; it previously said "mean_squared_error", which is
# the un-rooted metric and did not match the value printed.
print(f'Root Mean Squared Error (RMSE): {rmse}')


mean_squared_error: 36.230699696528035


In [42]:
# Cell-local import so this cell can be run on its own.
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the test predictions.
r2 = r2_score(y_true=y_test, y_pred=predictions)
print(f'r2_score: {r2}')



r2_score: 0.9958256938779285
