In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw measurements from the parquet file into the working frame.
DATASET_PATH = 'Dataset_Eindhoven.parquet'
df = pd.read_parquet(DATASET_PATH)

In [None]:
# Order rows by 'Location', then by 'Time_UTC' (both ascending), so that the
# fills below propagate values between neighbouring rows of the same location.
df = df.sort_values(by=['Location', 'Time_UTC'])

# Fill missing 'Lat' / 'Lon' values within each location:
#   1. ffill() carries the previous known value forward;
#   2. bfill() then fills any gap left at the start of a group with the next
#      known value.
# A group whose coordinates are entirely missing stays null.
# Series.ffill() / Series.bfill() replace fillna(method=...), which is
# deprecated in recent pandas releases.
df[['Lat', 'Lon']] = (
    df.groupby('Location')[['Lat', 'Lon']]
    .transform(lambda coords: coords.ffill().bfill())
)

# Report the number of nulls still present in every column
print(df.isnull().sum())

In [None]:
# Reorder the whole frame in ascending order of the 'Time_Local' column.
df = df.sort_values('Time_Local', ascending=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Box plot of the 'PM10' column across the whole frame.
pm10_values = df['PM10']

# Draw on an explicit 10x6 figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=pm10_values, color='skyblue', ax=ax)

# Labels are set after plotting so they replace whatever seaborn assigned.
ax.set_title('Box Plot of PM10 Measurements')
ax.set_xlabel('PM10 Values')

# Render the figure
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Split by location: rows of one location are used for training, rows of
# every other location for testing.
target_location = "I02"


def _features_and_target(frame):
    """Split `frame` into (features, target).

    Features are the float64 / int64 / float32 columns with 'PM10' removed;
    the target is the 'PM10' column itself.
    """
    numeric = frame.select_dtypes(include=['float64', 'int64', 'float32'])
    return numeric.drop(columns=['PM10']), frame['PM10']


# Training set: rows recorded at the target location
df_target_location = df[df['Location'] == target_location]
X_train, y_train = _features_and_target(df_target_location)

# Test set: rows recorded anywhere else
df_other_locations = df[df['Location'] != target_location]
X_test, y_test = _features_and_target(df_other_locations)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Random forest with 100 trees and a fixed random_state, fitted on the
# training split (fit() returns the estimator itself, so it can be chained).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict PM10 for the held-out test rows
y_pred_rf = rf_model.predict(X_test)

# Error metrics on the test split; RMSE is the square root of MSE
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Report the metrics, rounded to two decimals
print(f'Mean Squared Error (Random Forest): {mse_rf:.2f}')
print(f'Mean Absolute Error (Random Forest): {mae_rf:.2f}')
print(f'R-squared (Random Forest): {r2_rf:.2f}')
print(f'Root Mean Squared Error (Random Forest): {rmse_rf:.2f}')

# Scatter of actual (x) against predicted (y) PM10 values
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_rf)
ax.set(
    xlabel='Actual PM10 Values',
    ylabel='Predicted PM10 Values',
    title='Random Forest: Actual vs. Predicted PM10 Values',
)
plt.show()

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Single decision tree with a fixed random_state, fitted on the training
# split (fit() returns the estimator itself, so it can be chained).
decision_tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict PM10 for the held-out test rows
y_pred_decision_tree = decision_tree_model.predict(X_test)

# Error metrics on the test split; RMSE is the square root of MSE
mse = mean_squared_error(y_test, y_pred_decision_tree)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_decision_tree)
r2 = r2_score(y_test, y_pred_decision_tree)

# Report the metrics, rounded to two decimals
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'R-squared (R2): {r2:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')

# Scatter of actual (x) against predicted (y) PM10 values
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_decision_tree)
ax.set_xlabel('Actual PM10 Values')
ax.set_ylabel('Predicted PM10 Values')
ax.set_title('Decision Tree: Actual vs. Predicted PM10 Values')
plt.show()

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Extra-trees ensemble with a fixed random_state, fitted on the training
# split (fit() returns the estimator itself, so it can be chained).
extra_trees_model = ExtraTreesRegressor(random_state=42).fit(X_train, y_train)

# Predict PM10 for the held-out test rows
y_pred_extra_trees = extra_trees_model.predict(X_test)

# Error metrics on the test split; RMSE is the square root of MSE
mse = mean_squared_error(y_test, y_pred_extra_trees)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_extra_trees)
r2 = r2_score(y_test, y_pred_extra_trees)

# Report the metrics, rounded to two decimals
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'R-squared (R2): {r2:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')

# Scatter of actual (x) against predicted (y) PM10 values
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_extra_trees)
ax.set(
    xlabel='Actual PM10 Values',
    ylabel='Predicted PM10 Values',
    title='Extra Trees: Actual vs. Predicted PM10 Values',
)
plt.show()

In [None]:
# Summarise which locations appear in the frame.
# Note: nunique() ignores missing values by default, whereas unique() keeps
# them, so the count can be one less than the number of IDs printed below.
location_col = df['Location']
unique_locations_ids = location_col.unique()
unique_locations_count = location_col.nunique()

# How many distinct locations there are
print(f"Count of Unique Locations: {unique_locations_count}")

# The distinct location IDs, one per line
print("Unique Location IDs:")
for location_id in unique_locations_ids:
    print(location_id)