In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Data Splitting:
# Split the data into training and testing sets. Our target variable is silica_concentrate, located in the last column of the dataset.
# This script produces 4 datasets (X_train, X_test, y_train, y_test) and stores them in data/processed_data.

raw_path = os.path.join(os.getcwd(), '../data/raw_data/raw.csv')
processed_dir = os.path.join(os.getcwd(), '../data/processed_data')

# The first CSV column becomes the index, so it is not treated as a feature column.
df = pd.read_csv(raw_path, index_col=0)

# Features are every column except the target; the target is kept as a Series.
X = df.drop(columns=['silica_concentrate'])
y = df['silica_concentrate']

# 80/20 split; the fixed seed keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the output directory up front so the writes below do not fail
# when the directory is missing (e.g. on a fresh checkout).
os.makedirs(processed_dir, exist_ok=True)

# index=False: the index is not written, so the saved files hold data columns only.
X_train.to_csv(os.path.join(processed_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(processed_dir, 'X_test.csv'), index=False)

y_train.to_csv(os.path.join(processed_dir, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(processed_dir, 'y_test.csv'), index=False)

In [3]:
# Data Normalization: the features vary widely in scale, so they are standardized before modelling.
# Input:  X_train.csv and X_test.csv from data/processed_data.
# Output: X_train_scaled.csv and X_test_scaled.csv, written to the same directory.

from sklearn.preprocessing import StandardScaler

# Reload the split features from disk so this step does not depend on in-memory state.
X_train = pd.read_csv(os.path.join(os.getcwd(), '../data/processed_data/X_train.csv'))
X_test = pd.read_csv(os.path.join(os.getcwd(), '../data/processed_data/X_test.csv'))

# Scaling statistics are learned from the training features only and then
# applied unchanged to the test features.
scaler = StandardScaler()
scaler.fit(X_train)

# transform() returns a plain array; wrap it back into a DataFrame to keep the column names.
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

train_scaled_path = os.path.join(os.getcwd(), '../data/processed_data/X_train_scaled.csv')
test_scaled_path = os.path.join(os.getcwd(), '../data/processed_data/X_test_scaled.csv')

X_train_scaled.to_csv(train_scaled_path, index=False)
X_test_scaled.to_csv(test_scaled_path, index=False)

print("Data normalized successfully.")

Data normalized successfully.


In [4]:
# GridSearch for Best Parameters: search a RandomForestRegressor hyper-parameter grid on the scaled training data.
# At the end of this script, the best parameters are saved as a .pkl file in the models directory.

import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

X_train = pd.read_csv(os.path.join(os.getcwd(), '../data/processed_data/X_train_scaled.csv'))
# .values.ravel() flattens the loaded target frame into the 1-D array that fit() is given.
y_train = pd.read_csv(os.path.join(os.getcwd(), '../data/processed_data/y_train.csv')).values.ravel()

# Make sure the output directory exists BEFORE the expensive search, so a
# missing directory cannot discard the result of all the fits.
models_dir = os.path.join(os.getcwd(), '../models')
os.makedirs(models_dir, exist_ok=True)

# 3 values for each of 4 parameters -> 81 candidates; with cv=3 that is 243 fits.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seed the forest (same seed as the train/test split) so that re-running the
# search selects the same parameters instead of varying with random tree construction.
rf = RandomForestRegressor(random_state=42)

# n_jobs=-1 uses all available cores; verbose=2 prints per-fit progress.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

joblib.dump(best_params, os.path.join(models_dir, 'best_params.pkl'))
print("Best parameters saved successfully.")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters saved successfully.


In [5]:
# Model Training: using the parameters found through GridSearch, train the model and save it in the models directory.

# Imported here so this step runs on its own instead of relying on names
# left in the kernel by an earlier cell.
import joblib
from sklearn.ensemble import RandomForestRegressor

X_train = pd.read_csv(os.path.join(os.getcwd(), '../data/processed_data/X_train_scaled.csv'))
# .values.ravel() flattens the loaded target frame into the 1-D array that fit() is given.
y_train = pd.read_csv(os.path.join(os.getcwd(), '../data/processed_data/y_train.csv')).values.ravel()

# NOTE: joblib.load unpickles the file; only load artifacts produced by this pipeline.
best_params = joblib.load(os.path.join(os.getcwd(), '../models/best_params.pkl'))

# Seed the forest so the trained model is reproducible. The default comes
# first in the merged dict, so a random_state stored in best_params (if any)
# takes precedence rather than raising a duplicate-keyword error.
rf = RandomForestRegressor(**{'random_state': 42, **best_params})

rf.fit(X_train, y_train)

joblib.dump(rf, os.path.join(os.getcwd(), '../models/trained_model.pkl'))

print("Model trained and saved successfully.")

Model trained and saved successfully.


In [6]:
# Model Evaluation: using the trained model, predict on the scaled test set and evaluate the predictions.
# At the end of this script, the predictions are saved to data/predictions/predictions.csv,
# and a scores.json file in the metrics directory captures the evaluation metrics (MSE, R2).

import json

# Imported here so this step runs on its own instead of relying on names
# left in the kernel by an earlier cell.
import joblib
from sklearn.metrics import mean_squared_error, r2_score

X_test = pd.read_csv(os.path.join(os.getcwd(), '../data/processed_data/X_test_scaled.csv'))
# .values.ravel() flattens the loaded target frame into a 1-D array for the metric functions.
y_test = pd.read_csv(os.path.join(os.getcwd(), '../data/processed_data/y_test.csv')).values.ravel()

# NOTE: joblib.load unpickles the file; only load artifacts produced by this pipeline.
rf = joblib.load(os.path.join(os.getcwd(), '../models/trained_model.pkl'))

predictions = rf.predict(X_test)

# Cast to built-in float so json.dump never has to serialise a NumPy scalar type.
mse = float(mean_squared_error(y_test, predictions))
r2 = float(r2_score(y_test, predictions))

scores = {
    'mse': mse,
    'r2': r2
}

# Create the metrics directory the same way the predictions directory is
# created below, so writing scores.json does not fail when it is missing.
metrics_dir = os.path.join(os.getcwd(), '../metrics')
os.makedirs(metrics_dir, exist_ok=True)

with open(os.path.join(metrics_dir, 'scores.json'), 'w') as f:
    json.dump(scores, f)

# Predictions are stored under the target's column name.
predictions_df = pd.DataFrame(predictions, columns=['silica_concentrate'])

os.makedirs(os.path.join(os.getcwd(), '../data/predictions'), exist_ok=True)
predictions_df.to_csv(os.path.join(os.getcwd(), '../data/predictions/predictions.csv'), index=False)

print("Model evaluated successfully.")


Model evaluated successfully.
