<a href="https://colab.research.google.com/github/pratyushb25/dragline_factors/blob/main/Dragline_Factors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
import matplotlib.pyplot as plt
from joblib import dump

# Load in dataset
df = pd.read_excel('Dragline Factors V1.xlsx')

# Selecting input and output columns from spreadsheet
input_cols = df.columns[0:8]  # A to H
output_cols = df.columns[32:39]  # AG to AM

# Split the raw rows BEFORE fitting any preprocessing, so the scaler statistics
# and the encoder categories are learned from the training rows only. Fitting
# the transformer on the full frame would leak test-set information into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df[input_cols], df[output_cols].values, test_size=0.2, random_state=42
)

# Preprocessing the input data
# Assuming categorical data in the first two columns and numerical in the rest
preprocessor = make_column_transformer(
    # handle_unknown='ignore': a category that never appears in the training
    # rows is encoded as all zeros instead of raising at transform time.
    (OneHotEncoder(handle_unknown='ignore'), ['Dragline', 'Method']),
    (StandardScaler(), input_cols[2:]),
    # Always return a dense ndarray, however sparse the one-hot block is;
    # NOTE(review): `validation_split` in model.fit is expected to need
    # array inputs - confirm against the installed Keras version.
    sparse_threshold=0,
)

# Fit on the training rows only, then apply the fitted transform to the test rows
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full transformed matrix and targets, kept under their original names
X = preprocessor.transform(df[input_cols])
y = df[output_cols].values

# Persist the fitted preprocessor so new data can be transformed the same way
dump(preprocessor, 'preprocessor.joblib')

# Fully connected regression network: five ReLU hidden layers narrowing from
# 256 to 64 units, followed by a linear layer with one unit per target column.
# No weight regularization is applied to any layer.
hidden_units = (256, 256, 128, 128, 64)

model = keras.Sequential()
model.add(layers.Dense(hidden_units[0], activation='relu', input_shape=(X_train.shape[1],)))
for units in hidden_units[1:]:
    model.add(layers.Dense(units, activation='relu'))
model.add(layers.Dense(y_train.shape[1]))

# Compile with Adam at a base learning rate of 0.001, minimising MSE and
# reporting MAE as an additional metric
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# Define a learning rate scheduler
def scheduler(epoch, lr):
    """Keep the learning rate unchanged for epochs below 100, then scale it by exp(-0.1).

    Args:
        epoch: Epoch index supplied by the LearningRateScheduler callback.
        lr: Learning rate value supplied by the callback for that epoch.

    Returns:
        `lr` unchanged when `epoch < 100`; otherwise `lr * exp(-0.1)` as a
        plain Python float.
    """
    if epoch < 100:
        return lr
    # Use NumPy rather than tf.math.exp: the TensorFlow op yields a Tensor, and
    # recent Keras releases reject schedule outputs that are not floats.
    return float(lr * np.exp(-0.1))

# Callbacks: apply the learning-rate schedule every epoch, and stop training
# once val_loss has gone 150 epochs without improving, restoring the best weights
lr_schedule_cb = keras.callbacks.LearningRateScheduler(scheduler)
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=150,
    restore_best_weights=True,
)
callbacks_list = [lr_schedule_cb, early_stop_cb]

# Fit for at most 500 epochs in batches of 32, holding out 20% of the
# training rows for validation
fit_options = dict(validation_split=0.2, epochs=500, batch_size=32, callbacks=callbacks_list)
history = model.fit(X_train, y_train, **fit_options)

# Evaluate the model on the test set
test_loss, test_mae = model.evaluate(X_test, y_test)

# Make predictions (for new data or the test set)
predictions = model.predict(X_test)

# Select up to 20 samples for comparison. The count is capped at the test-set
# size because sampling without replacement raises ValueError when asked for
# more rows than exist. A seeded generator keeps the table identical across runs.
num_samples = min(20, X_test.shape[0])
rng = np.random.default_rng(42)
samples_indices = rng.choice(X_test.shape[0], size=num_samples, replace=False)

# Prepare data for the DataFrame: one row per sampled test point holding the
# actual values, the predicted values, and their absolute difference
actual = y_test[samples_indices]
predicted = predictions[samples_indices]
error = np.abs(actual - predicted)
data_for_df = np.hstack([actual, predicted, error])

# Column labels
labels = ['AG', 'AH', 'AI', 'AJ', 'AK', 'AL', 'AM']
columns = [f'Actual {label}' for label in labels] + \
          [f'Predicted {label}' for label in labels] + \
          [f'Error {label}' for label in labels]

# Create DataFrame
comparison_df = pd.DataFrame(data_for_df, columns=columns)

# Display the formatted DataFrame
print(comparison_df)

# Calculate the mean absolute error for the entire test set (one value per output column)
mean_absolute_error = np.mean(np.abs(predictions - y_test), axis=0)
print(f"Mean Absolute Error on Test Data: {mean_absolute_error}")

# Visualize the training process: training and validation loss per epoch
fig, ax = plt.subplots(figsize=(10, 5))
for history_key, curve_label in (('loss', 'Train loss'), ('val_loss', 'Validation loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title('Model Loss Progression Over Epochs')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(loc='upper right')
plt.show()

#model.save('Dragline_Factors.h5')  # Saves to an HDF5 file
#from google.colab import files
#files.download('Dragline_Factors.h5')

# Download the preprocessor file
#files.download('preprocessor.joblib')


FileNotFoundError: [Errno 2] No such file or directory: 'Dragline Factors V1.xlsx'

In [None]:
from IPython.display import display

def make_predictions_from_excel(file_path, output_columns):
    """Predict the model outputs for every row of an Excel file.

    Uses the notebook-level `preprocessor` and `model` objects, so both must
    already exist when this function is called.

    Args:
        file_path: Path of the Excel file, read with `pd.read_excel`.
        output_columns: Column labels for the returned DataFrame.

    Returns:
        A DataFrame of predictions rounded to 6 decimal places, labelled with
        `output_columns`.
    """
    # Transform the new rows with the existing preprocessor, then predict
    features = preprocessor.transform(pd.read_excel(file_path))
    rounded = np.round(model.predict(features), 6)
    return pd.DataFrame(rounded, columns=output_columns)

# Labels for the predicted output columns (replace with your actual column names)
output_columns = [
    'AG', 'AH', 'AI', 'AJ', 'AK', 'AL', 'AM',
]

# Example usage: run the prediction helper on a sample workbook
new_data_file_path = 'DF_test_1.xlsx'
predictions_df = make_predictions_from_excel(
    file_path=new_data_file_path,
    output_columns=output_columns,
)

# Show the predictions as a formatted table under a short heading
print("Predictions for the new input data:")
display(predictions_df)

Predictions for the new input data:


Unnamed: 0,AG,AH,AI,AJ,AK,AL,AM
0,0.2521,0.019588,0.252433,2.704471,3.725592,0.024623,0.02466
1,0.16947,0.308222,0.361046,0.525328,0.643663,0.00247,0.002451
2,0.110105,0.104783,0.358518,1.772775,1.974472,0.007542,0.007549
3,0.611977,0.381463,2.467392,1.969724,3.302143,0.034636,0.034698
4,0.205633,0.80825,1.016715,0.982102,1.272301,0.004128,0.004316
5,0.109619,0.097631,0.375593,1.788924,2.522036,1.004431,1.004531
6,0.229582,0.01063,0.214794,2.756428,3.570595,0.002298,0.002309
