In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
from math import sqrt


In [2]:

# Load
df_projects = pd.read_csv('mining_projects.csv', sep=';')

df_factory =  pd.read_csv('factory.csv', sep=';')

print(df_projects.head())


print(df_projects.dtypes)

   Project No.  Deposit No.                    Company Name  \
0            1          1.0               Lynas Rare Earths   
1            2          2.0                    MP Materials   
2            3          3.0         Iluka Resources Limited   
3            4          4.0                             NaN   
4            5          5.0  Australian Strategic Materials   

              Project Name                                       Location  \
0               Mount Weld                          Mount Weld, Australia   
1            Mountain Pass     Central Mojave Desert, California, America   
2                  Eneabba          Eneabba, Western Australia, Australia   
3  WIM100 (Wimmera、Mallee)  Wimmera & Mallee,Western Australia, Australia   
4           Dubbo (Toongi)              Dubbo, New South Wales, Australia   

       Continent  \
0      Australia   
1  North America   
2      Australia   
3      Australia   
4      Australia   

   Status (2022), 1: Exploration/ Def

In [3]:
print(df_factory.head())

   No.            Company              Project  \
0    1  Lynas Rare Earths     Mount Weld Plant   
1    2                NaN        Kuantan Plant   
2    3                NaN     Kalgoorlie Plant   
3    4                NaN  HREE separate Plant   
4    5       MP Materials  Mountain Pass Plant   

                                 Location  \
0                   Mount Weld, Australia   
1                                 Kuantan   
2                    Kalgoorlie,Australia   
3                       Hondo, Texas, USA   
4  Central Mojave Desert, California, USA   

  Status (2022)  1: Crushing roughing: Operate mineral concentration; 3: mixed REO; 4:Operate seprated REO; 5 Metal; T Trial production; P Pre-feasibility study; PT Project terminated  \
0                                                  2                                                                                                                                      
1                                                  4  

Failed attempt to predict the status of the project
-----------------------------------------------------------------------------

In [4]:
# Data preprocessing


# Convert categorical columns to numerical using LabelEncoder
label_encoder = LabelEncoder()
categorical_cols = ['Company Name', 'Location', 'Continent', 'Deposit type', 'Project Name']
for col in categorical_cols:
    df_projects[col] = label_encoder.fit_transform(df_projects[col])

#df_projects = df_projects.rename(columns={'Status (2022), 1: Exploration/ Define resource; 2: Feasibility/ Pre-feasibility; 3: Construction; 4: Production; S: Suspending mining; T: Test of Metallurgical; D: Development licence; N: Not Use': 'Status'}, inplace=True))

col_list = df_projects.columns.tolist()

col_list[6] = 'Status'

df_projects.columns = col_list


df_projects['status_cleaned'] = np.nan
df_projects['year'] = np.nan

for idx, row in df_projects.iterrows():
    status = row['Status']
    
    if pd.isna(status):
        continue

    
    if '(' in status and ')' in status:
        status_splitted = status.split('(')
        df_projects.at[idx, 'status_cleaned'] = status_splitted[0].strip()
        df_projects.at[idx, 'year'] = status_splitted[1].replace(')', '').strip()
    else:
        df_projects.at[idx, 'status_cleaned'] = status.strip()


df_projects['year'] = pd.to_numeric(df_projects['year'], errors='coerce')


df_projects.drop(['Status'], axis=1, inplace=True)

# Convert string to numeric
df_projects['HREE percentage'] = pd.to_numeric(df_projects['HREE percentage'].str.replace(',', '.'), errors='coerce')

# Fill NaNs in 'year' 
df_projects['year'] = df_projects['year'].fillna(-1)

# Fill NaNs in 'HREE percentage' 
mean_value = df_projects['HREE percentage'].mean()
df_projects['HREE percentage'] = df_projects['HREE percentage'].fillna(mean_value)

cols_to_drop = ['REE pattern Ref.', 'Project and status Ref.', 'Deposit type Ref.', 'Resource and grade Ref.', 'Unnamed: 30']
df_projects.drop(cols_to_drop, axis=1, inplace=True)

status_splits = df_projects['status_cleaned'].str.split(',').apply(pd.Series, 1).stack()
status_splits.index = status_splits.index.droplevel(-1)
status_splits.name = 'status_cleaned_new'
df_projects = df_projects.join(status_splits)
df_projects['status_cleaned_new'] = df_projects['status_cleaned_new'].str.strip()  


status_mapping = {
    '1': 'Early Stage',
    '2': 'Middle Stage',
    '3': 'Late Stage',
    '4': 'Production',
    'T': 'Testing',
    'D': 'Development',
    'S': 'Suspending',
    'C': 'Construction',
    'N': 'Not Use'
    # Add mappings for other statuses as needed
}
df_projects['status_simplified'] = df_projects['status_cleaned_new'].map(status_mapping)

df_projects.loc[df_projects['status_cleaned_new'] == '1&T', 'status_cleaned_new'] = 'Early Stage' # or 'Test of Metallurgical', as appropriate
df_projects.loc[df_projects['status_cleaned_new'] == '2021', 'status_cleaned_new'] = np.nan # Treat '2021' as NaN
df_projects.loc[df_projects['status_cleaned_new'] == 'N', 'status_cleaned_new'] = np.nan # Treat 'N' as NaN


In [5]:

unique_values = df_projects['status_cleaned_new'].unique()
print(unique_values)

['4' '2' '1' '3' 'D' 'T' 'S' 'C' 'Early Stage' nan]


In [6]:
# Define input features and target variable
input_features = ['Deposit No.', 'Company Name', 'Location', 'Continent', 'Deposit type', 'HREE percentage', 'year']
target_variable = 'status_cleaned'

# Prepare data for train/test split
X = df_projects[input_features]
y = df_projects[target_variable]

# Normalize X
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Replace NaNs
#X_train = np.nan_to_num(0)
#X_test = np.nan_to_num(0)

#y_train = np.nan_to_num(0)
#y_test = np.nan_to_num(0)

print(X_train.dtype)
print(X_test.dtype)
print(y_train.dtype)
print(y_test.dtype)

float64
float64
object
object


Predicting REE amounts in a project based on Location, Continent, and Deposit type
-----------------------------------------------


In [7]:
elements = ['La2O3', 'Ce2O3', 'Pr6O11', 'Nd2O3', 'Sm2O3', 'Eu2O3', 'Gd2O3', 'Tb4O7', 'Dy2O3', 'Ho2O3', 'Er2O3', 'Tm2O3', 'Yb2O3', 'Lu2O3', 'Y2O3']

for element in elements:
    df_projects[element] = df_projects[element].replace('-', np.nan)
    df_projects[element] = df_projects[element].str.replace(',', '.').astype(float)

df_projects = df_projects.dropna(subset=elements)

input_features = ['Location', 'Continent', 'Deposit type']
target_variables = elements

X = df_projects[input_features]
y = df_projects[target_variables]

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, activation='relu', input_shape=[len(input_features)]),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dense(len(target_variables))  
])

model.compile(loss='mean_squared_error', optimizer='adam')

model.fit(X_train, y_train, epochs=10, validation_split=0.2)

predictions = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Overall Test RMSE: {rmse}')

y_pred_df = pd.DataFrame(predictions, columns=target_variables)

rmse_values = {}

for element in target_variables:
    mse = mean_squared_error(y_test[element], y_pred_df[element])
    rmse_values[element] = sqrt(mse)

for element, rmse in rmse_values.items():
    print(f'RMSE for {element}: {rmse}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Overall Test RMSE: 13.971162801542093
RMSE for La2O3: 24.34748011035836
RMSE for Ce2O3: 41.68715974839547
RMSE for Pr6O11: 4.376922544263042
RMSE for Nd2O3: 14.885880670496979
RMSE for Sm2O3: 2.1857432233696863
RMSE for Eu2O3: 0.6569728372571835
RMSE for Gd2O3: 1.9954571966040606
RMSE for Tb4O7: 0.3435716324225767
RMSE for Dy2O3: 2.774669436293423
RMSE for Ho2O3: 0.49334425101842083
RMSE for Er2O3: 1.6219806070795328
RMSE for Tm2O3: 0.3431000490795962
RMSE for Yb2O3: 1.619629131937283
RMSE for Lu2O3: 0.20189353642631178
RMSE for Y2O3: 18.27214343129778


Don't run the below models
----------------------------------------------------

In [None]:

elements = ['La2O3', 'Ce2O3', 'Pr6O11', 'Nd2O3', 'Sm2O3', 'Eu2O3', 'Gd2O3', 'Tb4O7', 'Dy2O3', 'Ho2O3', 'Er2O3', 'Tm2O3', 'Yb2O3', 'Lu2O3', 'Y2O3']

for element in elements:
    df_projects[element] = df_projects[element].replace('-', np.nan)
    df_projects[element] = df_projects[element].str.replace(',', '.').astype(float)


df_projects = df_projects.dropna(subset=elements)

input_features = ['Location', 'Continent', 'Deposit type']
target_variables = elements

X = df_projects[input_features]
y = df_projects[target_variables]


scaler = StandardScaler()
X = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, activation='relu', input_shape=[len(input_features)]),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dense(len(target_variables))  
])


model.compile(loss='mean_squared_error', optimizer='adam')


model.fit(X_train, y_train, epochs=10, validation_split=0.2)


predictions = model.predict(X_test)


rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Test RMSE: {rmse}')


In [None]:

elements = ['La2O3', 'Ce2O3', 'Pr6O11', 'Nd2O3', 'Sm2O3', 'Eu2O3', 'Gd2O3', 'Tb4O7', 'Dy2O3', 'Ho2O3', 'Er2O3', 'Tm2O3', 'Yb2O3', 'Lu2O3', 'Y2O3']

for element in elements:
    df_projects[element] = df_projects[element].replace('-', np.nan)
    df_projects[element] = df_projects[element].str.replace(',', '.').astype(float)

df_projects = df_projects.dropna(subset=elements)

input_features = ['Location', 'Continent', 'Deposit type']
target_variables = elements

X = df_projects[input_features]
y = df_projects[target_variables]

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(64, activation='relu', input_shape=[len(input_features)]),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dense(len(target_variables))
])

model.compile(loss='mean_squared_error', optimizer='adam')

model.fit(X_train, y_train, epochs=10, validation_split=0.2)

y_pred = model.predict(X_test)


y_pred_df = pd.DataFrame(y_pred, columns=target_variables)


rmse_values = {}


for element in target_variables:
    mse = mean_squared_error(y_test[element], y_pred_df[element])
    rmse_values[element] = sqrt(mse)

for element, rmse in rmse_values.items():
    print(f'RMSE for {element}: {rmse}')
