In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Correct import
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Load
# ---------------------------------------------------------------------------
# The workbook is read without a header row, so pandas labels the columns
# 0, 1, 2, ... ; column 0 is the reference value and every other column is
# one wavelength reading.
file_path = '/kaggle/input/combine-raw-with-ref/COMBINE_RAW WITH REF.xlsx'
sheet_name = 0  # first sheet of the workbook
data = pd.read_excel(file_path, header=None, sheet_name=sheet_name)

# ---------------------------------------------------------------------------
# Target / features
# ---------------------------------------------------------------------------
y = data.iloc[:, 0]   # reference numbers (regression target)
X = data.iloc[:, 1:]  # wavelength values (model inputs)

# Quick visual sanity check of what was loaded.
print("Data Head:")
print(data.head())

# ---------------------------------------------------------------------------
# Hold-out split (80 % train / 20 % test, fixed seed for repeatability)
# ---------------------------------------------------------------------------
# NOTE(review): the first rows of the sheet share the same reference value.
# If those are repeat scans of one physical sample, a purely random row split
# places replicates on both sides and the test score will look optimistic.
# Confirm with the data owner whether a grouped split is required.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------------------------------------------
# Fit
# ---------------------------------------------------------------------------
print("\nTraining the Random Forest model...")
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Evaluate on the held-out rows
# ---------------------------------------------------------------------------
y_pred = rf_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# NOTE(review): two decimals may round a small MSE down to 0.00 -- check the
# raw `mse` value before quoting it.
print("\nModel Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

# ---------------------------------------------------------------------------
# Feature importances, most influential wavelength column first
# ---------------------------------------------------------------------------
importances = rf_regressor.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
)
print("\nFeature Importances:")
print(feature_importance_df.sort_values(by='Importance', ascending=False))


Data Head:
    0         1         2         3         4         5         6         7    \
0  4.52  0.122818  0.116610  0.121734  0.122586  0.120086  0.118656  0.113658   
1  4.52  0.127544  0.119294  0.126012  0.120034  0.116881  0.117877  0.120391   
2  4.52  0.123760  0.118398  0.129466  0.116654  0.119283  0.114003  0.112175   
3  4.52  0.120008  0.118398  0.125153  0.119186  0.116083  0.120218  0.116637   
4  4.52  0.119075  0.113058  0.117497  0.116654  0.107405  0.114003  0.110698   

        8         9    ...       247       248       249       250       251  \
0  0.116506  0.114872  ...  0.196501  0.196715  0.197291  0.197097  0.196473   
1  0.118690  0.119876  ...  0.196660  0.196657  0.197354  0.196595  0.197537   
2  0.112170  0.116296  ...  0.191195  0.192000  0.192912  0.192454  0.191758   
3  0.112889  0.113453  ...  0.190569  0.190630  0.190143  0.190115  0.190709   
4  0.112889  0.110628  ...  0.185025  0.185078  0.185216  0.185126  0.183828   

        252       253

In [17]:
import numpy as np

# One spectrum to score: a single row of 256 wavelength readings. The values
# must be given in the same column order as the features the model was
# trained on. The array is 2-D (1 x 256) because `predict` expects one row
# per sample.
new_wavelengths = np.array([[
    0.138125616, 0.121994356, 0.121733597, 0.12258599, 0.120086234, 0.128912181,
    0.125701326, 0.124568652, 0.126396102, 0.120722252, 0.124295813, 0.123717092,
    0.131551499, 0.133393417, 0.133170135, 0.130318522, 0.129345393, 0.127393993,
    0.125594625, 0.12434383, 0.123527291, 0.122889098, 0.122492167, 0.122327194,
    0.122200721, 0.121821665, 0.122064925, 0.12174124, 0.121271633, 0.120840758,
    0.12066432, 0.120372084, 0.119904174, 0.119480863, 0.119193902, 0.1188327,
    0.118475534, 0.118306415, 0.118273041, 0.117723034, 0.117362276, 0.116815054,
    0.116956836, 0.116794406, 0.116657887, 0.116508966, 0.11574101, 0.115682371,
    0.115396301, 0.115391965, 0.115027513, 0.115192682, 0.114952807, 0.114956223,
    0.114787473, 0.114182319, 0.114104174, 0.114055222, 0.113838983, 0.114490915,
    0.114345278, 0.114361113, 0.114210008, 0.114607779, 0.115373035, 0.115623093,
    0.116352392, 0.11646661, 0.116809054, 0.117218913, 0.117140798, 0.117071673,
    0.117409562, 0.118210342, 0.118878878, 0.119552447, 0.11974389, 0.120364893,
    0.121161207, 0.121613236, 0.122075889, 0.122658209, 0.123235406, 0.123968421,
    0.12413945, 0.124448595, 0.124248011, 0.124375356, 0.123940781, 0.123923304,
    0.124057141, 0.124058975, 0.124228827, 0.123658005, 0.123422529, 0.122976741,
    0.122672363, 0.122358957, 0.122096043, 0.121407932, 0.1210818, 0.121215754,
    0.121106803, 0.120925148, 0.120306126, 0.119931641, 0.120139854, 0.120014157,
    0.120018408, 0.120195753, 0.119649123, 0.119793622, 0.119499857, 0.119403589,
    0.118833975, 0.118575868, 0.11858716, 0.118530777, 0.118073351, 0.118293502,
    0.118754171, 0.118956522, 0.119359614, 0.119232639, 0.119735662, 0.120898881,
    0.121373543, 0.121825785, 0.123071499, 0.124608783, 0.125632876, 0.126730311,
    0.128265052, 0.129565707, 0.130602433, 0.132321054, 0.133111401, 0.134672706,
    0.135988275, 0.13584759, 0.13610225, 0.137478529, 0.139153782, 0.140718377,
    0.142198437, 0.144424154, 0.147017316, 0.150133573, 0.154293972, 0.159791737,
    0.165598269, 0.171983882, 0.178312546, 0.184920757, 0.192120386, 0.199045155,
    0.204894138, 0.20995598, 0.214811033, 0.218284027, 0.220474841, 0.222364398,
    0.223287714, 0.224046885, 0.223986826, 0.223779594, 0.223134323, 0.223743036,
    0.22325906, 0.222600107, 0.221645476, 0.220752283, 0.219444589, 0.218223663,
    0.217436431, 0.216803581, 0.216436189, 0.215192733, 0.214238284, 0.212958327,
    0.211810566, 0.210831909, 0.210042333, 0.209754383, 0.209047058, 0.208170143,
    0.207056665, 0.206012154, 0.204671164, 0.20472651, 0.204352789, 0.203937256,
    0.203939428, 0.203591443, 0.203394115, 0.203429659, 0.203163416, 0.203870434,
    0.202760989, 0.202814237, 0.203370489, 0.20323151, 0.203381865, 0.203091504,
    0.203203011, 0.202811293, 0.202150094, 0.201374504, 0.201828745, 0.201789463,
    0.200438493, 0.200290915, 0.199909073, 0.198884064, 0.197923934, 0.197615596,
    0.196926663, 0.196313181, 0.195985941, 0.195293588, 0.195120537, 0.19464844,
    0.193860765, 0.193369227, 0.194046925, 0.194555887, 0.194900565, 0.194261982,
    0.194292516, 0.19412707, 0.194442254, 0.195770509, 0.197286772, 0.198435659,
    0.199930331, 0.201509654, 0.203033664, 0.204144861, 0.205222138, 0.205555702,
    0.205512968, 0.205441242, 0.20562503, 0.204195142, 0.203612053, 0.202727073,
    0.20096429, 0.200780817, 0.200234194, 0.199184765, 0.198274635, 0.197130018,
    0.197549423, 0.197504479, 0.197431381, 0.197926611,
]])

# Score the spectrum with the forest trained in the earlier cell; the result
# is a length-1 array holding the predicted reference number.
predicted_ref_no = rf_regressor.predict(new_wavelengths)

print("\nPredicted Reference Number for the input wavelengths:")
print(predicted_ref_no)


Predicted Reference Number for the input wavelengths:
[4.5535]
