# Importing the necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.layers import LeakyReLU
LeakyReLU(negative_slope=0.01)  # Use negative_slope instead of alpha

from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR



# Loading the Dataset


In [2]:

file_path = "/content/TASK-ML-INTERN.csv"
data = pd.read_csv(file_path)



#  Droppin the non-numeric columns

#  Standardizing the data


In [3]:

data = data.drop(columns=['hsi_id'])


scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(data.iloc[:, :-1])



# Log-transforming target variable to reduce skewness

In [4]:
data['vomitoxin_ppb'] = np.log1p(data['vomitoxin_ppb'])



# Feature Selection using Lasso (increased alpha for better regularization)


In [5]:
lasso = Lasso(alpha=0.005, max_iter=10000)
lasso.fit(X_scaled, data['vomitoxin_ppb'])
selected_features = SelectFromModel(lasso, prefit=True)
X_selected = selected_features.transform(X_scaled)



  model = cd_fast.enet_coordinate_descent(


# Appling PCA with dynamic variance retention


In [6]:
pca = PCA(n_components=0.98)  # Retain 98% variance
X_pca = pca.fit_transform(X_selected)



# Split dataset

In [7]:

X_train, X_test, y_train, y_test = train_test_split(X_pca, data['vomitoxin_ppb'], test_size=0.2, random_state=42)



# Train an optimized XGBoost Regressor

In [8]:

xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=2000,
    learning_rate=0.003,
    max_depth=10,
    subsample=0.9,
    colsample_bytree=0.9,
    early_stopping_rounds=100,
    eval_metric='rmse'
)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
y_pred_xgb = xgb_model.predict(X_test)



# Train an optimized Random Forest Regressor

In [10]:

rf_model = RandomForestRegressor(n_estimators=700, max_depth=18, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)



# Function to evaluate model on both training and test sets

In [11]:

def evaluate_model(model_name, y_train, y_train_pred, y_test, y_test_pred):
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(y_train, y_train_pred)

    mae_test = mean_absolute_error(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test, y_test_pred)

    print(f"--- {model_name} ---")
    print(f"Train -> MAE: {mae_train:.4f}, RMSE: {rmse_train:.4f}, R²: {r2_train:.4f}")
    print(f"Test  -> MAE: {mae_test:.4f}, RMSE: {rmse_test:.4f}, R²: {r2_test:.4f}\n")



# Train an optimized SVR

In [12]:

svr_model = SVR(kernel='rbf', C=10, gamma='scale')
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)




# Build improved neural network model

In [13]:

from tensorflow.keras.layers import Input

model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Explicit input layer
    Dense(512, activation='swish'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(256, activation='swish'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='swish'),
    Dense(1, activation='linear')
])





# Compile model with dynamic learning rate adjustment

In [14]:

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae']
)



# Callbacks for better training stability

In [15]:

callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-5),
    EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
]

# Train the model

In [16]:

history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=500, batch_size=32, verbose=1, callbacks=callbacks)

# Predict on test set
y_pred_nn = model.predict(X_test).flatten()



Epoch 1/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 33ms/step - loss: 24.3651 - mae: 4.3321 - val_loss: 27.7642 - val_mae: 4.8295 - learning_rate: 0.0010
Epoch 2/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 7.6697 - mae: 2.1941 - val_loss: 21.5233 - val_mae: 4.3111 - learning_rate: 0.0010
Epoch 3/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 8.3175 - mae: 2.2883 - val_loss: 19.2094 - val_mae: 4.0958 - learning_rate: 0.0010
Epoch 4/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 6.7579 - mae: 2.1157 - val_loss: 15.2380 - val_mae: 3.6526 - learning_rate: 0.0010
Epoch 5/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 7.1261 - mae: 2.0899 - val_loss: 13.2352 - val_mae: 3.3852 - learning_rate: 0.0010
Epoch 6/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 6.8261 - mae: 2.0200

# Predict on test set

In [17]:

y_pred_nn = model.predict(X_test).flatten()

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


# Function to evaluate model on both training and test sets


In [18]:

def evaluate_model(model_name, y_train, y_train_pred, y_test, y_test_pred):
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(y_train, y_train_pred)

    mae_test = mean_absolute_error(y_test, y_test_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test, y_test_pred)

    print(f"--- {model_name} ---")
    print(f"Train -> MAE: {mae_train:.4f}, RMSE: {rmse_train:.4f}, R²: {r2_train:.4f}")
    print(f"Test  -> MAE: {mae_test:.4f}, RMSE: {rmse_test:.4f}, R²: {r2_test:.4f}\n")

# Predictions for training set

In [19]:

y_train_pred_xgb = xgb_model.predict(X_train)
y_train_pred_rf = rf_model.predict(X_train)
y_train_pred_svr = svr_model.predict(X_train)
y_train_pred_nn = model.predict(X_train).flatten()




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


# Evaluate models on both train and test sets

In [20]:

evaluate_model("XGBoost", y_train, y_train_pred_xgb, y_test, y_pred_xgb)
evaluate_model("Random Forest", y_train, y_train_pred_rf, y_test, y_pred_rf)
evaluate_model("SVR", y_train, y_train_pred_svr, y_test, y_pred_svr)
evaluate_model("Neural Network", y_train, y_train_pred_nn, y_test, y_pred_nn)

--- XGBoost ---
Train -> MAE: 0.7203, RMSE: 0.9549, R²: 0.8971
Test  -> MAE: 2.0458, RMSE: 2.6420, R²: 0.1402

--- Random Forest ---
Train -> MAE: 0.7556, RMSE: 0.9572, R²: 0.8966
Test  -> MAE: 2.0772, RMSE: 2.6427, R²: 0.1397

--- SVR ---
Train -> MAE: 1.4793, RMSE: 2.3996, R²: 0.3501
Test  -> MAE: 1.7142, RMSE: 2.6939, R²: 0.1061

--- Neural Network ---
Train -> MAE: 1.3023, RMSE: 1.7118, R²: 0.6693
Test  -> MAE: 1.8523, RMSE: 2.5333, R²: 0.2095

