In [14]:
%pwd

'/home/sarath_kumar/ImagoAI'

In [15]:
# import os
# os.chdir("../")
# %pwd

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Dropout

In [17]:



# Load dataset
df = pd.read_csv("data/TASK-ML-INTERN.csv")  # Replace with actual dataset
df.drop("hsi_id",axis=1,inplace = True)


# Remove outliers using IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Separate features and target
X = df.iloc[:, :-1].values  # All except last column
y = df.iloc[:, -1].values   # Last column (target variable)

# Normalize input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2,whiten=True,svd_solver="randomized")
X_pca = pca.fit_transform(X_scaled)

# Variance Explained
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Reshape data for CNN and LSTM (samples, time steps, features)
X_reshaped = X_scaled.reshape(X_scaled.shape[0], X_scaled.shape[1], 1)

# Train-Test Split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)


Explained Variance Ratio: [0.86843952 0.05101268]


In [18]:
print(X_train.shape)

(310, 448, 1)


In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# Build Optimized CNN Model for Numerical Data
cnn_model = Sequential([
    Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.001), input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=256, kernel_size=3, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=128, kernel_size=3, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    Flatten(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.4),

    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),

    Dense(1, activation='linear')  # Regression output
])

# Compile Model
cnn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
# Implement Early Stopping to Prevent Underfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, mode='min')

# Train Model
cnn_model.fit(X_train, y_train, epochs=50, batch_size=10, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 93ms/step - loss: 1188479.2500 - mae: 658.9849 - val_loss: 888972.1875 - val_mae: 564.0150
Epoch 2/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 78ms/step - loss: 733957.0000 - mae: 539.8466 - val_loss: 750207.3125 - val_mae: 508.6580
Epoch 3/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 83ms/step - loss: 1029242.0625 - mae: 689.2980 - val_loss: 741739.0625 - val_mae: 522.3793
Epoch 4/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 80ms/step - loss: 701180.3750 - mae: 588.5511 - val_loss: 707154.3125 - val_mae: 541.9195
Epoch 5/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 75ms/step - loss: 646483.6250 - mae: 578.6796 - val_loss: 705272.6250 - val_mae: 565.2292
Epoch 6/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 77ms/step - loss: 725181.5625 - mae: 616.9727 - val_loss: 728392.3125 - val_mae: 623.1910
Epoch 7/50
[1m31/3

<keras.src.callbacks.history.History at 0x7fd94843a8c0>

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictions
y_pred_cnn = cnn_model.predict(X_test)
# y_pred_lstm = lstm_model.predict(X_test)

# Compute metrics
def evaluate_model(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")

evaluate_model(y_test, y_pred_cnn)
# evaluate_model(y_test, y_pred_lstm, "LSTM")


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
MAE: 497.8959
RMSE: 703.5058
R² Score: 0.1743


### Interpretation of Metrics
**Mean Absolute Error (MAE: 497.8959):**
 - On average, the model's predictions are off by 497.89 units from the actual values.

**Root Mean Squared Error (RMSE: 703.5058):**
 - The model has a relatively high RMSE, indicating that large errors exist in predictions.


**R² Score (0.1743):**
 - The model explains only 17% of the variance in the target variable, which suggests poor predictive performance.

## TSNE


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt
from keras_tuner import RandomSearch

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# Load dataset
df = pd.read_csv("data/TASK-ML-INTERN.csv")  # Replace with actual dataset
df.drop("hsi_id", axis=1, inplace=True)

# Remove outliers using IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Separate features and target
X = df.iloc[:, :-1].values  # All except last column
y = df.iloc[:, -1].values   # Last column (target variable)

# Normalize input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply t-SNE
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Reshape data for CNN and LSTM (samples, time steps, features)
X_reshaped = X_scaled.reshape(X_scaled.shape[0], X_scaled.shape[1], 1)

# Train-Test Split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)


In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2

# Build Optimized CNN Model for Numerical Data
cnn_model = Sequential([
    Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=l2(0.001), input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=256, kernel_size=3, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=128, kernel_size=3, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),

    Flatten(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.4),

    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),

    Dense(1, activation='linear')  # Regression output
])

# Compile Model
cnn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
# Implement Early Stopping to Prevent Underfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, mode='min')

# Train Model
cnn_model.fit(X_train, y_train, epochs=20, batch_size=10, validation_data=(X_test, y_test), callbacks=[early_stopping])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 109ms/step - loss: 1021472.8750 - mae: 594.8243 - val_loss: 897742.6875 - val_mae: 568.6235
Epoch 2/20
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 96ms/step - loss: 582689.5625 - mae: 522.1036 - val_loss: 719694.5625 - val_mae: 502.8279
Epoch 3/20
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 99ms/step - loss: 751694.1250 - mae: 629.3843 - val_loss: 682398.3750 - val_mae: 509.4200
Epoch 4/20
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 95ms/step - loss: 879750.3125 - mae: 622.3510 - val_loss: 663486.2500 - val_mae: 508.3220
Epoch 5/20
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 95ms/step - loss: 722706.7500 - mae: 582.5474 - val_loss: 631469.5000 - val_mae: 528.9585
Epoch 6/20
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 110ms/step - loss: 647799.6250 - mae: 576.0213 - val_loss: 624882.3750 - val_mae: 521.6489
Epoch 7

<keras.src.callbacks.history.History at 0x7fd968ad3d00>

In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictions
y_pred_cnn = cnn_model.predict(X_test)


# Compute metrics
def evaluate_model(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")

evaluate_model(y_test, y_pred_cnn)



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
MAE: 514.5278
RMSE: 752.6269
R² Score: 0.0550


#### **Model Performance Evaluation**

#### **1. Evaluation Metrics**
The regression model was evaluated using **Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and R² Score**.

- **MAE:** `514.5278`  
  - This means that, on average, the model's predictions deviate by approximately **514.53** units from the actual values.  

- **RMSE:** `752.6269`  
  - The RMSE indicates that the model's errors have a standard deviation of **752.63** units.  
  - Since RMSE penalizes large errors more than MAE, it suggests the presence of some outliers or high-variance errors.

- **R² Score:** `0.0550`  
  - The R² score of **0.0550** indicates that the model explains only **5.5% of the variance** in the target variable.  
  - A low R² value suggests that the model is **not capturing the underlying pattern well** and might be underfitting.


#### **2. Key Observations**
- The **slight improvement in R² Score** (from `0.0371` to `0.0550`) suggests **minor enhancement**, but the model **still performs poorly**.
- High **MAE and RMSE values** indicate that the model **lacks accuracy** in predictions.
- A **low R² Score** suggests that **features may not be strongly correlated with the target** or that **feature selection and engineering need improvement**.

  
   
