# Variable Selection

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import keras
import warnings

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, label_binarize, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from keras.models import Sequential
from keras.layers import Dense
from hpelm import ELM
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, auc, accuracy_score
warnings.filterwarnings("ignore")

2023-05-02 20:41:20.999710: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv("Group_14_data_cleaned.csv")
data.head()

Unnamed: 0,Auction_ID,Bidder_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,732,_***i,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,0.444444,0
1,732,g***r,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,0.444444,0
2,732,t***p,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,0.444444,0
3,732,7***n,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,0.444444,0
4,900,z***z,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,0.666667,0


In [3]:
# Split the data into training and testing sets
X = data.drop(["Auction_ID", 'Class', 'Bidder_ID'], axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler


# Standardize the features using the StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a Lasso regression model on the training data
alpha = 0.004  # regularization strength
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)

# Select the features with non-zero coefficients
selected_features_Lasso = X.columns[lasso.coef_ != 0]

# Print the selected features
print("Selected features using Lasso:", selected_features_Lasso)


corr = data.corr()['Class'].sort_values(ascending=False)
selected_features = corr[corr > 0.1].index.tolist()
selected_features.remove('Class')

# Print the selected features
print("Selected features using correlations: ", selected_features)


Selected features using Lasso: Index(['Successive_Outbidding', 'Last_Bidding', 'Winning_Ratio',
       'Auction_Duration'],
      dtype='object')
Selected features using correlations:  ['Successive_Outbidding', 'Bidding_Ratio', 'Winning_Ratio', 'Bidder_Tendency']


In [5]:
print(corr)

Class                     1.000000
Successive_Outbidding     0.901035
Bidding_Ratio             0.569435
Winning_Ratio             0.394122
Bidder_Tendency           0.295533
Last_Bidding              0.097655
Early_Bidding             0.053570
Auction_Bids              0.044964
Starting_Price_Average    0.042604
Auction_Duration          0.021145
Auction_ID               -0.007985
Name: Class, dtype: float64


let's look at correlations. Correlation measures how strong a relationship exists between two variables. In feature selection, we look at the correlation between the independent variables and the target variable. If the correlation is high, it indicates that the independent variable has a strong relationship with the target variable and hence, may be a good predictor. In this case, we can see that 'Successive_Outbidding', 'Starting_Price_Average', 'Early_Bidding', 'Winning_Ratio', and 'Class' are highly correlated with each other.

However, correlation alone may not be the best method for feature selection, as it only measures linear relationships between variables. It may not capture the complex relationships that may exist in the data. In other words, two variables may be highly related, but not necessarily in a linear fashion.

This is where Lasso comes in. Lasso is a regularization technique that is commonly used for feature selection. It works by shrinking the coefficients of less important variables towards zero, effectively reducing the number of features in the model. Lasso is particularly useful when dealing with high-dimensional data, where the number of features is much larger than the number of samples.

In this case, we can see that Lasso has selected a subset of features that are important for predicting the target variable, namely 'Bidder_Tendency', 'Successive_Outbidding', 'Last_Bidding', 'Starting_Price_Average', 'Winning_Ratio', and 'Auction_Duration'. By using Lasso, we can effectively reduce the number of features while still maintaining good predictive performance.

# Training the best models from part 2 and part 3  on selected features using **LASSO**


In [6]:
# Split the data into training and testing sets
X = data[['Bidder_Tendency', 'Successive_Outbidding', 'Last_Bidding',
       'Winning_Ratio']]
# X = data[['Successive_Outbidding', 'Bidding_Ratio', 'Winning_Ratio', 'Bidder_Tendency']]
y = data['Class']
X_train_selected, X_test_selected, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
# Define the neural network model
def create_model_(num_layers=1, num_neurons=10):
    model = Sequential()
    model.add(Dense(num_neurons, input_dim=X_train_selected.shape[1], activation='relu'))
    for i in range(num_layers):
        model.add(Dense(num_neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [8]:
model = MLPClassifier(hidden_layer_sizes=(64, 64), random_state=42)
model.fit(X_train_selected, y_train)

In [9]:
# BEST MODEL FROM PART - 2 (Random Forest)
# Best hyperparameters:  
# {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

# Define the random forest model
rf_model = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf=1, min_samples_split=5)
rf_model.fit(X_train_selected, y_train)
y_pred_rf = rf_model.predict(X_test_selected)
acc_rf = accuracy_score(y_test, y_pred_rf)

In [10]:
# Evaluate the model on the testing set
accuracy_nn = model.score(X_test_selected, y_test)
# Print the accuracy without rounding
print("Accuracy of the best part 2 model (Rf) on selected features : {:.4f}".format(acc_rf))
print("Accuracy of the best part 3 model (MLP) on selected features: {:.4f}".format(accuracy_nn))

Accuracy of the best part 2 model (Rf) on selected features : 0.9731
Accuracy of the best part 3 model (MLP) on selected features: 0.9771


In [11]:
y_pred_nn = model.predict(X_test_selected)
y_pred_nn = np.round(y_pred_nn)
nn_report = classification_report(y_test, y_pred_nn)
nn_cnf_matrix = confusion_matrix(y_test, y_pred_nn)


y_pred_rf = rf_model.predict(X_test_selected)
rf_report = classification_report(y_test, y_pred_rf)
rf_cnf_matrix = confusion_matrix(y_test, y_pred_rf)

In [12]:
print("RF Report:")
print(nn_cnf_matrix)
print(rf_report)

print("-- -- -- -- -- -- -- -- -- -- -- -- ")

print("NN Report:")
print(rf_cnf_matrix)
print(nn_report)

RF Report:
[[1108   27]
 [   2  128]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1135
           1       0.83      0.92      0.88       130

    accuracy                           0.97      1265
   macro avg       0.91      0.95      0.93      1265
weighted avg       0.97      0.97      0.97      1265

-- -- -- -- -- -- -- -- -- -- -- -- 
NN Report:
[[1111   24]
 [  10  120]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1135
           1       0.83      0.98      0.90       130

    accuracy                           0.98      1265
   macro avg       0.91      0.98      0.94      1265
weighted avg       0.98      0.98      0.98      1265

