In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Function to split the data and scale features
def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

# Function to predict and calculate R² score
def r2_prediction(regressor, X_test, y_test):
    y_pred = regressor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

# Functions to train and evaluate different regression models
def Linear(X_train, y_train, X_test, y_test):
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def svm_linear(X_train, y_train, X_test, y_test):
    regressor = SVR(kernel='linear')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def svm_NL(X_train, y_train, X_test, y_test):
    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def Decision(X_train, y_train, X_test, y_test):
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def random(X_train, y_train, X_test, y_test):
    regressor = RandomForestRegressor(n_estimators=10, random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

# Function to perform RFE and return selected features
def rfeFeature(indep_X, dep_Y, n):
    rfemodellist = [
        LinearRegression(),
        SVR(kernel='linear'),
        DecisionTreeRegressor(random_state=0),
        RandomForestRegressor(n_estimators=10, random_state=0)
    ]
    rfelist = []
    
    for model in rfemodellist:
        log_rfe = RFE(estimator=model, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature = indep_X.iloc[:, log_rfe.support_]
        rfelist.append(log_rfe_feature)
    
    return rfelist



In [2]:
# Load and preprocess the data
dataset1 = pd.read_csv("prep.csv", index_col=None)
df2 = pd.get_dummies(dataset1, drop_first=True)

indep_X = df2.drop('classification_yes', axis=1)
dep_Y = df2['classification_yes']

# Perform RFE to select top 3 features
rfelist = rfeFeature(indep_X, dep_Y, 3)
print("Selected Features by RFE for each model:", rfelist)

# Initialize lists to store accuracy metrics
acclin = []
accsvml = []
accsvmnl = []
accdes = []
accrf = []



Selected Features by RFE for each model: [     sg_c  sg_d  sg_e
0       1     0     0
1       1     0     0
2       0     0     0
3       0     1     0
4       1     0     0
..    ...   ...   ...
394     0     0     0
395     1     0     0
396     1     0     0
397     0     0     0
398     0     0     0

[399 rows x 3 columns],      sg_c  sg_d  sg_e
0       1     0     0
1       1     0     0
2       0     0     0
3       0     1     0
4       1     0     0
..    ...   ...   ...
394     0     0     0
395     1     0     0
396     1     0     0
397     0     0     0
398     0     0     0

[399 rows x 3 columns],           hrmo  sg_c  sg_d
0    12.518156     1     0
1    10.700000     1     0
2    12.000000     0     0
3     8.100000     0     1
4    11.800000     1     0
..         ...   ...   ...
394  12.500000     0     0
395   8.700000     1     0
396   9.100000     1     0
397   8.500000     0     0
398  16.300000     0     0

[399 rows x 3 columns],       al       hrmo  sg_d
0    

In [3]:
# Function to create a DataFrame comparing model performances
def rfe_regression(acclog, accsvml, accdes, accrf): 
    rfedataframe = pd.DataFrame(index=['Linear', 'SVC', 'Random', 'DecisionTree'], columns=['Linear', 'SVMl', 'Decision', 'Random'])
    
    for number, idex in enumerate(rfedataframe.index):
        rfedataframe['Linear'][idex] = acclog[number] if number < len(acclog) else None
        rfedataframe['SVMl'][idex] = accsvml[number] if number < len(accsvml) else None
        rfedataframe['Decision'][idex] = accdes[number] if number < len(accdes) else None
        rfedataframe['Random'][idex] = accrf[number] if number < len(accrf) else None
    
    return rfedataframe

# Train and evaluate models using selected features
for i in rfelist:
    X_train, X_test, y_train, y_test = split_scalar(i, dep_Y)
    
    acclin.append(Linear(X_train, y_train, X_test, y_test))
    accsvml.append(svm_linear(X_train, y_train, X_test, y_test))
    accsvmnl.append(svm_NL(X_train, y_train, X_test, y_test))
    accdes.append(Decision(X_train, y_train, X_test, y_test))
    accrf.append(random(X_train, y_train, X_test, y_test))

# Create a DataFrame to compare model performances
result = rfe_regression(acclin, accsvml, accdes, accrf)
print(result)


                Linear      SVMl  Decision    Random
Linear        0.441961  0.262153  0.441961  0.441816
SVC           0.441961  0.262153  0.441961  0.441816
Random        0.664893  0.609652  0.965961  0.916304
DecisionTree  0.676174  0.670691  0.933504  0.887256
