In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Function to split the data and scale features
def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

# Function to predict and calculate R² score
def r2_prediction(regressor, X_test, y_test):
    y_pred = regressor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

# Functions to train and evaluate different regression models
def Linear(X_train, y_train, X_test, y_test):
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def svm_linear(X_train, y_train, X_test, y_test):
    regressor = SVR(kernel='linear')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def svm_NL(X_train, y_train, X_test, y_test):
    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def Decision(X_train, y_train, X_test, y_test):
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

def random(X_train, y_train, X_test, y_test):
    regressor = RandomForestRegressor(n_estimators=10, random_state=0)
    regressor.fit(X_train, y_train)
    return r2_prediction(regressor, X_test, y_test)

# Function to perform RFE and return selected features
def rfeFeature(indep_X, dep_Y, n):
    rfemodellist = [
        LinearRegression(),
        SVR(kernel='linear'),
        DecisionTreeRegressor(random_state=0),
        RandomForestRegressor(n_estimators=10, random_state=0)
    ]
    rfelist = []
    
    for model in rfemodellist:
        log_rfe = RFE(estimator=model, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature = indep_X.iloc[:, log_rfe.support_]
        rfelist.append(log_rfe_feature)
    
    return rfelist

In [12]:
# Load and preprocess the data
dataset1 = pd.read_csv("prep.csv", index_col=None)
df2 = pd.get_dummies(dataset1, drop_first=True)

indep_X = df2.drop('classification_yes', axis=1)
dep_Y = df2['classification_yes']

# Perform RFE to select top 3 features
rfelist = rfeFeature(indep_X, dep_Y, 6)
print("Selected Features by RFE for each model:", rfelist)

# Initialize lists to store accuracy metrics
acclin = []
accsvml = []
accsvmnl = []
accdes = []
accrf = []


Selected Features by RFE for each model: [     sg_b  sg_c  sg_d  sg_e  htn_yes  dm_yes
0       0     1     0     0        0       0
1       0     1     0     0        0       0
2       0     0     0     0        0       0
3       0     0     1     0        0       0
4       0     1     0     0        0       0
..    ...   ...   ...   ...      ...     ...
394     0     0     0     0        0       0
395     0     1     0     0        1       1
396     0     1     0     0        1       1
397     0     0     0     0        1       1
398     0     0     0     0        0       0

[399 rows x 6 columns],      sg_c  sg_d  sg_e  rbc_normal  htn_yes  dm_yes
0       1     0     0           1        0       0
1       1     0     0           1        0       0
2       0     0     0           1        0       0
3       0     1     0           1        0       0
4       1     0     0           1        0       0
..    ...   ...   ...         ...      ...     ...
394     0     0     0           1   

In [13]:
# Train and evaluate models using selected features
for features in rfelist:
    X_train, X_test, y_train, y_test = split_scalar(features, dep_Y)
    
    acclin.append(Linear(X_train, y_train, X_test, y_test))
    accsvml.append(svm_linear(X_train, y_train, X_test, y_test))
    accsvmnl.append(svm_NL(X_train, y_train, X_test, y_test))
    accdes.append(Decision(X_train, y_train, X_test, y_test))
    accrf.append(random(X_train, y_train, X_test, y_test))

# Function to create a DataFrame comparing model performances
def rfe_regression(acclin, accsvml, accsvmnl, accdes, accrf): 
    rfedataframe = pd.DataFrame(index=['Linear', 'SVR Linear', 'SVR Non-Linear', 'DecisionTree', 'RandomForest'], 
                                columns=['Linear', 'SVR Linear', 'SVR Non-Linear', 'DecisionTree', 'RandomForest'])
    
    for number, idex in enumerate(rfedataframe.index):
        rfedataframe['Linear'][idex] = acclin[number] if number < len(acclin) else None
        rfedataframe['SVR Linear'][idex] = accsvml[number] if number < len(accsvml) else None
        rfedataframe['SVR Non-Linear'][idex] = accsvmnl[number] if number < len(accsvmnl) else None
        rfedataframe['DecisionTree'][idex] = accdes[number] if number < len(accdes) else None
        rfedataframe['RandomForest'][idex] = accrf[number] if number < len(accrf) else None
    
    return rfedataframe

# Create a DataFrame to compare model performances
result = rfe_regression(acclin, accsvml, accsvmnl, accdes, accrf)

In [5]:
print(result)
#3

                  Linear SVR Linear SVR Non-Linear DecisionTree RandomForest
Linear          0.441961   0.262153       0.262162     0.441961     0.441816
SVR Linear      0.441961   0.262153       0.262162     0.441961     0.441816
SVR Non-Linear  0.664893   0.609652       0.883134     0.965961     0.916304
DecisionTree    0.676174   0.670691       0.900941     0.933504     0.887256
RandomForest        None       None           None         None         None


In [8]:
print(result)
#5

                  Linear SVR Linear SVR Non-Linear DecisionTree RandomForest
Linear          0.620124   0.457136       0.755437      0.77924     0.780135
SVR Linear      0.604508   0.456871       0.758236     0.776474     0.776745
SVR Non-Linear  0.674403   0.628206       0.897334     0.696181     0.815538
DecisionTree    0.686361   0.643365        0.90712     0.836806     0.845303
RandomForest        None       None           None         None         None


In [11]:
result
#4

Unnamed: 0,Linear,SVR Linear,SVR Non-Linear,DecisionTree,RandomForest
Linear,0.60401,0.457046,0.75093,0.776711,0.776492
SVR Linear,0.60401,0.457046,0.75093,0.776711,0.776492
SVR Non-Linear,0.671727,0.628963,0.891143,0.835247,0.8403
DecisionTree,0.681563,0.614992,0.897925,0.96711,0.923559
RandomForest,,,,,


In [14]:
result
#6

Unnamed: 0,Linear,SVR Linear,SVR Non-Linear,DecisionTree,RandomForest
Linear,0.624738,0.456874,0.781049,0.81723,0.814741
SVR Linear,0.610294,0.530043,0.792229,0.806415,0.807916
SVR Non-Linear,0.697365,0.665248,0.898321,0.782986,0.829427
DecisionTree,0.705126,0.670093,0.907012,0.839675,0.875221
RandomForest,,,,,
