In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import pickle


In [2]:
# Function for Recursive Feature Elimination
def rfeFeature(indep_X, dep_Y, n):
#creating a list to do RFE for all the models
   rfelist=[]
   log_model = LogisticRegression(solver='lbfgs',max_iter=1000)
   RF = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
   DT = DecisionTreeClassifier(criterion='gini', max_features='sqrt',splitter='best',random_state=0)
   svc_model=SVC(kernel='linear', random_state=0)
   rfemodellist=[log_model,RF,DT,svc_model]
#using for loop, determining feature using RFE for the required models
   for i in rfemodellist:
       print(i)
       log_rfe = RFE(estimator=i, n_features_to_select=n)
       log_fit=log_rfe.fit(indep_X, dep_Y)
       rfelist.append(log_rfe.support_)
   return rfelist

# Function to split test and train from the dataset and to preprocess the input using standard scaler
def split_scalar(indep_X,dep_Y):
    X_train,X_test,Y_train,Y_test=train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    # Feature scaling
    from sklearn.preprocessing import StandardScaler
    sc=StandardScaler()
    X_train=sc.fit_transform(X_train)
    X_test=sc.transform(X_test)
    return X_train,X_test,Y_train,Y_test


# Function for Confusion matrix and accuracy
def cm_prediction(classifier,X_test):
    y_pred=classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix
    cm=confusion_matrix(Y_test, y_pred)
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    Accuracy=accuracy_score(Y_test,y_pred)
    report=classification_report(Y_test,y_pred)
    return classifier,Accuracy,report,X_test,Y_test,cm

# Function for Logistic model
def logistic(X_train,Y_train,X_test):
    from sklearn.linear_model import LogisticRegression
    classifier=LogisticRegression(random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy, report,X_test,Y_test,cm=cm_prediction(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

# Funcrion for svm linear model
def svm_linear(X_train,Y_train,X_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='linear', random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report,X_test,Y_test,cm=cm_prediction(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test,cm

# Function for svm non linear model

def svm_NL(X_train,Y_train,X_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='rbf', random_state=0)
    classifier.fit(X_train,Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier,X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm

# function for Naive bayes model

def Naive(X_train,Y_train,X_test):
    from sklearn.naive_bayes import GaussianNB
    classifier=GaussianNB()
    classifier.fit(X_train,Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm 

# function for knn model
def knn(X_train, Y_train, X_test):
    from sklearn.neighbors  import KNeighborsClassifier
    classifier=KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(X_train,Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm

# function for Decision tree
def Decision(X_train, Y_train,X_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier=DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train,Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier,X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm

# function for Random forest 
def random(X_train, Y_train, X_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier=RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train,Y_train)
    classifier,Accuracy,report, X_test,Y_test, cm = cm_prediction(classifier,X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm

# Function for preparing table for the RFE with best accuracy for the models
def RFE_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    rfedataframe=pd.DataFrame(index=['Logistic','SVC','Random','DecisionTree'],columns=['Logistic', 'SVML','SVMnl','KNN','Naive',
                                                                                        'Decision','Random'])
    for number,idex in enumerate(rfedataframe.index):
        rfedataframe['Logistic'][idex]=acclog[number]
        rfedataframe['SVML'][idex]=accsvml[number]
        rfedataframe['SVMnl'][idex]=accsvmnl[number]
        rfedataframe['KNN'][idex]=accknn[number]
        rfedataframe['Naive'][idex]=accnav[number]
        rfedataframe['Decision'][idex]=accdes[number]
        rfedataframe['Random'][idex]=accrf[number]
    return rfedataframe

    

In [3]:
dataset1=pd.read_csv("prep.csv",index_col=None)


In [4]:
df2=dataset1

In [5]:
df2=pd.get_dummies(df2, drop_first=True)

In [6]:
indep_X=df2.drop('classification_yes',axis=1)

In [7]:
dep_Y=df2['classification_yes']

In [8]:
indep_X

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,True,False,False,False,False,False,False,True,True,False
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,True,False,False,False,False,False,True,False,False
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,True,False,False,False,False,False,True,False,False
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,True,False,False,False,False,False,True,False,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,True,False,False,False,False,False,True,False,False
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,True,False,False,True,True,False,True,False,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,True,False,False,True,True,False,False,False,False
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,True,False,False,True,True,False,True,False,True


In [9]:
dep_Y

0       True
1       True
2       True
3       True
4       True
       ...  
394     True
395     True
396     True
397     True
398    False
Name: classification_yes, Length: 399, dtype: bool

In [17]:
rfelist=rfeFeature(indep_X,dep_Y,4)

LogisticRegression(max_iter=1000)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
DecisionTreeClassifier(max_features='sqrt', random_state=0)
SVC(kernel='linear', random_state=0)


In [18]:
acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]


In [19]:
for i in rfelist:
    X_selected=indep_X.iloc[:,i]
    X_train,X_test,Y_train,Y_test=split_scalar(X_selected,dep_Y)
    classifier,Accuracy,report,X_test,Y_test,cm =logistic(X_train,Y_train,X_test)
    acclog.append(Accuracy)
    classifier,Accuracy,report,X_test,Y_test,cm = svm_linear(X_train,Y_train,X_test)
    accsvml.append(Accuracy)
    classifier,Accuracy,report,X_test,Y_test,cm = svm_NL(X_train,Y_train,X_test)
    accsvmnl.append(Accuracy)
    classifier,Accuracy,report,X_test,Y_test,cm = Naive(X_train,Y_train,X_test)
    accnav.append(Accuracy)
    classifier,Accuracy,report,X_test,Y_test,cm = knn(X_train,Y_train,X_test)
    accknn.append(Accuracy)
    classifier,Accuracy,report,X_test,Y_test,cm = Decision(X_train,Y_train,X_test)
    accdes.append(Accuracy)
    classifier,Accuracy,report,X_test,Y_test,cm = random(X_train,Y_train,X_test)
    accrf.append(Accuracy)

result = RFE_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf)



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  rfedataframe['Logistic'][idex]=acclog[number]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFr

In [16]:
result
#3

Unnamed: 0,Logistic,SVML,SVMnl,KNN,Naive,Decision,Random
Logistic,0.94,0.94,0.94,0.94,0.94,0.94,0.94
SVC,0.91,0.92,0.93,0.93,0.86,0.91,0.94
Random,0.93,0.93,0.94,0.95,0.74,0.95,0.97
DecisionTree,0.87,0.87,0.87,0.87,0.87,0.87,0.87


In [20]:
result
#4

Unnamed: 0,Logistic,SVML,SVMnl,KNN,Naive,Decision,Random
Logistic,0.95,0.95,0.95,0.95,0.95,0.95,0.95
SVC,0.93,0.93,0.94,0.93,0.91,0.91,0.94
Random,0.97,0.97,0.97,0.96,0.84,0.96,0.96
DecisionTree,0.96,0.96,0.96,0.96,0.96,0.96,0.96


In [23]:
result
#5

Unnamed: 0,Logistic,SVML,SVMnl,KNN,Naive,Decision,Random
Logistic,0.98,0.98,0.98,0.98,0.98,0.98,0.98
SVC,0.97,0.97,0.97,0.96,0.87,0.93,0.97
Random,0.97,0.98,0.98,0.98,0.91,0.96,0.98
DecisionTree,0.99,0.99,0.99,0.99,0.99,0.99,0.99


In [91]:
result
#6

Unnamed: 0,Logistic,SVML,SVMnl,KNN,Naive,Decision,Random
Logistic,0.98,0.98,0.98,0.98,0.98,0.99,0.98
SVC,0.98,0.98,0.99,0.97,0.93,0.96,0.96
Random,0.97,0.99,0.98,0.98,0.91,0.97,1.0
DecisionTree,0.99,0.99,0.99,0.99,0.99,0.99,0.99
