In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
import pickle
import matplotlib.pyplot as plt

# Function to select kbest feature by passing independent and dependent variables from the dataset
def selectkbest(indep_X,dep_Y,n):
    test=SelectKBest(score_func=chi2,k=n)
    fit1=test.fit(indep_X,dep_Y)
    selectkfeatures = fit1.transform(indep_X)
    return selectkfeatures

# Splitting the dataset to test and train and preprocessing them using standardscaler
def split_scalar(indep_X,dep_Y):
  X_train, X_test, Y_train, Y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
  sc= StandardScaler()
  X_train=sc.fit_transform(X_train)
  X_test = sc.transform(X_test)
  return X_train, X_test, Y_train, Y_test

# Confusion matrix and accuracy prediction 
def cm_prediction(classifier,X_test):
    y_pred=classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix
    cm=confusion_matrix(Y_test, y_pred)
    from sklearn.metrics import accuracy_score 
    from sklearn.metrics import classification_report
    Accuracy=accuracy_score(Y_test, y_pred)
    report=classification_report(Y_test, y_pred)
    return classifier, Accuracy, report, X_test, Y_test,cm

# Function to create a model using logisctic regression 
def logistic(X_train,Y_train,X_test):
    from sklearn.linear_model import LogisticRegression
    classifier=LogisticRegression(random_state=0)
    classifier.fit(X_train,Y_train)
    classifier, Accuracy, report, X_test,Y_test,cm = cm_prediction(classifier,X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm

# function for svm model
def svm_linear(X_train, Y_train, X_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='linear', random_state=0)
    classifier.fit(X_train, Y_train)
    classifier,Accuracy, report, X_test, Y_test, cm =cm_prediction(classifier,X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm

# function for svm non-linear model
def svm_NL(X_train, Y_train, X_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='rbf', random_state=0)
    classifier.fit(X_train,Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm =cm_prediction(classifier,X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm

# function for naive_bayes
def Navie(X_train,Y_train, X_test):
    from sklearn.naive_bayes import GaussianNB
    classifier=GaussianNB()
    classifier.fit(X_train,Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm

# function for knn
def knn(X_train, Y_train, X_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier= KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p=2)
    classifier.fit(X_train, Y_train)
    classifier,Accuracy,report,X_test,Y_test, cm = cm_prediction(classifier,X_test)
    return classifier,Accuracy,report,X_test,Y_test, cm

# function for decision tree
def Decision(X_train, Y_train, X_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train,Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier,X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm

# function for random forest tree
def random(X_train, Y_train, X_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier=RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train,Y_train)
    classifier, Accuracy, report, X_test, Y_test, cm = cm_prediction(classifier, X_test)
    return classifier, Accuracy, report, X_test, Y_test, cm
    
# function for displaying the selectkbest for all the algorithms
def selectk_Classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf):
    dataframe=pd.DataFrame(index=['Chisquare'], columns=['Logistic','SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random'])
    for number,idex in enumerate(dataframe.index):
        dataframe['Logistic'][idex]=acclog[number]
        dataframe['SVMl'][idex]=accsvml[number]
        dataframe['SVMnl'][idex]=accsvmnl[number]
        dataframe['KNN'][idex]=accknn[number]
        dataframe['Navie'][idex]=accnav[number]
        dataframe['Decision'][idex]=accdes[number]
        dataframe['Random'][idex]=accrf[number]
    return dataframe
    

In [99]:
dataset1=pd.read_csv("prep.csv",index_col=None)

In [100]:
df2=dataset1
df2

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [101]:
df2=pd.get_dummies(df2, drop_first=True)

In [102]:
df2

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,False,False,False,False,False,False,True,True,False,True
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,False,False,False,False,False,True,False,False,True
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,False,False,False,False,False,True,False,False,True
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,False,False,False,False,False,True,False,True,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,False,False,False,False,False,True,False,False,True
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,False,False,True,True,False,True,False,True,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,False,False,True,True,False,False,False,False,True
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,False,False,True,True,False,True,False,True,True


In [103]:
indep_X=df2.drop('classification_yes',axis=1)
dep_Y=df2['classification_yes']

In [104]:
indep_X

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes
0,2.000000,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,True,False,False,False,False,False,False,True,True,False
1,3.000000,76.459948,2.0,0.0,148.112676,22.000000,0.700000,137.528754,4.627244,10.700000,...,True,True,False,False,False,False,False,True,False,False
2,4.000000,76.459948,1.0,0.0,99.000000,23.000000,0.600000,138.000000,4.400000,12.000000,...,True,True,False,False,False,False,False,True,False,False
3,5.000000,76.459948,1.0,0.0,148.112676,16.000000,0.700000,138.000000,3.200000,8.100000,...,True,True,False,False,False,False,False,True,False,True
4,5.000000,50.000000,0.0,0.0,148.112676,25.000000,0.600000,137.528754,4.627244,11.800000,...,True,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,0.0,0.0,219.000000,36.000000,1.300000,139.000000,3.700000,12.500000,...,True,True,False,False,False,False,False,True,False,False
395,51.492308,70.000000,0.0,2.0,220.000000,68.000000,2.800000,137.528754,4.627244,8.700000,...,True,True,False,False,True,True,False,True,False,True
396,51.492308,70.000000,3.0,0.0,110.000000,115.000000,6.000000,134.000000,2.700000,9.100000,...,True,True,False,False,True,True,False,False,False,False
397,51.492308,90.000000,0.0,0.0,207.000000,80.000000,6.800000,142.000000,5.500000,8.500000,...,True,True,False,False,True,True,False,True,False,True


In [105]:
dep_Y

0       True
1       True
2       True
3       True
4       True
       ...  
394     True
395     True
396     True
397     True
398    False
Name: classification_yes, Length: 399, dtype: bool

In [130]:
kbest=selectkbest(indep_X, dep_Y, 6)

In [131]:
acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
accnav=[]
accdes=[]
accrf=[]

In [132]:
X_train, X_test, Y_train, Y_test = split_scalar(kbest,dep_Y)

In [133]:
classifier,Accuracy,report,X_test,Y_test,cm = logistic(X_train, Y_train, X_test)
acclog.append(Accuracy)

In [134]:
classifier,Accuracy,report,X_test, Y_test, cm = svm_linear(X_train,Y_train,X_test)
accsvml.append(Accuracy)

In [135]:
classifier,Accuracy,report,X_test,Y_test,cm=svm_NL(X_train,Y_train,X_test)
accsvmnl.append(Accuracy)

In [136]:
classifier,Accuracy,report,X_test,Y_test,cm=Navie(X_train,Y_train,X_test)
accnav.append(Accuracy)

In [137]:
classifier,Accuracy,report,X_test,Y_test,cm = knn(X_train,Y_train,X_test)
accknn.append(Accuracy)

In [138]:
classifier,Accuracy,report,X_test,Y_test,cm = Decision(X_train,Y_train,X_test)
accdes.append(Accuracy)

In [139]:
classifier,Accuracy,report,X_test,Y_test,cm= random(X_train,Y_train,X_test)
accrf.append(Accuracy)

In [140]:
result=selectk_Classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataframe['Logistic'][idex]=acclog[number]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame

In [141]:
result
#6

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
Chisquare,0.95,0.96,0.96,0.93,0.89,0.97,0.97


In [129]:
result
#4

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
Chisquare,0.85,0.82,0.83,0.86,0.79,0.89,0.89


In [117]:
result
#5

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random
Chisquare,0.94,0.94,0.95,0.89,0.83,0.96,0.95
