In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score,precision_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN
from sklearn.model_selection import cross_val_score
from statistics import mean
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from scipy.stats import friedmanchisquare
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

In [None]:
def decision_tree(X_train, X_test, y_train, y_test):
    """Cross-validate and evaluate a decision tree classifier.

    Runs 10-fold cross-validation on the training split and reports the
    fold scores, then fits on the whole training split and prints the
    hold-out metrics through ``results``.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels.

    Returns
    -------
    numpy.ndarray
        The per-fold cross-validation scores. These used to be computed
        and silently discarded.
    """
    # Fixed seed so repeated runs build the same tree.
    dt = tree.DecisionTreeClassifier(random_state=42)
    score = cross_val_score(dt, X_train, y_train, cv=10)
    print("cv scores:", score)
    print("cv mean:", score.mean())
    dt = dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    results(y_pred, y_test)
    return score

In [None]:
def random_forest(X_train, X_test, y_train, y_test):
    """Cross-validate and evaluate a 10-tree random forest classifier.

    Runs 10-fold cross-validation on the training split and reports the
    fold scores, then fits on the whole training split and prints the
    hold-out metrics through ``results``.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels.

    Returns
    -------
    numpy.ndarray
        The per-fold cross-validation scores. These used to be computed
        and silently discarded.
    """
    # Fixed seed so the bootstrap samples and feature draws are repeatable.
    RC = RandomForestClassifier(n_estimators=10, random_state=42)
    score = cross_val_score(RC, X_train, y_train, cv=10)
    print("cv scores:", score)
    print("cv mean:", score.mean())
    RC = RC.fit(X_train, y_train)
    y_pred = RC.predict(X_test)
    results(y_pred, y_test)
    return score

In [None]:
def Gaussian_NB(X_train, X_test, y_train, y_test):
    """Cross-validate and evaluate a Gaussian naive Bayes classifier.

    Runs 10-fold cross-validation on the training split and reports the
    fold scores, then fits on the whole training split and prints the
    hold-out metrics through ``results``.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels.

    Returns
    -------
    numpy.ndarray
        The per-fold cross-validation scores. These used to be computed
        and silently discarded.
    """
    GNB = GaussianNB()
    score = cross_val_score(GNB, X_train, y_train, cv=10)
    print("cv scores:", score)
    print("cv mean:", score.mean())
    GNB = GNB.fit(X_train, y_train)
    y_pred = GNB.predict(X_test)
    results(y_pred, y_test)
    return score

In [None]:
def Dummy_Classifier(X_train, X_test, y_train, y_test):
    """Cross-validate and evaluate a baseline ``DummyClassifier``.

    Runs 10-fold cross-validation on the training split and reports the
    fold scores, then fits on the whole training split and prints the
    hold-out metrics through ``results``. The classifier is built with its
    default strategy.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels.

    Returns
    -------
    numpy.ndarray
        The per-fold cross-validation scores. These used to be computed
        and silently discarded.
    """
    DC = DummyClassifier()
    score = cross_val_score(DC, X_train, y_train, cv=10)
    print("cv scores:", score)
    print("cv mean:", score.mean())
    DC = DC.fit(X_train, y_train)
    y_pred = DC.predict(X_test)
    results(y_pred, y_test)
    return score

In [None]:
def KNeighbors_Classifier(X_train, X_test, y_train, y_test):
    """Cross-validate and evaluate a k-nearest-neighbours classifier.

    Runs 10-fold cross-validation on the training split and reports the
    fold scores, then fits on the whole training split and prints the
    hold-out metrics through ``results``. The classifier is built with its
    default settings.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels.

    Returns
    -------
    numpy.ndarray
        The per-fold cross-validation scores. These used to be computed
        and silently discarded.
    """
    KN = KNeighborsClassifier()
    score = cross_val_score(KN, X_train, y_train, cv=10)
    print("cv scores:", score)
    print("cv mean:", score.mean())
    KN = KN.fit(X_train, y_train)
    y_pred = KN.predict(X_test)
    results(y_pred, y_test)
    return score

In [None]:
def results(y_pred,y_test):
    """Print the evaluation metrics for a set of predictions.

    Computes the confusion matrix, accuracy, recall and precision of
    ``y_pred`` against ``y_test`` and prints them in the order accuracy,
    recall, confusion matrix, precision. Returns nothing.

    Note that the argument order is (predictions, ground truth), while the
    metric functions are called with ground truth first.
    """
    # All metrics are computed before anything is printed.
    matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    report = (
        ("accu:", accuracy),
        ("recall:", recall),
        ("matrix:", matrix),
        ("precision:", precision),
    )
    for label, value in report:
        print(label, value)

In [None]:
def sampling_over(o_X,o_y):
    """Oversample with SMOTE.

    Parameters
    ----------
    o_X : feature matrix.
    o_y : integer class labels (they are passed to ``np.bincount``).

    Returns
    -------
    tuple
        ``(o_X, o_y)`` after resampling. The resulting class counts are
        printed as a side effect.
    """
    # Seeded like the SMOTEENN sampler in this notebook so runs are repeatable.
    smt = SMOTE(random_state=42)
    # fit_resample replaces the old fit_sample name, which newer
    # imbalanced-learn releases no longer provide.
    o_X, o_y = smt.fit_resample(o_X, o_y)
    print("oversample",np.bincount(o_y))
    return o_X,o_y
    

In [None]:
def sampling_under(u_X,u_y):
    """Undersample with NearMiss.

    Parameters
    ----------
    u_X : feature matrix.
    u_y : integer class labels (they are passed to ``np.bincount``).

    Returns
    -------
    tuple
        ``(u_X, u_y)`` after resampling. The resulting class counts are
        printed as a side effect.
    """
    nr = NearMiss()
    # fit_resample replaces the old fit_sample name, which newer
    # imbalanced-learn releases no longer provide.
    u_X, u_y = nr.fit_resample(u_X, u_y)
    print("undersample",np.bincount(u_y))
    return u_X,u_y

In [None]:
def sampling_balan(b_X,b_y):
    """Resample with SMOTEENN (seed 42).

    Prints the class counts after resampling, labelled
    ``balanced_sampling``, and returns the resampled ``(features, labels)``
    pair.
    """
    resampled_X, resampled_y = SMOTEENN(random_state=42).fit_resample(b_X, b_y)
    class_counts = np.bincount(resampled_y)
    print("balanced_sampling", class_counts)
    return resampled_X, resampled_y

In [None]:
def run(data_path='\\Users\\DELL\\Desktop\\dataset_diabetes\\diabetic_data8_yes.csv', sampler=None):
    """Load the dataset, split it, rebalance the training part and score every model.

    Parameters
    ----------
    data_path : str, optional
        CSV file to load. Defaults to the path this notebook originally
        hard-coded, so ``run()`` still works unchanged.
    sampler : callable, optional
        Function ``(X, y) -> (X, y)`` used to rebalance the training split,
        e.g. ``sampling_over``, ``sampling_under`` or ``sampling_balan``.
        Defaults to ``sampling_balan``.

    Notes
    -----
    Resampling is applied after the train/test split and only to the
    training part. Resampling the full dataset first would put synthetic or
    cleaned rows into the hold-out set and inflate the reported metrics.
    The hold-out set therefore keeps the original class distribution.
    """
    if sampler is None:
        sampler = sampling_balan

    data = pd.read_csv(data_path)
    y = data["readmitted"]
    X = data.drop(["readmitted","Unnamed: 0","Unnamed: 0.1","encounter_id","patient_nbr"],axis=1)
    print(X)
    print("original",np.bincount(y))
    #X=PCA_A(X)

    # Split first so the hold-out rows are never seen by the sampler.
    o_X_train, o_X_test, o_y_train, o_y_test = train_test_split(X, y, random_state = 1, stratify=y,test_size=0.3)
    o_X_train, o_y_train = sampler(o_X_train, o_y_train)
    # NOTE(review): the model helpers cross-validate on this already
    # resampled training set, so their fold scores are still optimistic.
    # Putting the sampler in an imblearn Pipeline would fix that too.

    print("test_size",o_X_test.shape)
    print("train_size",o_X_train.shape)
    print("Decision tree")
    decision_tree(o_X_train, o_X_test, o_y_train, o_y_test)
    print("Gaussian")
    Gaussian_NB(o_X_train, o_X_test, o_y_train, o_y_test)
    print("Dummy_Classifier")
    Dummy_Classifier(o_X_train, o_X_test, o_y_train, o_y_test)
    print("K-Nearest Neighbor")
    KNeighbors_Classifier(o_X_train, o_X_test, o_y_train, o_y_test)
    print("random_forest")
    random_forest(o_X_train, o_X_test, o_y_train, o_y_test)
    
    

In [None]:
def PCA_A(X, n_components=30):
    """Project ``X`` onto its principal components.

    Parameters
    ----------
    X : feature matrix accepted by ``PCA.fit_transform``.
    n_components : optional
        Passed straight to ``PCA``. Defaults to 30, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One column per component, named ``p1`` .. ``pN``. The previous
        hand-written list repeated names such as ``p1`` several times, so
        selecting a column by name returned more than one column.
    """
    components = PCA(n_components=n_components).fit_transform(X)
    # Name columns from the actual output width so any n_components works.
    column_names = ['p%d' % i for i in range(1, components.shape[1] + 1)]
    # Keep the caller's row labels when X is a DataFrame so rows still line
    # up with the label Series.
    row_index = X.index if isinstance(X, pd.DataFrame) else None
    X = pd.DataFrame(data=components, columns=column_names, index=row_index)
    return X

In [None]:
# Execute the whole pipeline. run() reads the dataset CSV from disk, so the
# file must be reachable from this machine.
run()

In [None]:
# Critical-difference diagram comparing the five classifiers by average rank.
# NOTE(review): compute_CD / graph_ranks are reportedly absent from recent
# Orange3 releases; verify against the installed version.
import Orange as orn
import matplotlib.pyplot as plt
names = ["Decision_Tree","Gaussian_Naive_Bayes","Dummy_Classifier","K-Nearest_Neighbor","Random_Forest" ]
# Hard-coded ranks, one per entry of `names` in the same order.
# Presumably derived by hand from the accuracy results; TODO confirm.
avranks =  [3,4,5,1,2]
cd = orn.evaluation.compute_CD(avranks, 10) # second argument is 10; the earlier note here said "30 datasets". TODO confirm which count is intended
print(cd)
orn.evaluation.graph_ranks(avranks, names, cd=cd, width=10, textspace=1.2)
plt.show()

In [None]:
from scipy import stats
# Ten hard-coded scores per classifier, compared pairwise in the cells below.
# Presumably accuracies recorded from earlier 10-fold runs; provenance is not
# shown in this notebook. TODO confirm.
# NOTE(review): the fourth DT value (0.7) stands out from its neighbours;
# check it is not a typo.
KNN = [0.90, 0.90, 0.90, 0.90, 0.90, 0.88, 0.86, 0.87, 0.86, 0.87]
DT = [0.80, 0.79, 0.799, 0.7, 0.79, 0.78, 0.77, 0.78, 0.77, 0.76]
NB = [0.655, 0.655, 0.655, 0.655, 0.65, 0.64, 0.65, 0.64, 0.65, 0.64]
DC = [0.54, 0.55, 0.55, 0.54, 0.53, 0.52, 0.51, 0.50, 0.54, 0.55]
RF = [0.82, 0.828, 0.83, 0.83, 0.82, 0.82, 0.83, 0.82, 0.83, 0.82]

In [None]:
# Paired-samples t-test between the DT and NB score lists.
DT_NB = stats.ttest_rel(a=DT, b=NB)
DT_NB

In [None]:
# Paired-samples t-test between the DT and DC score lists.
DT_DC = stats.ttest_rel(a=DT, b=DC)
DT_DC

In [None]:
# Paired-samples t-test between the DT and KNN score lists.
# The arguments were previously passed as (KNN, DT), the reverse of the
# variable name and of every other comparison in this notebook. That flipped
# the sign of the t statistic; the p-value is unaffected.
DT_KNN = stats.ttest_rel(DT, KNN)
DT_KNN

In [None]:
# Paired-samples t-test between the DT and RF score lists.
DT_RF = stats.ttest_rel(a=DT, b=RF)
DT_RF


In [None]:
# Paired-samples t-test between the NB and DC score lists.
NB_DC = stats.ttest_rel(a=NB, b=DC)
NB_DC

In [None]:
# Paired-samples t-test between the NB and KNN score lists.
NB_KNN = stats.ttest_rel(a=NB, b=KNN)
NB_KNN

In [None]:
# Paired-samples t-test between the NB and RF score lists.
NB_RF = stats.ttest_rel(a=NB, b=RF)
NB_RF

In [None]:
# Paired-samples t-test between the DC and KNN score lists.
DC_KNN = stats.ttest_rel(a=DC, b=KNN)
DC_KNN

In [None]:
# Paired-samples t-test between the DC and RF score lists.
DC_RF = stats.ttest_rel(a=DC, b=RF)
DC_RF

In [None]:
# Paired-samples t-test between the KNN and RF score lists.
KNN_RF = stats.ttest_rel(a=KNN, b=RF)
KNN_RF