## LOADING AND PREPROCESSING

In [1]:
import numpy as np
import pandas as pd
data = pd.read_csv("/Users/saitejatangudu/Desktop/DATASETS/diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score

In [3]:
Impute_data = data.drop(['Outcome','Pregnancies'],axis = 1)
columns = Impute_data.columns
columns

Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [4]:
for i in Impute_data.columns:
    Impute_data[i] = np.where(Impute_data[i]==0,np.nan,Impute_data[i])

In [5]:
for i in data.columns:
    if i in columns:
        data[i]=Impute_data[i]
columns = data.columns
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50.0,1
1,1,85.0,66.0,29.0,,26.6,0.351,31.0,0
2,8,183.0,64.0,,,23.3,0.672,32.0,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [6]:
from sklearn.impute import KNNImputer
Imputer = KNNImputer(n_neighbors=5)
data= pd.DataFrame(Imputer.fit_transform(data))
data.columns=columns
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,169.0,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,58.6,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,25.8,164.6,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


In [7]:
# import module
from sklearn.preprocessing import MinMaxScaler
# scale features
scaler = MinMaxScaler()
model=scaler.fit(data)
data=model.transform(data)

# print scaled features
data = pd.DataFrame(data,columns = columns)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.670968,0.489796,0.304348,0.186298,0.314928,0.234415,0.483333,1.0
1,0.058824,0.264516,0.428571,0.23913,0.053606,0.171779,0.116567,0.166667,0.0
2,0.470588,0.896774,0.408163,0.204348,0.18101,0.104294,0.253629,0.183333,1.0
3,0.058824,0.290323,0.428571,0.173913,0.096154,0.202454,0.038002,0.0,0.0
4,0.0,0.6,0.163265,0.304348,0.185096,0.509202,0.943638,0.2,1.0


## DATA RESAMPLING

### GENERATING TEST DATA

In [8]:
X = data.drop(['Outcome'],axis=1)
y = data.Outcome

In [9]:
from sklearn.utils import Bunch
from src.measures import tlcm, degIR, degOver, imbalance_ratio, n_1_imb_mean, n_3_imb_mean

dataset = Bunch(data=np.array(X),target=np.array(y))
TLCM = tlcm(dataset)
print("TLCM",TLCM)

TLCM 0.48134328358208955


In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)

In [11]:
print(sum(y_train==1)
,sum(y_train==0)
,sum(y_test==1)
,sum(y_test==0)
     )

228 424 40 76


In [12]:
(sum(y_train==1)+sum(y_test==1))/(sum(y_train==0)+sum(y_test==0))

0.536

In [13]:
train_data = x_train
train_label = y_train
test_data = x_test
test_label=y_test

In [14]:
def cost_weights(indices,y_train,k):
    from sklearn.preprocessing import MinMaxScaler
    class_0 = []
    class_1 = []
    data = pd.DataFrame(indices)
    data = data.set_index(data.columns[0])
    for i in data.index:
        dict1 = {}
        for j in data.iloc[i]:
            dict1[y_train.iloc[j]] = dict1.get(y_train.iloc[j],0)+1
        class_0.append(dict1.get(0,0))
        class_1.append(dict1.get(1,0))
    data["class_0"] = class_0
    data["class_1"] = class_1
    data["Class"] = y_train
    data["cost"] = np.where(data["Class"]==0,((data["class_1"]+1)/k)*sum(data["Class"]==1),((data["class_0"]+1)/k)*sum(data["Class"]==0))
    return data["cost"]

def AdaWeight(x_train,y_train,k):
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=k).fit(x_train)
    dist, indices = nbrs.kneighbors(x_train)
    return cost_weights(indices,y_train,k)

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=RandomForestClassifier(random_state=42)
#lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
#pred_label=lr_imb_model.predict(test_data)
from sklearn.model_selection import cross_validate
scoring = ['accuracy','precision','recall']
scores = cross_validate(lr_imb,X,y,cv=10,scoring=scoring)
print(scores['test_accuracy'].mean(),
scores['test_precision'].mean(),
scores['test_recall'].mean())

0.7591079972658921 0.6836148995931605 0.5856125356125356


In [None]:
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

In [22]:
lr_imb=RandomForestClassifier(random_state=42,class_weight={0:sum(np.array(y)==1),1:sum(np.array(y)==0)})
#lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
#pred_label=lr_imb_model.predict(test_data)
from sklearn.model_selection import cross_validate
scoring = ['accuracy','precision','recall']
scores = cross_validate(lr_imb,X,y,cv=10,scoring=scoring)
print(scores['test_accuracy'].mean(),
scores['test_precision'].mean(),
scores['test_recall'].mean())

0.759107997265892 0.6881029961675124 0.593019943019943


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=RandomForestClassifier(random_state=42,class_weight={0:sum(np.array(train_label)==1),1:sum(np.array(train_label)==0)})
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.7327586206896551
Precision score 0.6265060240963854
Recall score 0.65
F1 score 0.6046511627906976
Kappa score 0.4188752424046541
Gmean weighted score 0.6990641356965319
Roc auc score 0.7131578947368421
Precision -Recall score 0.5395749799518845


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=RandomForestClassifier(random_state=42,class_weight="balanced")
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.7241379310344828
Precision score 0.627906976744186
Recall score 0.675
F1 score 0.5869565217391305
Kappa score 0.41041931385006347
Gmean weighted score 0.6913395045554726
Roc auc score 0.7125
Precision -Recall score 0.559988755622189


In [20]:
from sklearn.utils import Bunch
from src.measures import tlcm, degIR, degOver, imbalance_ratio, n_1_imb_mean, n_3_imb_mean

dataset = Bunch(data=np.array(X),target=np.array(y))
TLCM = tlcm(dataset)
IR   = imbalance_ratio(dataset)
N1   = n_1_imb_mean(dataset)
N3   = n_3_imb_mean(dataset)

In [21]:
def cal_class_weight(measure,value):
    C_W = {}
    if measure=="IR":
        C_W[0] = round((1/value)*100,2)
        C_W[1] = 100
    else:
        C_W[0] = round(value*100,2)
        C_W[1] = 100
    return C_W

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=RandomForestClassifier(random_state=42,class_weight=cal_class_weight('IR',imbalance_ratio(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.7327586206896551
Precision score 0.6352941176470589
Recall score 0.675
F1 score 0.6
Kappa score 0.4255591054313099
Gmean weighted score 0.7001005963934201
Roc auc score 0.719078947368421
Precision -Recall score 0.5601724137931035


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=RandomForestClassifier(random_state=42,class_weight=cal_class_weight('TLCM',tlcm(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.7155172413793104
Precision score 0.6117647058823529
Recall score 0.65
F1 score 0.5777777777777777
Kappa score 0.3884984025559105
Gmean weighted score 0.6810651696117016
Roc auc score 0.7
Precision -Recall score 0.5393486590038314


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=RandomForestClassifier(random_state=42,class_weight=cal_class_weight('N1',n_1_imb_mean(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.7068965517241379
Precision score 0.5952380952380952
Recall score 0.625
F1 score 0.5681818181818182
Kappa score 0.36632390745501286
Gmean weighted score 0.6706792124858247
Roc auc score 0.6875
Precision -Recall score 0.5189067398119123


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=RandomForestClassifier(random_state=42,class_weight=cal_class_weight('N3',n_3_imb_mean(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.7068965517241379
Precision score 0.5952380952380952
Recall score 0.625
F1 score 0.5681818181818182
Kappa score 0.36632390745501286
Gmean weighted score 0.6706792124858247
Roc auc score 0.6875
Precision -Recall score 0.5189067398119123


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=RandomForestClassifier(random_state=42)
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel(),AdaWeight(train_data,train_label,k=5))
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.7586206896551724
Precision score 0.65
Recall score 0.65
F1 score 0.65
Kappa score 0.46578947368421053
Gmean weighted score 0.728191704082611
Roc auc score 0.7328947368421052
Precision -Recall score 0.5431896551724138
