In [1]:
import numpy as np
import pandas as pd

In [2]:
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score

In [3]:
names = ["COMPACTNESS",
"CIRCULARITY",
"DISTANCE CIRCULARITY",
"RADIUS RATIO",
"PR.AXIS ASPECT RATIO",
"MAX.LENGTH ASPECT RATIO",
"SCATTER RATIO",
"ELONGATEDNESS",
"PR.AXIS RECTANGULARITY",
"MAX.LENGTH RECTANGULARITY",
"SCALED VARIANCE ALONG MAJOR AXIS",
"SCALED VARIANCE ALONG MINOR AXIS",
"SCALED RADIUS OF GYRATION",
"SKEWNESS ABOUT MAJOR AXIS",
"SKEWNESS ABOUT MINOR AXIS",
"KURTOSIS ABOUT MINOR AXIS",
"KURTOSIS ABOUT MAJOR AXIS",
"HOLLOWS RATIO","class","empty"]

In [4]:
length = 0
data_frames = []
for i in ['a','b','c','d','e','f','g','h','i']:
    string="https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/vehicle/"+"xa"+str(i)+".dat"
    data = pd.read_csv(string,names=["sai"])
    data[names]=data["sai"].apply(lambda x :pd.Series(str(x).split(" ")))
    data_frames.append(data.drop(['sai','empty'],axis=1))

In [5]:
data = pd.concat(data_frames)
data.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE CIRCULARITY,RADIUS RATIO,PR.AXIS ASPECT RATIO,MAX.LENGTH ASPECT RATIO,SCATTER RATIO,ELONGATEDNESS,PR.AXIS RECTANGULARITY,MAX.LENGTH RECTANGULARITY,SCALED VARIANCE ALONG MAJOR AXIS,SCALED VARIANCE ALONG MINOR AXIS,SCALED RADIUS OF GYRATION,SKEWNESS ABOUT MAJOR AXIS,SKEWNESS ABOUT MINOR AXIS,KURTOSIS ABOUT MINOR AXIS,KURTOSIS ABOUT MAJOR AXIS,HOLLOWS RATIO,class
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus


In [6]:
data.isna().sum()

COMPACTNESS                         0
CIRCULARITY                         0
DISTANCE CIRCULARITY                0
RADIUS RATIO                        0
PR.AXIS ASPECT RATIO                0
MAX.LENGTH ASPECT RATIO             0
SCATTER RATIO                       0
ELONGATEDNESS                       0
PR.AXIS RECTANGULARITY              0
MAX.LENGTH RECTANGULARITY           0
SCALED VARIANCE ALONG MAJOR AXIS    0
SCALED VARIANCE ALONG MINOR AXIS    0
SCALED RADIUS OF GYRATION           0
SKEWNESS ABOUT MAJOR AXIS           0
SKEWNESS ABOUT MINOR AXIS           0
KURTOSIS ABOUT MINOR AXIS           0
KURTOSIS ABOUT MAJOR AXIS           0
HOLLOWS RATIO                       0
class                               0
dtype: int64

In [7]:
data['class'] = np.where(data['class']=='van',1,0)

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [9]:
scaler = MinMaxScaler()
model = scaler.fit(data)
impute_data = model.transform(data)
data = pd.DataFrame(impute_data,columns=names[:-1])

In [10]:
data.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE CIRCULARITY,RADIUS RATIO,PR.AXIS ASPECT RATIO,MAX.LENGTH ASPECT RATIO,SCATTER RATIO,ELONGATEDNESS,PR.AXIS RECTANGULARITY,MAX.LENGTH RECTANGULARITY,SCALED VARIANCE ALONG MAJOR AXIS,SCALED VARIANCE ALONG MINOR AXIS,SCALED RADIUS OF GYRATION,SKEWNESS ABOUT MAJOR AXIS,SKEWNESS ABOUT MINOR AXIS,KURTOSIS ABOUT MINOR AXIS,KURTOSIS ABOUT MAJOR AXIS,HOLLOWS RATIO,class
0,0.478261,0.576923,0.597222,0.323144,0.274725,0.150943,0.326797,0.457143,0.25,0.585714,0.242105,0.233813,0.471698,0.144737,0.272727,0.390244,0.366667,0.533333,1.0
1,0.391304,0.307692,0.611111,0.161572,0.10989,0.132075,0.24183,0.542857,0.166667,0.357143,0.210526,0.17506,0.308176,0.171053,0.409091,0.341463,0.433333,0.6,1.0
2,0.673913,0.653846,0.916667,0.458515,0.208791,0.150943,0.620915,0.171429,0.5,0.571429,0.489474,0.540767,0.698113,0.184211,0.636364,0.219512,0.4,0.5,0.0
3,0.434783,0.307692,0.583333,0.240175,0.175824,0.132075,0.20915,0.571429,0.166667,0.357143,0.157895,0.14988,0.113208,0.052632,0.272727,0.243902,0.766667,0.866667,1.0
4,0.26087,0.423077,0.416667,0.441048,0.615385,0.943396,0.24183,0.542857,0.166667,0.371429,0.584211,0.169065,0.496855,0.894737,0.409091,0.268293,0.133333,0.066667,0.0


## DATA RESAMPLING

### GENERATING TEST DATA

In [11]:
X = data.drop(['class'],axis=1)
y = data['class']

In [12]:
from sklearn.utils import Bunch
from src.measures import tlcm, degIR, degOver, imbalance_ratio, n_1_imb_mean, n_3_imb_mean

dataset = Bunch(data=np.array(X),target=np.array(y))
TLCM = tlcm(dataset)
print("TLCM",TLCM)

TLCM 0.10552763819095477


In [12]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=42)

In [13]:
print(sum(y_train==1)
,sum(y_train==0)
,sum(y_test==1)
,sum(y_test==0)
     )

168 551 31 96


In [14]:
(sum(y_train==1)+sum(y_test==1))/(sum(y_train==0)+sum(y_test==0))
     

0.3075734157650695

In [15]:
train_data = x_train
train_label = y_train
test_data = x_test
test_label=y_test

In [16]:
def cost_weights(indices,y_train,k):
    from sklearn.preprocessing import MinMaxScaler
    class_0 = []
    class_1 = []
    data = pd.DataFrame(indices)
    data = data.set_index(data.columns[0])
    for i in data.index:
        dict1 = {}
        for j in data.iloc[i]:
            dict1[y_train.iloc[j]] = dict1.get(y_train.iloc[j],0)+1
        class_0.append(dict1.get(0,0))
        class_1.append(dict1.get(1,0))
    data["class_0"] = class_0
    data["class_1"] = class_1
    data["Class"] = y_train
    data["cost"] = np.where(data["Class"]==0,((data["class_1"]+1)/k)*sum(data["Class"]==1),((data["class_0"]+1)/k)*sum(data["Class"]==0))
    return data["cost"]

def AdaWeight(x_train,y_train,k):
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=k).fit(x_train)
    dist, indices = nbrs.kneighbors(x_train)
    return cost_weights(indices,y_train,k)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42)
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.9448818897637795
Precision score 0.8813559322033899
Recall score 0.8387096774193549
F1 score 0.9285714285714286
Kappa score 0.8455792947715824
Gmean weighted score 0.9389749100342787
Roc auc score 0.9089381720430109
Precision -Recall score 0.7945498748140354


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight={0:sum(np.array(train_label)==1),1:sum(np.array(train_label)==0)})
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.9606299212598425
Precision score 0.923076923076923
Recall score 0.967741935483871
F1 score 0.8823529411764706
Kappa score 0.8966975760533593
Gmean weighted score 0.934272591454076
Roc auc score 0.9630376344086021
Precision -Recall score 0.8853860060661298


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight="balanced")
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.9448818897637795
Precision score 0.898550724637681
Recall score 1.0
F1 score 0.8157894736842105
Kappa score 0.861245512720462
Gmean weighted score 0.9032106474595007
Roc auc score 0.9635416666666667
Precision -Recall score 0.870907583920431


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42)
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel(),AdaWeight(train_data,train_label,k=5))
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.9606299212598425
Precision score 0.9206349206349206
Recall score 0.9354838709677419
F1 score 0.90625
Kappa score 0.8944656805717135
Gmean weighted score 0.9418975807547119
Roc auc score 0.952116935483871
Precision -Recall score 0.8714043053086106


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
from sklearn.utils import Bunch
from src.measures import tlcm, degIR, degOver, imbalance_ratio, n_1_imb_mean, n_3_imb_mean

dataset = Bunch(data=np.array(X),target=np.array(y))
TLCM = tlcm(dataset)
IR   = imbalance_ratio(dataset)
N1   = n_1_imb_mean(dataset)
N3   = n_3_imb_mean(dataset)

In [22]:
def cal_class_weight(measure,value):
    C_W = {}
    if measure=="IR":
        C_W[0] = round((1/value)*100,2)
        C_W[1] = 100
    else:
        C_W[0] = round(value*100,2)
        C_W[1] = 100
    return C_W

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight=cal_class_weight('IR',imbalance_ratio(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.968503937007874
Precision score 0.9393939393939393
Recall score 1.0
F1 score 0.8857142857142857
Kappa score 0.918222794591114
Gmean weighted score 0.9411239481143202
Roc auc score 0.9791666666666667
Precision -Recall score 0.9172103487064117


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight=cal_class_weight('TLCM',tlcm(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.9448818897637795
Precision score 0.898550724637681
Recall score 1.0
F1 score 0.8157894736842105
Kappa score 0.861245512720462
Gmean weighted score 0.9032106474595007
Roc auc score 0.9635416666666667
Precision -Recall score 0.870907583920431


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight=cal_class_weight('N1',n_1_imb_mean(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.952755905511811
Precision score 0.911764705882353
Recall score 1.0
F1 score 0.8378378378378378
Kappa score 0.879848628192999
Gmean weighted score 0.9153348228041135
Roc auc score 0.96875
Precision -Recall score 0.8850819323260268


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,cohen_kappa_score,average_precision_score
lr_imb=LogisticRegression(random_state=42,class_weight=cal_class_weight('N3',n_3_imb_mean(dataset)))
lr_imb_model=lr_imb.fit(train_data,np.array(train_label).ravel())
pred_label=lr_imb_model.predict(test_data)
print(f"Accuracy score %s"%accuracy_score(pred_label,test_label))
print(f"Precision score %s"%f1_score(pred_label,test_label))
print(f"Recall score %s"%precision_score(pred_label,test_label))
print(f"F1 score %s"%recall_score(pred_label,test_label))
print(f"Kappa score %s"%cohen_kappa_score(pred_label,test_label))
print(f"Gmean weighted score %s"%geometric_mean_score(pred_label, test_label))
print(f"Roc auc score %s"%roc_auc_score(test_label,pred_label))
print(f"Precision -Recall score %s"%average_precision_score(pred_label,test_label))

Accuracy score 0.937007874015748
Precision score 0.8857142857142858
Recall score 1.0
F1 score 0.7948717948717948
Kappa score 0.8430160692212608
Gmean weighted score 0.8915558282417286
Roc auc score 0.9583333333333333
Precision -Recall score 0.8578639208560468
