In [555]:
import pandas as pd # for data analytics
import numpy as np # for numerical computation
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, classification_report,confusion_matrix

# CM1 Software Prediction

## Importação e tratamento de dados

In [556]:
data = pd.read_csv("https://raw.githubusercontent.com/renatojmf/Machine-Learning/main/data.csv",encoding = "ISO-8859-1")
data["defects"] = data["defects"].astype(int)
data.head()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,b,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,2,2,2,2,1.2,1.2,1.2,1.2,1.4,0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1
2,24.0,5.0,1.0,3.0,63.0,309.13,0.11,9.5,32.54,2936.77,0.1,163.15,1,0,6,0,15.0,15.0,44.0,19.0,9.0,0
3,20.0,4.0,4.0,2.0,47.0,215.49,0.06,16.0,13.47,3447.89,0.07,191.55,0,0,3,0,16.0,8.0,31.0,16.0,7.0,0
4,24.0,6.0,6.0,2.0,72.0,346.13,0.06,17.33,19.97,5999.58,0.12,333.31,0,0,3,0,16.0,12.0,46.0,26.0,11.0,0


In [557]:
data.loc[data['defects'] == 1, "Class"] = -1
data.loc[data['defects'] == 0, "Class"] = 1

## Avaliação com treinamento da classe sem defeitos

In [558]:
non_fraud = data[data['Class']==1]
df_train, _ = train_test_split(non_fraud, test_size=0.2, random_state=42)
fraud = data[data['Class']==-1]

In [559]:
# training model
model = OneClassSVM(kernel = 'rbf', gamma = 0.001, nu = 0.03).fit(df_train)

In [560]:
y_val = non_fraud['Class']
y_fraud = fraud['Class']
y_testval = pd.concat([y_val, y_fraud])
y_testval = np.array(y_testval)

In [561]:
df_testval = pd.concat([non_fraud,fraud])
y_pred = model.predict(df_testval)
y_pred

array([ 1,  1, -1,  1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,
       -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1, -1,
       -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1,  1,  1,
        1,  1,  1, -1, -1, -1, -1,  1,  1, -1, -1,  1,  1,  1,  1, -1,  1,
        1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1,  1,
       -1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,
        1,  1,  1, -1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1, -1, -1,  1,
       -1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1, -1,  1,  1,
        1, -1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,
       -1, -1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1,
        1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1,
       -1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,
        1, -1, -1,  1,  1

In [562]:
tn, fp, fn, tp = confusion_matrix(y_testval, y_pred).ravel()
print(f'TN: {tn}, FP: {fp} , FN:{fn}, TP: {tp}')

TN: 48, FP: 1 , FN:191, TP: 258


In [563]:
print(classification_report(y_testval, y_pred,zero_division=1))

              precision    recall  f1-score   support

        -1.0       0.20      0.98      0.33        49
         1.0       1.00      0.57      0.73       449

    accuracy                           0.61       498
   macro avg       0.60      0.78      0.53       498
weighted avg       0.92      0.61      0.69       498



In [564]:
prec, rec, f2, _ = precision_recall_fscore_support(y_testval, y_pred, beta=2, 
                                                   pos_label=1, average='binary',zero_division=1)
print(f'precision is {prec}, recall is {rec} and F2 score is {f2}')

precision is 0.9961389961389961, recall is 0.5746102449888641 and F2 score is 0.6277372262773724


## Avaliação com 30% dos dados de treinamento da classe com defeitos

In [566]:
non_fraud = data[data['Class']==1]
fraud = data[data['Class']==-1]
df_train, _ = train_test_split(fraud, test_size=0.7, random_state=42)#treinamento com 30% do conjunto de defeitos

In [567]:
# training model
model = OneClassSVM(kernel = 'rbf', gamma = 0.001, nu = 0.03).fit(df_train)

In [568]:
y_val = non_fraud['Class']
y_fraud = fraud['Class']
y_testval = pd.concat([y_val, y_fraud])
y_testval = np.array(y_testval)

In [569]:
df_testval = pd.concat([non_fraud,fraud])
y_pred = model.predict(df_testval)
y_pred

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [570]:
tn, fp, fn, tp = confusion_matrix(y_testval, y_pred).ravel()
print(f'TN: {tn}, FP: {fp} , FN:{fn}, TP: {tp}')

TN: 41, FP: 8 , FN:449, TP: 0


In [571]:
print(classification_report(y_testval, y_pred,zero_division=1))

              precision    recall  f1-score   support

        -1.0       0.08      0.84      0.15        49
         1.0       0.00      0.00      0.00       449

    accuracy                           0.08       498
   macro avg       0.04      0.42      0.08       498
weighted avg       0.01      0.08      0.01       498



In [572]:
prec, rec, f2, _ = precision_recall_fscore_support(y_testval, y_pred, beta=2, 
                                                   pos_label=1, average='binary',zero_division=1)
print(f'precision is {prec}, recall is {rec} and F2 score is {f2}')

precision is 0.0, recall is 0.0 and F2 score is 0.0


## Avaliação com 40% dos dados de treinamento da classe com defeitos

In [573]:
non_fraud = data[data['Class']==1]
fraud = data[data['Class']==-1]
df_train, _ = train_test_split(fraud, test_size=0.6, random_state=42)#treinamento com 40% do conjunto de defeitos

In [574]:
# training model
model = OneClassSVM(kernel = 'rbf', gamma = 0.001, nu = 0.03).fit(df_train)

In [575]:
y_val = non_fraud['Class']
y_fraud = fraud['Class']
y_testval = pd.concat([y_val, y_fraud])
y_testval = np.array(y_testval)

In [576]:
df_testval = pd.concat([non_fraud,fraud])
y_pred = model.predict(df_testval)
y_pred

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [577]:
tn, fp, fn, tp = confusion_matrix(y_testval, y_pred).ravel()
print(f'TN: {tn}, FP: {fp} , FN:{fn}, TP: {tp}')

TN: 32, FP: 17 , FN:449, TP: 0


In [578]:
print(classification_report(y_testval, y_pred,zero_division=1))

              precision    recall  f1-score   support

        -1.0       0.07      0.65      0.12        49
         1.0       0.00      0.00      0.00       449

    accuracy                           0.06       498
   macro avg       0.03      0.33      0.06       498
weighted avg       0.01      0.06      0.01       498



In [579]:
prec, rec, f2, _ = precision_recall_fscore_support(y_testval, y_pred, beta=2, 
                                                   pos_label=1, average='binary',zero_division=1)
print(f'precision is {prec}, recall is {rec} and F2 score is {f2}')

precision is 0.0, recall is 0.0 and F2 score is 0.0


## Avaliação com 50% dos dados de treinamento da classe com defeitos

In [580]:
non_fraud = data[data['Class']==1]
fraud = data[data['Class']==-1]
df_train, _ = train_test_split(fraud, test_size=0.6, random_state=42)#treinamento com 50% do conjunto de defeitos

In [581]:
# training model
model = OneClassSVM(kernel = 'rbf', gamma = 0.001, nu = 0.03).fit(df_train)

In [582]:
y_val = non_fraud['Class']
y_fraud = fraud['Class']
y_testval = pd.concat([y_val, y_fraud])
y_testval = np.array(y_testval)

In [583]:
df_testval = pd.concat([non_fraud,fraud])
y_pred = model.predict(df_testval)
y_pred

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [584]:
tn, fp, fn, tp = confusion_matrix(y_testval, y_pred).ravel()
print(f'TN: {tn}, FP: {fp} , FN:{fn}, TP: {tp}')

TN: 32, FP: 17 , FN:449, TP: 0


In [585]:
print(classification_report(y_testval, y_pred,zero_division=1))

              precision    recall  f1-score   support

        -1.0       0.07      0.65      0.12        49
         1.0       0.00      0.00      0.00       449

    accuracy                           0.06       498
   macro avg       0.03      0.33      0.06       498
weighted avg       0.01      0.06      0.01       498



In [586]:
prec, rec, f2, _ = precision_recall_fscore_support(y_testval, y_pred, beta=2, 
                                                   pos_label=1, average='binary',zero_division=1)
print(f'precision is {prec}, recall is {rec} and F2 score is {f2}')

precision is 0.0, recall is 0.0 and F2 score is 0.0


# KC1 Software Prediction

## Importação e tratamento de dados

In [587]:
Kc1data = pd.read_csv("https://raw.githubusercontent.com/renatojmf/Machine-Learning/main/kc1Data.csv",encoding = "ISO-8859-1")
Kc1data["defects"] = Kc1data["defects"].astype(int)
Kc1data.head()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,b,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,2,2,2,2,1.2,1.2,1.2,1.2,1.4,0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1,1,1,1.0,1.0,1.0,1.0,1.0,1
2,83.0,11.0,1.0,11.0,171.0,927.89,0.04,23.04,40.27,21378.61,0.31,1187.7,65,10,6,0,18.0,25.0,107.0,64.0,21.0,1
3,46.0,8.0,6.0,8.0,141.0,769.78,0.07,14.86,51.81,11436.73,0.26,635.37,37,2,5,0,16.0,28.0,89.0,52.0,15.0,1
4,25.0,3.0,1.0,3.0,58.0,254.75,0.11,9.35,27.25,2381.95,0.08,132.33,21,0,2,0,11.0,10.0,41.0,17.0,5.0,1


In [588]:
Kc1data.loc[Kc1data['defects'] == 1, "Class"] = -1
Kc1data.loc[Kc1data['defects'] == 0, "Class"] = 1

## Avaliação com treinamento da classe sem defeitos

In [589]:
non_fraud = Kc1data[Kc1data['Class']==1]
df_train, _ = train_test_split(non_fraud, test_size=0.2, random_state=42)
fraud = Kc1data[Kc1data['Class']==-1]

In [590]:
# training model
model = OneClassSVM(kernel = 'rbf', gamma = 0.001, nu = 0.03).fit(df_train)

In [591]:
y_val = non_fraud['Class']
y_fraud = fraud['Class']
y_testval = pd.concat([y_val, y_fraud])
y_testval = np.array(y_testval)

In [592]:
df_testval = pd.concat([non_fraud,fraud])
y_pred = model.predict(df_testval)
y_pred

array([ 1,  1,  1, ..., -1, -1, -1])

In [593]:
tn, fp, fn, tp = confusion_matrix(y_testval, y_pred).ravel()
print(f'TN: {tn}, FP: {fp} , FN:{fn}, TP: {tp}')

TN: 269, FP: 57 , FN:376, TP: 1407


In [594]:
print(classification_report(y_testval, y_pred,zero_division=1))

              precision    recall  f1-score   support

        -1.0       0.42      0.83      0.55       326
         1.0       0.96      0.79      0.87      1783

    accuracy                           0.79      2109
   macro avg       0.69      0.81      0.71      2109
weighted avg       0.88      0.79      0.82      2109



In [595]:
prec, rec, f2, _ = precision_recall_fscore_support(y_testval, y_pred, beta=2, 
                                                   pos_label=1, average='binary',zero_division=1)
print(f'precision is {prec}, recall is {rec} and F2 score is {f2}')

precision is 0.9610655737704918, recall is 0.7891194615816041 and F2 score is 0.8184039087947883


## Avaliação com 30% dos dados de treinamento da classe com defeitos

In [596]:
non_fraud = Kc1data[Kc1data['Class']==1]
fraud = Kc1data[Kc1data['Class']==-1]
df_train, _ = train_test_split(fraud, test_size=0.7, random_state=42)#treinamento com 30% do conjunto de defeitos

In [597]:
# training model
model = OneClassSVM(kernel = 'rbf', gamma = 0.001, nu = 0.03).fit(df_train)

In [598]:
y_val = non_fraud['Class']
y_fraud = fraud['Class']
y_testval = pd.concat([y_val, y_fraud])
y_testval = np.array(y_testval)

In [599]:
df_testval = pd.concat([non_fraud,fraud])
y_pred = model.predict(df_testval)
y_pred

array([-1, -1, -1, ..., -1, -1,  1])

In [600]:
tn, fp, fn, tp = confusion_matrix(y_testval, y_pred).ravel()
print(f'TN: {tn}, FP: {fp} , FN:{fn}, TP: {tp}')

TN: 257, FP: 69 , FN:1234, TP: 549


In [601]:
print(classification_report(y_testval, y_pred,zero_division=1))

              precision    recall  f1-score   support

        -1.0       0.17      0.79      0.28       326
         1.0       0.89      0.31      0.46      1783

    accuracy                           0.38      2109
   macro avg       0.53      0.55      0.37      2109
weighted avg       0.78      0.38      0.43      2109



In [602]:
prec, rec, f2, _ = precision_recall_fscore_support(y_testval, y_pred, beta=2, 
                                                   pos_label=1, average='binary',zero_division=1)
print(f'precision is {prec}, recall is {rec} and F2 score is {f2}')

precision is 0.8883495145631068, recall is 0.30790802019068986 and F2 score is 0.35419354838709677


## Avaliação com 40% dos dados de treinamento da classe com defeitos

In [603]:
non_fraud = Kc1data[Kc1data['Class']==1]
fraud = Kc1data[Kc1data['Class']==-1]
df_train, _ = train_test_split(fraud, test_size=0.6, random_state=42)#treinamento com 30% do conjunto de defeitos

In [604]:
# training model
model = OneClassSVM(kernel = 'rbf', gamma = 0.001, nu = 0.03).fit(df_train)

In [605]:
y_val = non_fraud['Class']
y_fraud = fraud['Class']
y_testval = pd.concat([y_val, y_fraud])
y_testval = np.array(y_testval)

In [606]:
df_testval = pd.concat([non_fraud,fraud])
y_pred = model.predict(df_testval)
y_pred

array([-1, -1,  1, ..., -1, -1, -1])

In [607]:
tn, fp, fn, tp = confusion_matrix(y_testval, y_pred).ravel()
print(f'TN: {tn}, FP: {fp} , FN:{fn}, TP: {tp}')

TN: 235, FP: 91 , FN:1111, TP: 672


In [608]:
print(classification_report(y_testval, y_pred,zero_division=1))

              precision    recall  f1-score   support

        -1.0       0.17      0.72      0.28       326
         1.0       0.88      0.38      0.53      1783

    accuracy                           0.43      2109
   macro avg       0.53      0.55      0.40      2109
weighted avg       0.77      0.43      0.49      2109



In [609]:
prec, rec, f2, _ = precision_recall_fscore_support(y_testval, y_pred, beta=2, 
                                                   pos_label=1, average='binary',zero_division=1)
print(f'precision is {prec}, recall is {rec} and F2 score is {f2}')

precision is 0.8807339449541285, recall is 0.37689287717330344 and F2 score is 0.42558581380620647


## Avaliação com 50% dos dados de treinamento da classe com defeitos

In [610]:
non_fraud = Kc1data[Kc1data['Class']==1]
fraud = Kc1data[Kc1data['Class']==-1]
df_train, _ = train_test_split(fraud, test_size=0.5, random_state=42)#treinamento com 30% do conjunto de defeitos

In [611]:
# training model
model = OneClassSVM(kernel = 'rbf', gamma = 0.001, nu = 0.03).fit(df_train)

In [612]:
y_val = non_fraud['Class']
y_fraud = fraud['Class']
y_testval = pd.concat([y_val, y_fraud])
y_testval = np.array(y_testval)

In [613]:
df_testval = pd.concat([non_fraud,fraud])
y_pred = model.predict(df_testval)
y_pred

array([-1, -1, -1, ..., -1, -1, -1])

In [614]:
tn, fp, fn, tp = confusion_matrix(y_testval, y_pred).ravel()
print(f'TN: {tn}, FP: {fp} , FN:{fn}, TP: {tp}')

TN: 242, FP: 84 , FN:1206, TP: 577


In [615]:
print(classification_report(y_testval, y_pred,zero_division=1))

              precision    recall  f1-score   support

        -1.0       0.17      0.74      0.27       326
         1.0       0.87      0.32      0.47      1783

    accuracy                           0.39      2109
   macro avg       0.52      0.53      0.37      2109
weighted avg       0.76      0.39      0.44      2109



In [616]:
prec, rec, f2, _ = precision_recall_fscore_support(y_testval, y_pred, beta=2, 
                                                   pos_label=1, average='binary',zero_division=1)
print(f'precision is {prec}, recall is {rec} and F2 score is {f2}')

precision is 0.8729198184568835, recall is 0.3236118900729108 and F2 score is 0.37020402925702556
