# Naïve Bayes

In [14]:
import matplotlib.pyplot as  plt
import pandas as pd
import numpy as np
import sklearn.model_selection as cv    # Pel Cross-validation
import sklearn.neighbors as nb           # Per fer servir el knn

data=pd.read_csv('fake_job_preprocessed.csv')
data.head()

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,6.333616e-09,0.041026,0.0,0.02222222,5.1e-05,0.056281,0.056281,0,1,0,0.166667,0.068627,0.102041,0.004431315,0.017964,0
1,0.3003079,0.380666,0.06414,0.05628121,0.144389,0.056281,0.015136,0,0,0,0.041833,0.039746,0.012579,0.03703704,0.032536,0
2,0.05628121,0.0,0.056281,0.01513635,0.015136,0.056281,0.056281,0,1,1,0.05026,0.030513,0.027523,0.0,0.019231,0
3,0.01513635,0.002669,0.06414,1.160042e-10,0.144389,0.056281,0.056281,0,0,0,0.05026,0.030513,0.01743,0.01732102,0.01511,0
4,0.05628121,3e-06,0.034301,0.0001391622,0.015136,0.056281,0.056281,0,1,1,0.05026,0.145455,0.01743,7.816336e-13,0.017516,0


#### Podemos ver que el dataset está desbalanceado, ya que las clases "fraudulento" y "no fraudulento" no están representadas de forma equilibrada

In [None]:
fraudulent = (len(data[data.fraudulent == 1])/len(data.fraudulent))*100
noFraudulent = (len(data[data.fraudulent == 0])/len(data.fraudulent))*100
print('Representación de la clase "fraudulento":', str(round(fraudulent, 2))+'%')
print('Representación de la clase "no fraudulento":', str(round(noFraudulent, 2))+'%')

Representación de la clase "fraudulento": 5.63%
Representación de la clase "no fraudulento": 94.37%


#### Con K-Fold

In [None]:
X = data.values[:, 0:data.shape[1]-1]
y = data.values[:, data.shape[1]-1]

In [None]:
# Train on data. No parameters to adjust

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, train_test_split
import sklearn.neighbors as nb

cv = StratifiedKFold(n_splits=10) 

gnb = GaussianNB()
cv_scores = cross_val_score(gnb,X=X,y=y,cv=cv)

In [None]:
from sklearn.model_selection import cross_val_predict  
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

predicted = cross_val_predict(GaussianNB(), X=X, y=y,  cv=cv)  

print(confusion_matrix(y, predicted))
print(accuracy_score(y, predicted))

[[4754  159]
 [  30  263]]
0.9636957356895889


In [None]:
from sklearn.metrics import classification_report

print(classification_report(y, predicted))

              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98      4913
         1.0       0.62      0.90      0.74       293

    accuracy                           0.96      5206
   macro avg       0.81      0.93      0.86      5206
weighted avg       0.97      0.96      0.97      5206



Obtenemos un f1-score de 0.861. Vamos a intentar mejorarla seleccionando un buen threshold para valores de probabilidad. 

## Threshold

In [None]:
(X_train, X_test,  y_train, y_test) = train_test_split(X, y, test_size=.3, random_state=1)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

def filterp(th,ProbClass1):
    """ Given a treshold "th" and a set of probabilies of belonging to class 1 "ProbClass1", return predictions """ 
    y=np.zeros(ProbClass1.shape[0])
    for i,v in enumerate(ProbClass1):
        if (ProbClass1[i]>th).all():
            y[i]=1
    return y  

clf = GaussianNB()
lth=[]

# We do a 10 fold crossvalidation with 10 iterations
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X, y):
    X_train2, X_test2 = X[train_index], X[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]

    # Train with the training data of the iteration 
    clf.fit(X_train2, y_train2)
    # Obtaining porbablity predictions for test data of the iterarion
    probs = clf.predict_proba(X_test2)
    # Collect probabilities of belonging to class 1
    ProbClass1 = probs[:,1]
    # Sort probabilities and generate pairs (threshold, f1-for-that-threshold) 
    res = np.array([[th,f1_score(y_test2,filterp(th,ProbClass1),pos_label=1)] for th in np.sort(ProbClass1)])

    # Uncomment the following lines if you want to plot at each iteration how f1-score evolves increasing the threshold 
    #plt.plot(res[:,0],res[:,1])
    #plt.show()

    # Find the threshold that has maximum value of f1-score
    maxF = np.max(res[:,1])
    optimal_th = res[res[:,1]==maxF,0]
    
    # Store the optimal threshold found for the current iteration
    lth.append(optimal_th)

# Compute the average threshold for all 10 iterations    
thdef = np.mean(lth)
print("Selected threshold in 10-fold cross validation:", thdef)
print()

# Train a classifier with the whole training data 
clf.fit(X_train, y_train)
# Obtain probabilities for data on test set
probs = clf.predict_proba(X_test)
# Generate predictions using probabilities and threshold found on 10 folds cross-validation
pred = filterp(thdef,probs[:,1])
# Print results with this prediction vector
print(classification_report(y_test, pred))

# Ignore warnings explaining that in some iterations f1 score is 0

Selected threshold in 10-fold cross validation: [0.99998176 0.99998176]

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.98      1467
         1.0       0.70      0.85      0.77        95

    accuracy                           0.97      1562
   macro avg       0.84      0.91      0.88      1562
weighted avg       0.97      0.97      0.97      1562

  arr = asanyarray(a)


Hemos conseguido aumentar la f1 de Naive Bayes de 0.86 a 0.88 ajustando la probabiliad del threshold.

### Con cross-validation

In [15]:
X = data.values[:, 0:data.shape[1]-1]
y = data.values[:, data.shape[1]-1]

In [20]:
from sklearn import metrics
import sklearn.model_selection as cv
from sklearn.model_selection import StratifiedShuffleSplit

(X_train, X_test,  y_train, y_test) = cv.train_test_split(X, y, test_size=.3, random_state=1)

#split = StratifiedShuffleSplit(n_splits=1)

# Create a Naïve Bayes classifier object
knc = GaussianNB()

# Train the classifier
knc.fit(X_train, y_train)
y_pred = knc.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98      1467
         1.0       0.63      0.92      0.75        95

    accuracy                           0.96      1562
   macro avg       0.81      0.94      0.86      1562
weighted avg       0.97      0.96      0.97      1562



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=91572d65-ca52-49ec-8072-8fb246390be4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>