# Criticism Needed

### Imports

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Location of the job-postings CSV, kept in one place so it is easy to repoint.
DATA_PATH = "../input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv"

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summarise the columns (dtype and non-null count) to see which ones need filling.
df.info()

# Feature engineering

### We are going to replace incorrect values and fill in missing values

In [None]:
# Postings without a salary range get the placeholder range "0-0".
df["salary_range"] = df["salary_range"].fillna("0-0")

In [None]:
# Normalise malformed salary ranges.
#  * a bare '40000' becomes the degenerate range '40000-40000';
#  * values that look like dates (presumably ranges auto-converted by a
#    spreadsheet - TODO confirm) are mapped to the sentinel '102-102'.
#
# The result is assigned back to the column instead of calling
# `df.salary_range.replace(..., inplace=True)`: an in-place method on an
# attribute-accessed column is chained assignment, which warns on recent
# pandas and does not modify `df` at all under Copy-on-Write.
df["salary_range"] = df["salary_range"].replace({
    '40000': '40000-40000',
    'Oct-15': '102-102',
    '9-Dec': '102-102',
    '3-Apr': '102-102',
    '4-Apr': '102-102',
    '8-Sep': '102-102',
    '4-Jun': '102-102',
    '10-Oct': '102-102',
    'Oct-20': '102-102',
    'Jun-18': '102-102',
    '11-Nov': '102-102',
    '11-Dec': '102-102',
    '2-Apr': '102-102',
    '2-Jun': '102-102',
    'Dec-25': '102-102',
    '10-Nov': '102-102',
})

In [None]:
# Collapse the '102-102' sentinel into the same "0-0" placeholder used for
# missing salaries. Assigned back rather than replaced in place, because an
# in-place method on `df.salary_range` is chained assignment and is not
# guaranteed to modify `df` on recent pandas.
df["salary_range"] = df["salary_range"].replace({'102-102': '0-0'})

In [None]:
# Fill every missing text/categorical value with an explicit "NO ..." token so
# the absence of a field becomes a feature the models can see.
#
# A single DataFrame.fillna with a per-column mapping replaces the original
# `df.<col>.fillna(..., inplace=True)` calls: in-place methods on an
# attribute-accessed column are chained assignment and do not reliably modify
# `df` on recent pandas (they are a no-op under Copy-on-Write).
df = df.fillna(value={
    "location": "NO LOCATION",
    "department": "NO DEPARTMENT",
    "company_profile": "NO PROFILE",
    "description": "NO DESCRIPTION",
    "requirements": "NO REQUIREMENTS",
    "benefits": "NO BENEFITS",
    "employment_type": "NO EMP_TYPE",
    "required_experience": "NO EXPERIENCE",
    "required_education": "NO EDUCATION",
    "industry": "NO INDUSTRY",
    "function": "NO FUNCTIONS",
})

# Turn the 0/1 flag columns into descriptive strings (same reason for
# assigning back instead of using inplace=True).
df["telecommuting"] = df["telecommuting"].replace({0:"NO telecommuting",1:"YES telecommuting"})
df["has_company_logo"] = df["has_company_logo"].replace({0:"NO logo",1:'YES logo'})
df["has_questions"] = df["has_questions"].replace({0:"NO questions",1:"Yes question"})

### Function to get the predicted probability of the output from each column
I've used a count vectorizer on the strings and then fitted a logistic regression on the result to get the predicted probability.

In [None]:
def get_feat_prob(X,y):
    """Turn one text column into a single "probability of class one" feature.

    The column is split 60/40, a bag-of-words ``CountVectorizer`` and a
    ``LogisticRegression`` are fitted on the 60 % part, the hold-out score and
    confusion matrix are printed, and the fitted model then scores every row
    of ``X``.

    Parameters
    ----------
    X : pandas.Series (or other sequence) of str
        Raw text of one column.
    y : array-like
        Target aligned row-for-row with ``X``. The output column naming
        assumes exactly two classes.

    Returns
    -------
    pandas.Series
        Named ``'one'``: the second column of ``predict_proba`` (the
        probability of label 1 for a 0/1 target), indexed like ``X`` when
        ``X`` has an index.

    Notes
    -----
    The returned probabilities are in-sample for the 60 % of rows the model
    was trained on, so any model stacked on top of this feature will look
    more accurate than it would be on unseen postings.
    """
    trn,tst,trn_o,tst_o = train_test_split(X, y, test_size=0.4, random_state=20)

    # Learn the vocabulary from the training rows only, so the hold-out score
    # printed below is not influenced by words seen only in the hold-out rows.
    vec = CountVectorizer(max_features=10000)
    trn_abs = vec.fit_transform(trn)
    tst_abs = vec.transform(tst)
    X_abs = vec.transform(X)

    clf = LogisticRegression(C = 10, n_jobs=-1)
    clf.fit(trn_abs, trn_o)

    # Hold-out diagnostics. Micro-averaged F1 on a binary target is the same
    # number as plain accuracy.
    tst_preds = clf.predict(tst_abs)
    print("score: ", f1_score(tst_o, tst_preds, average='micro'))
    print("confusion matrix: \n\n", confusion_matrix(tst_o, tst_preds))

    # Score every row and keep X's index so the result lines up with the source
    # frame when it is concatenated with other features.
    all_preds = clf.predict_proba(X_abs)
    all_preds_df = pd.DataFrame(all_preds, columns=['zero','one'],
                                index=getattr(X, "index", None))

    return all_preds_df['one']
    
# Target column; passed to get_feat_prob and clustering_approach below.
y = df['fraudulent']

In [None]:
# Run get_feat_prob on every object-dtype column among the first 18, in
# column order, collecting one probability Series per column.
each_proba = []
for col in df.columns[:18]:
    if df[col].dtype != 'object':
        continue  # non-string column: nothing to vectorize
    print(f"\n----------- {col} -----------")
    each_proba.append(get_feat_prob(df[col],y))

In [None]:
# Stitch the first seven per-column probability Series side by side into one frame.
first_seven = each_proba[:7]
each_proba_df = pd.concat(first_seven, axis="columns")

In [None]:
# Label each probability column with the name of its source column.
# NOTE(review): assumes the first seven object-dtype columns of df are exactly
# df.columns[1:8] - confirm if the column layout changes.
each_proba_df.columns = list(df.columns[1:8])

In [None]:
# Preview the first rows of the per-column probability features.
each_proba_df.head()

I initially forgot to use the **benefits** column; when I added it, I noticed that the accuracy decreased, so I continued without it.

and the rest of the columns haven't given good results as shown above so i have used them in another way below


I hadn't yet used the **'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education'** columns, so I thought of using them here. Doing so improved our accuracy in predicting the fraud data to 95%, so I have kept it (overall accuracy was 99% across both fraud and non-fraud).

In [None]:
# Columns from position 9 up to (not including) the last three: the remaining
# flag / categorical columns that were not turned into text probabilities.
left = df.iloc[:,9:-3]

# One-hot encode them and fit a gradient-boosted classifier on a 70/30 split.
X_left = pd.get_dummies(left)
y = df.fraudulent

X_trainl,X_testl,y_trainl,y_testl = train_test_split(X_left,y, test_size=0.3,random_state=42)

clf_left = XGBClassifier()
clf_left.fit(X_trainl,y_trainl)
print(clf_left.score(X_testl,y_testl))
print(confusion_matrix(y_testl,clf_left.predict(X_testl)))

# Predicted label for every row, kept as a one-column feature frame.
# The column gets a string name and X_left's index: an unnamed column would be
# labelled with the integer 0, and once concatenated with string-named columns
# the mixed str/int feature names are rejected by recent scikit-learn.
# NOTE: predictions for the training rows are in-sample, so this feature is
# optimistic for 70 % of the data.
X_col_left = pd.DataFrame({"left_pred": clf_left.predict(X_left)}, index=X_left.index)

### The function below uses a clustering-style approach: it splits our data into as many groups as it can (two in our case, the labels predicted by a KNN classifier) and applies the passed models to each group so that we can see what works best

In [None]:
def clustering_approach(X,y, models,type = "none"):
    """Split the rows into groups with a KNN classifier and evaluate models per group.

    A ``KNeighborsClassifier`` is fitted on ``(X, y)`` and used to predict a
    label for every row of ``X``; rows sharing a predicted label form one
    group. Within each group the features are standardised, split 60/40, and
    every model in ``models`` is fitted and evaluated (hold-out score,
    confusion matrix, mean cross-validation score), with the results printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target aligned with ``X`` on the index.
    models : iterable of estimators
        Templates to evaluate. Each one is cloned per group, so the objects
        passed in are left unfitted.
    type : str, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test, knn_clf, models_out]``. The first
        four are dicts keyed by group label; ``models_out`` maps
        ``str(model) + str(group)`` to the estimator fitted on that group.

    Notes
    -----
    The per-group ``StandardScaler`` is fitted on all rows of the group before
    the train/test split and is not returned.
    """
    X_cls = {}
    y_cls = {}
    X_scaled = {}
    X_train, X_test, y_train, y_test = {},{},{},{}
    models_out = {}

    # Fit the grouping classifier and tag every row with its predicted label.
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X,y)
    data = pd.concat([X,y],axis=1) # so we can later separate x and y for each group
    data['knn_clf'] = knn_clf.predict(X)

    for cls in knn_clf.classes_:
        print("--------------The {} cluster's results-------------------".format(cls),end="\n\n")
        # Rows of this group, with the helper 'knn_clf' column dropped again.
        members = data[data['knn_clf'] == cls].iloc[:,:-1]
        if members.empty:
            # The classifier never predicted this label; nothing to train on.
            print("no rows were assigned to this cluster, skipping", end="\n\n")
            continue

        # The last remaining column is the target, everything before it is X.
        X_cls[cls] = members.iloc[:,:-1]
        y_cls[cls] = members.iloc[:,-1]
        X_scaled[cls] = StandardScaler().fit_transform(X_cls[cls])

        X_train[cls],X_test[cls],y_train[cls],y_test[cls] = train_test_split(X_scaled[cls],y_cls[cls],test_size=0.4,random_state=42)
        print(y_train[cls].value_counts())
        print(y_test[cls].value_counts())

        for model in models:
            # Fit a fresh copy per group. Refitting the same instance would
            # make every stored entry for this model point at the object
            # fitted on the last group only.
            fitted = clone(model, safe=False)
            fitted.fit(X_train[cls], y_train[cls])
            y_pred = fitted.predict(X_test[cls])
            print(fitted)
            print(fitted.score(X_test[cls],y_test[cls]))
            # NOTE: arguments are (predicted, true), so rows are predicted
            # labels and columns are true labels - the transpose of
            # scikit-learn's usual (true, predicted) layout.
            print(confusion_matrix(y_pred, y_test[cls]), end="\n\n")
            print((cross_val_score(fitted,X_scaled[cls],y_cls[cls])).mean())
            models_out[str(fitted) + str(cls)] = fitted

    return [X_train, X_test, y_train, y_test,knn_clf, models_out]
        

In [None]:
# Final feature matrix: text-probability features next to the prediction
# made from the remaining categorical columns.
feature_parts = [each_proba_df, X_col_left]
X_new = pd.concat(feature_parts, axis="columns")

In [None]:
# Candidate classifiers to try on each group.
models = [
    LogisticRegression(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    XGBClassifier(verbosity = 0),
    LGBMClassifier(),
    SVC(),
    GaussianNB(),
]

# Note: `models` is rebound here from the list above to the dict of fitted
# estimators returned by clustering_approach.
(X_train, X_test, y_train, y_test, clusterer, models) = clustering_approach(
    X_new, y, models
)

#### I've chosen to go with GaussianNB and LGBMClassifier for the respective clusters

In [None]:
# Element-wise sum of two 2x2 confusion matrices.
# NOTE(review): the numbers are hard-coded, presumably copied from the printed
# output of the two chosen models - they will not update if the cells above
# are re-run.
m = [
    [6577 + 3, 2 + 11],
    [218 + 9, 312 + 20],
]


In [None]:
# Diagonal (agreeing predictions) over the total number of hold-out rows.
(m[0][0] + m[1][1]) / sum(map(sum, m))

# accuracy

In [None]:
# Share of the second column of m that falls on the diagonal.
# NOTE(review): if m is laid out as printed by confusion_matrix(y_pred, y_true)
# (rows = predicted, columns = true), this is TP / (FN + TP) - confirm.
m[1][1] / (m[0][1] + m[1][1])

# accuracy of predicting frauds correctly

## Does anyone have any suggestions on which of the above 2 models would be best and why?

I've selected GaussianNB and LGBMClassifier because they increase the accuracy of predicting frauds correctly. Using the other models raises our general accuracy to 99.4% or 99.5%, but our accuracy of predicting frauds drops to 90% or 91%.

Our goal should be to protect people from frauds rather than to classify some real, but perhaps improperly described, job offers as real.

## Thank you

### Do leave an upvote if this was worth your time