##  [Real or Fake] : Fake Job Description Prediction


Этот набор данных содержит 18 тысяч описаний должностей, из которых около 800 - поддельные. Данные состоят из текстовой информации и метаинформации о вакансиях.

Задача - обучить модель классифицировать вакансии как реальные или мошеннические.

### О столбцах

1. job_id - уникальный идентификатор вакансии;
2. title - заголовок объявления;
3. location - географическое расположение объявления о работе;
4. department - корпоративный отдел (например, продажи);
5. salary_range - ориентировочный диапазон заработной платы (например, 50 000-60 000);
6. company_profile - краткое описание компании;
7. description - подробное описание объявления о работе;
8. requirements - перечислены требования для вакансии;
9. benefits - перечислены предлагаемые льготы;
10. telecommuting - верно для удаленных должностей;
11. has_company_logo - верно, если присутствует логотип компании;
12. has_questions - верно, если присутствуют проверочные вопросы;
13. employment_type - вид занятости;
14. required_experience - необходимый опыт;
15. required_education - необходимое образование;
16. industry - индустрия;
17. function - выполняемая функция;
18. fraudulent - показывает мошенническая ли вакансия.

### Импорт библиотек

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#!pip install wordcloud

In [None]:
from functools import reduce

import matplotlib.pyplot as plt

import seaborn as sns

import time

import re

import string

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression,LinearRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, model_selection, pipeline, tree
from sklearn.metrics import f1_score, roc_auc_score,classification_report,confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud,STOPWORDS

from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN

import warnings
warnings.filterwarnings("ignore")

### Загрузка датасета и его анализ

In [None]:
# Read the job postings CSV into a DataFrame and preview the first rows.
csv_path = '/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv'
df = pd.read_csv(csv_path)

df.head()

In [None]:
df.describe()

In [None]:
df.info()

#### Вычислим процент пустых значений

In [None]:
# Share of missing values per column, in percent, rounded to one decimal.
missing_counts = df.isna().sum()
pr = round(missing_counts / len(df['job_id']) * 100, 1)
pr

#### Удалим столбцы со слишком большим количеством NaN, а также столбец job_id, так как он хранит только идентификатор вакансии.

In [None]:
# Drop every column in which more than 50% of the values are missing.
# Iterating over (label, value) pairs avoids positional `pr[i]` look-ups on a
# label-indexed Series, which pandas deprecates in favour of explicit `.iloc`.
indexes = list(pr.index)
for column_name, missing_pct in pr.items():
    if missing_pct > 50:
        print(column_name)
        del df[column_name]

In [None]:
del df['job_id']


In [None]:
df.info()

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
# Correlation is only defined for numeric data. Selecting those columns
# explicitly keeps this working on pandas versions where `DataFrame.corr()`
# no longer silently skips text columns.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), ax=ax)

#### С корреляцией всё в порядке. 


#### Теперь избавимся от NaN в столбцах  и преобразуем все входные данные в один столбец

In [None]:
# Merge all feature columns of every row into a single text string.
columns = list(df.columns)

# Non-text feature columns (everything that is not object dtype, except the
# target). The builtin `object` replaces the `np.object` alias, which has been
# removed from NumPy.
colNotStr = [
    name for name in columns
    if df[name].dtype != object and name != 'fraudulent'
]

# Author's note: filling NaN with 'nan' + column name did not change model
# quality but increased the number of coefficients, so it is not applied.

# Author's note: appending the column name to each non-text value (e.g.
# "1has_questions") gave slightly better results.
for name in colNotStr:
    df[name] = df[name].apply(lambda value, suffix=name: str(value) + suffix)

# Concatenate the cells of each row; any non-string cell (NaN, the integer
# target) is replaced by a single space.
text = []
for row in df.itertuples(index=False, name=None):
    parts = [cell if isinstance(cell, str) else ' ' for cell in row]
    text.append(''.join(parts))

len(text)

In [None]:
df.info()

In [None]:
def clean_text(text):
    """Strip markup-like noise from a text string.

    Removes, in this order: fragments in square brackets, links starting with
    ``http://``, ``https://`` or ``www.``, angle-bracket tags, ASCII
    punctuation, and newline characters. Everything is replaced by the empty
    string, so neighbouring words may end up joined.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings keep the regex escapes (\[, \S, \.) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)

    return text


# Run the cleaning function over every merged row string.
text = list(map(clean_text, text))



In [None]:
df['text'] = text

In [None]:
# Remove every original column except the target; their content now lives in
# the merged `text` column.
for column_name in [name for name in columns if name != 'fraudulent']:
    del df[column_name]

In [None]:
df.head()

#### Я собрал все текстовые столбцы в один, причём заменяя значения NaN на пустые строки

In [None]:
df['text'][0]

In [None]:

# Word cloud built from the text of postings that are not fraudulent (0).
real_corpus = " ".join(df.loc[df.fraudulent == 0, 'text'])
plt.figure(figsize=(20, 20))
wc = WordCloud(width=1600, height=800, max_words=3000).generate(real_corpus)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from the text of fraudulent postings (1).
fake_corpus = " ".join(df.loc[df.fraudulent == 1, 'text'])
plt.figure(figsize=(20, 20))
wc = WordCloud(width=1600, height=800, max_words=3000).generate(fake_corpus)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Plot the class distribution and print the size of each class.
sns.countplot(x='fraudulent', data=df)
fake_count = int((df['fraudulent'] == 1).sum())
real_count = int((df['fraudulent'] == 0).sum())
print('количество фальшивых вакансий: ', fake_count)
print('количество правдивых вакансий: ', real_count)

#### Фальшивых вакансий в датасете значительно меньше, чем настоящих. Попробуем обучить модель с добавлением новых данных (методы передискретизации) и без него. Сделаем это на модели, которая покажет лучшие результаты относительно остальных на выборке без дополнительных данных.

Используемые методы передискретизации: RandomOverSampler, SMOTE, ADASYN.

In [None]:



# Features are the merged text, the target is the `fraudulent` flag.
X, y = df['text'], df['fraudulent']

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

In [None]:
# Learn the bag-of-words vocabulary from the training texts only.
cv = CountVectorizer().fit(x_train)

len(cv.vocabulary_)

In [None]:
x_train = cv.transform(x_train)


In [None]:
x_test = cv.transform(x_test)

In [None]:
# Oversample the training set with SMOTE and report class counts before/after.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=2)
# `fit_resample` is the sampler method in imbalanced-learn; the older
# `fit_sample` alias has been removed.
X_train_res, y_train_res = sm.fit_resample(x_train, y_train)


print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

In [None]:
# Oversample the training set with RandomOverSampler and report class counts
# before/after.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = RandomOverSampler(random_state=2)
# `fit_resample` is the sampler method in imbalanced-learn; the older
# `fit_sample` alias has been removed.
X_train_res2, y_train_res2 = sm.fit_resample(x_train, y_train)


# Report the labels produced by THIS sampler (y_train_res2).
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res2==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res2==0)))

In [None]:
# Oversample the training set with ADASYN and report class counts before/after.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = ADASYN(random_state=2)
# `fit_resample` is the sampler method in imbalanced-learn; the older
# `fit_sample` alias has been removed.
X_train_res3, y_train_res3 = sm.fit_resample(x_train, y_train)


# Report the labels produced by THIS sampler (y_train_res3).
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res3==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res3==0)))

## Обучение моделей

### LogisticRegression

#### Подбор параметров

In [None]:


def search(model,tuned_parameters,score = "f1"):
    """Grid-search `model` over `tuned_parameters` and print the results.

    Fits a GridSearchCV on the module-level `x_train` / `y_train`, prints the
    best parameters and the mean/spread of the cross-validation score for
    every candidate, then prints a classification report computed on the
    module-level `x_test` / `y_test`.

    Args:
        model: Estimator to tune.
        tuned_parameters: Parameter grid passed to GridSearchCV.
        score: Name of the scoring metric.
    """
    print("# Tuning hyper-parameters for %s" % score)
    print()

    grid = GridSearchCV(model, tuned_parameters, scoring='%s' % score, n_jobs=-1)
    grid.fit(x_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(grid.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    cv_results = grid.cv_results_
    rows = zip(
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
        cv_results['params'],
    )
    for mean_score, score_std, candidate in rows:
        # The spread is printed as two standard deviations.
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, score_std * 2, candidate))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    predictions = grid.predict(x_test)
    print(classification_report(y_test, predictions))
    print()

In [None]:
def visualize_coefficients(classifier, feature_names,coef = None, n_top_features=30):
    """Bar-plot the most negative and most positive coefficients of a model.

    Args:
        classifier: Fitted model; its `coef_` attribute is used when `coef`
            is not given.
        feature_names: Sequence of feature names, indexed like the
            coefficients.
        coef: Optional explicit coefficient values. They are flattened to one
            dimension.
        n_top_features: How many coefficients to show on each side. It is
            capped at the number of available coefficients.
    """
    if coef is None:
        coef = classifier.coef_
    coef = np.asarray(coef).ravel()

    # Never ask for more features per side than exist; otherwise the bar
    # positions and bar heights would have different lengths.
    n_top_features = max(0, min(n_top_features, coef.shape[0]))

    # Sort once and take both tails from the same ordering.
    order = np.argsort(coef)
    negative_coefficients = order[:n_top_features]
    # Explicit start index: `order[-0:]` would select the whole array.
    positive_coefficients = order[order.shape[0] - n_top_features:]
    interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])

    # Plot them: red for negative values, blue otherwise.
    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in coef[interesting_coefficients]]
    plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(1, 1 + 2 * n_top_features), feature_names[interesting_coefficients], rotation=60, ha="right");
    

In [None]:
def fitModel(model,x_train=x_train,y_train=y_train,x_test=x_test,y_test=y_test):
    """Fit `model`, evaluate it on the test data and draw a confusion matrix.

    The default data arguments are bound to the module-level split at the
    moment this function is defined.

    Args:
        model: Estimator exposing `fit` and `predict`.
        x_train, y_train: Training features and labels.
        x_test, y_test: Evaluation features and labels.

    Returns:
        dict with keys 'model' (the fitted estimator), 'f1_score',
        'roc_auc_score', 'report' (classification report text) and
        'time_fit' (fit duration in seconds).
    """
    # perf_counter is a monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    lr = model
    lr.fit(x_train, y_train)
    tim = time.perf_counter() - start_time
    print("---time_fit model %s seconds ---" % (tim))
    preds = lr.predict(x_test)
    # NOTE(review): the AUC is computed from hard class predictions, not from
    # scores/probabilities; kept as is so reported numbers stay comparable.
    auc = roc_auc_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    # Pin the label set so the two target names and the 2x2 frame below always
    # match the metric output, even if one class is missing from the
    # predictions.
    report = classification_report(y_test, preds, labels=[0, 1], target_names=['0', '1'])

    cm_1 = confusion_matrix(y_test, preds, labels=[0, 1])
    cm_1 = pd.DataFrame(cm_1, index=[0,1], columns=[0,1])
    cm_1.index.name = 'Actual'
    cm_1.columns.name = 'Predicted'
    plt.figure(figsize = (10,10))
    sns.heatmap(cm_1,cmap= "Blues",annot = True, fmt='')
    return {'model':lr,'f1_score':f1,'roc_auc_score':auc,'report':report,'time_fit':tim}

In [None]:
# Hyper-parameter grid for logistic regression.
parameters = [
    {'penalty': ['l2'], "solver": ['newton-cg', 'lbfgs', 'liblinear'],
     'C': [0.1, 1, 10, 100, 1000]},
    # Unpenalised model: the liblinear solver does not support it, and C has
    # no effect without a penalty, so neither is searched here.
    # NOTE(review): recent scikit-learn spells this option `None` instead of
    # the string 'none' - confirm against the installed version.
    {'penalty': ['none'], "solver": ['newton-cg', 'lbfgs']},
]
search(LogisticRegression(),parameters)

{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}


In [None]:
# Fit logistic regression with the selected parameters and plot its strongest
# coefficients.
l = fitModel(LogisticRegression(C=1,solver = 'liblinear'))
# Newer scikit-learn exposes the vocabulary via get_feature_names_out();
# fall back to the older get_feature_names() when it is not available.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
visualize_coefficients(l['model'], feature_names)
# Collected results of all fitted models, in fitting order.
listt = []
listt.append(l)

### SVC

In [None]:
# Hyper-parameter grid for the support vector classifier, one entry per kernel.
c_grid = [1, 10, 100, 1000]
gamma_grid = [1e-3, 1e-4]
parameters = [
    {'kernel': ['rbf'], 'gamma': gamma_grid, 'C': c_grid},
    {'kernel': ['linear'], 'C': c_grid},
    {'kernel': ['poly'], 'gamma': gamma_grid, 'C': c_grid},
]

search(SVC(), parameters)

In [None]:
# Fit the SVC with fixed parameters and store its result.
svc_model = SVC(kernel='rbf', C=100, gamma=0.0001)
l = fitModel(svc_model)
listt.append(l)

### KNeighborsClassifier

In [None]:
# Hyper-parameter grid for k-nearest neighbours.
parameters = {
    "n_neighbors": [3, 5, 10, 15, 20, 30, 50, 80],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}
search(KNeighborsClassifier(), parameters)

In [None]:
# Fit k-nearest neighbours with fixed parameters and store its result.
knn_model = KNeighborsClassifier(n_neighbors=20, weights='distance', metric='manhattan')
l = fitModel(knn_model)
listt.append(l)


### DecisionTreeClassifier

In [None]:
# Hyper-parameter grid for the decision tree.
parameters = [{'criterion':['gini', 'entropy'],
               'splitter':['best', 'random'],
               'max_depth':[20,30,40,50,80],
               # 'sqrt' is the explicit spelling of what 'auto' meant for
               # DecisionTreeClassifier; 'auto' is rejected by newer
               # scikit-learn releases.
               'max_features':[None,'sqrt'],
               'class_weight':['balanced',None]
              }]
search(DecisionTreeClassifier(),parameters)

In [None]:
# Fit the decision tree with fixed parameters and store its result.
tree_model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=80)
l = fitModel(tree_model)
listt.append(l)

In [None]:
# Draw the top levels of the fitted decision tree on a large canvas.
fig = plt.gcf()
fig.set_size_inches(75,50)
# Newer scikit-learn exposes the vocabulary via get_feature_names_out();
# fall back to the older get_feature_names() when it is not available.
feature_names = list(cv.get_feature_names_out()
                     if hasattr(cv, 'get_feature_names_out')
                     else cv.get_feature_names())
# Assigning the result keeps the notebook from echoing the returned
# annotation list, replacing the former dummy `' '` expression.
tree_annotations = tree.plot_tree(l['model'], feature_names=feature_names, filled=True,
                                  max_depth=7, rounded=True, fontsize=40)

### MultinomialNB

In [None]:
# Hyper-parameter grid for multinomial naive Bayes.
nb_grid = {
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001],
    'fit_prior': [True, False],
}
parameters = [nb_grid]
search(MultinomialNB(), parameters)

In [None]:
# Fit multinomial naive Bayes with fixed smoothing.
nb_model = MultinomialNB(alpha=0.0001)
l = fitModel(nb_model)

In [None]:
# Plot the strongest per-feature weights of the naive Bayes model.
# Newer scikit-learn exposes the vocabulary via get_feature_names_out();
# fall back to the older get_feature_names() when it is not available.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
# MultinomialNB no longer provides `coef_` in newer scikit-learn. For a
# two-class model that attribute mirrored the second row of
# `feature_log_prob_`, so that row is passed explicitly.
visualize_coefficients(l['model'], feature_names, coef=l['model'].feature_log_prob_[1])
listt.append(l)

### SGDClassifier

In [None]:


# Hyper-parameter grid for the SGD linear classifier.
sgd_losses = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
parameters = [{
    'loss': sgd_losses,
    'max_iter': [100, 1000, 2000],
    'tol': [10**(-3), 10**(-2), 10**(-4)],
}]
search(SGDClassifier(), parameters)

In [None]:
# Presumably the best grid-search result:
# {'loss': 'log', 'max_iter': 2000, 'tol': 0.001}
l = fitModel(SGDClassifier(loss='log',max_iter = 2000,tol = 0.001))
# Newer scikit-learn exposes the vocabulary via get_feature_names_out();
# fall back to the older get_feature_names() when it is not available.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
visualize_coefficients(l['model'], feature_names)
listt.append(l)

### MLP 

Без подбора параметров, так как слишком долго обучается.

In [None]:

# Multi-layer perceptron with a single hidden layer of 8 units.
mlp = MLPClassifier(
    hidden_layer_sizes=8,
    solver='adam',
    alpha=0.01,
    max_iter=1000,
    random_state=0,
)
l = fitModel(mlp)

In [None]:
listt.append(l)

## Выбор лучшей модели

In [None]:
# Transpose the list of result dicts into one list per metric.
results = [[x[i] for x in listt] for i in ['model','f1_score', 'roc_auc_score','time_fit']]
model,f1_scor, auc_scor, time_fit = results
# Use the class name directly instead of slicing the estimator's repr, which
# would drop a character whenever the repr contains no "(".
model = [type(estimator).__name__ for estimator in model]

In [None]:
# Horizontal bar chart comparing f1, AUC and relative fit time per model.
indices = np.arange(len(f1_scor))

# Fit time scaled to [0, 1] by the slowest model so it shares the score axis.
training_time = np.array(time_fit) / np.max(time_fit)

plt.figure(figsize=(12, 10))
plt.title("Score")

bar_series = (
    (0, f1_scor, "f1 score", 'navy'),
    (.15, auc_scor, "auc score", 'darkorange'),
    (.3, training_time, "training time", 'c'),
)
for offset, values, label, color in bar_series:
    plt.barh(indices + offset, values, .1, label=label, color=color)

plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25, top=.95, bottom=.05)

# Write each model name to the left of its group of bars.
for position, name in zip(indices, model):
    plt.text(-.3, position, name)

plt.show()

In [None]:
auc_scor.index(max(auc_scor))

In [None]:
f1_scor.index(max(f1_scor))

In [None]:
auc_scor[4],auc_scor[6] 

In [None]:
f1_scor[4],f1_scor[6]

#### оценки почти одинаковые, посмотрим на время обучения

In [None]:
time_fit[4],time_fit[6]

In [None]:
model[4] #лучшая модель

## Обучение на различных данных

#### Посмотрим как ведёт себя модель при добавлении фальшивых данных.

In [None]:
listt2=[]

In [None]:

# Naive Bayes trained on the SMOTE-resampled training set.
nb_smote = MultinomialNB(alpha=0.0001)
l = fitModel(nb_smote, X_train_res, y_train_res)
listt2.append(l)

In [None]:
# Naive Bayes trained on the RandomOverSampler-resampled training set.
nb_random = MultinomialNB(alpha=0.0001)
l = fitModel(nb_random, X_train_res2, y_train_res2)
listt2.append(l)

In [None]:
# Naive Bayes trained on the ADASYN-resampled training set.
nb_adasyn = MultinomialNB(alpha=0.0001)
l = fitModel(nb_adasyn, X_train_res3, y_train_res3)
listt2.append(l)

In [None]:
# One list per metric for the models trained on resampled data.
metric_keys = ['model', 'f1_score', 'roc_auc_score', 'time_fit']
results = [[entry[key] for entry in listt2] for key in metric_keys]
model, f1_scor, auc_scor, time_fit = results

In [None]:
# Element-wise check: is the baseline f1 higher than each resampled model's?
# Converting to an array keeps the comparison valid when the metric is a
# plain Python float, for which `float > list` raises TypeError.
listt[4]['f1_score'] > np.asarray(f1_scor)

In [None]:
# Element-wise check: is the baseline AUC higher than each resampled model's?
# Converting to an array keeps the comparison valid when the metric is a
# plain Python float, for which `float > list` raises TypeError.
listt[4]['roc_auc_score'] > np.asarray(auc_scor)

In [None]:
for i in listt2:
    print(i['report'])

In [None]:
print(listt[4]['report'])

#### Без добавления новых данных модель получается более точная.

## Проверка на переобучение

In [None]:
from matplotlib.patches import Patch
def plott(model,XX,yy,step=5000):
    """Plot a learning curve (f1 on train vs. validation) for growing samples.

    For each sample size the first `size` rows of `XX` / `yy` are split into
    a training part and a 15% test part. The training part is then split five
    times (different seeds) into train and validation subsets; `model` is
    refitted on each and scored with f1 on the validation, test and train
    subsets. Averages are printed, and the individual validation (red) and
    train (green) scores are drawn as points.

    Texts are vectorised with the module-level `cv`.

    Args:
        model: Estimator exposing `fit` and `predict`; it is refitted in place.
        XX: Raw texts, sliceable with `XX[:n]`.
        yy: Labels aligned with `XX`.
        step: Increment between consecutive sample sizes.
    """
    ax = plt.subplot(111)
    red = Patch(color='red', label='Validation')
    green = Patch(color='green', label='Train')
    plt.legend(handles=[red,green])
    ax.set_title('Learning curve')

    ax.set_xlabel('Sample size')
    ax.set_ylabel('f1 score')
    ax.set_ylim(bottom=0, top=1.02)

    # Sample sizes: 500, 500 + step, ... and finally the full data set.
    sample_sizes = list(range(500, len(XX), step))
    sample_sizes.append(len(XX))

    for size in sample_sizes:
        # 15% of the current sample is held out as the test set.
        x_tr, x_te, y_tr, y_te = train_test_split(
            XX[:size], yy[:size], test_size=0.15, random_state=0
        )
        # Only the test part is vectorised here; the training part is
        # vectorised after the inner split below.
        x_te1 = cv.transform(x_te)

        val_scores = []
        test_scores = []
        train_scores = []
        for seed in [1, 3, 5, 10, 24]:
            # NOTE(review): the original comment called this "another 15% of
            # the initial sample", but 0.1275 of the remaining 85% is about
            # 10.8% of it. The value is left unchanged.
            x_tr2, x_te2, y_tr2, y_te2 = train_test_split(
                x_tr, y_tr, test_size=0.1275, random_state=seed
            )
            x_tr2 = cv.transform(x_tr2)
            x_te2 = cv.transform(x_te2)

            model.fit(x_tr2, y_tr2)
            val_scores.append(f1_score(y_te2, model.predict(x_te2)))
            test_scores.append(f1_score(y_te, model.predict(x_te1)))
            train_scores.append(f1_score(y_tr2, model.predict(x_tr2)))

        valScore = sum(val_scores) / len(val_scores)      # f1 on validation
        valScore2 = sum(test_scores) / len(test_scores)   # f1 on test
        valScor = sum(train_scores) / len(train_scores)   # f1 on train
        print()
        print('average f1')
        print('on validation: ',valScore,
              'on test: ',valScore2,
              'on train: ',valScor)

        # One point per seed at this sample size.
        xs = [size] * len(val_scores)
        ax.scatter(xs, val_scores, color="red", alpha=0.5)
        ax.scatter(xs, train_scores, color="green", alpha=0.5)
    plt.show()

In [None]:
plott(listt[4]['model'],X,y,1000)

#### Похоже, что переобучение отсутствует: оценки на тренировочной и валидационной выборках сблизились и к концу не разошлись. При этом, судя по графику, для обучения модели достаточно и меньшей обучающей выборки.

#### Попробуем обучить модель с меньшей тренировочной выборкой.

In [None]:
# Refit the selected model with a smaller training share (65% / 35% split).
model = listt[4]['model']
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    X, y, random_state=0, test_size=0.35
)
x_train2, x_test2 = cv.transform(x_train2), cv.transform(x_test2)
l = fitModel(model, x_train2, y_train2, x_test2, y_test2)

In [None]:
# Print the main metrics of the last fit.
for metric_name in ('f1_score', 'roc_auc_score', 'report'):
    print(metric_name, l[metric_name])

In [None]:
model

### Лучше всех оказалась модель MultinomialNB c гиперпараметром alpha=0.0001, и остальными по умолчанию.