In [59]:
# Import libraries
import numpy as np
import pandas as pd
import unicodedata
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score

In [51]:
# Open dataset
jobs_labeled = pd.read_csv('../job-etl/data/csv/job_20221116.csv', usecols=[0, 1, 13], nrows=600)
jobs_labeled.head()

Unnamed: 0,id,title,it_spec
0,3301070910,(Brazil) Junior Business Development Specialis...,0
1,3264776485,(Evento SEnEC) - Desenvolvedor SW Junior - Clo...,1
2,3306116559,(G) Técnico De Suporte - Júnior,1
3,3325496787,[Meios de pagamento] QA Júnior,1
4,3303776344,Academia SAP UTILITIES 2022,0


In [52]:
# Regex Preprocessing

# Lowercase titles
jobs_labeled['clean_title'] = jobs_labeled['title'].str.lower()

# Remove whitespaces
jobs_labeled['clean_title'] = jobs_labeled['clean_title'].str.strip()

# Remove digits
jobs_labeled['clean_title'] = jobs_labeled['clean_title'].str.replace('\d', '')

# Remove special characters
jobs_labeled['clean_title'] = jobs_labeled['clean_title'].str.replace('\(|\)|/|-|\[|\]|\,|:|%','')

# Remove unicode characters
jobs_labeled['clean_title'] = jobs_labeled['clean_title'].map(lambda x: unicodedata.normalize('NFD', x))
jobs_labeled['clean_title'] = jobs_labeled['clean_title'].str.encode('ascii', 'ignore')
jobs_labeled['clean_title'] = jobs_labeled['clean_title'].str.decode('utf-8')

jobs_labeled.sample(10)

  jobs_labeled['clean_title'] = jobs_labeled['clean_title'].str.replace('\d', '')
  jobs_labeled['clean_title'] = jobs_labeled['clean_title'].str.replace('\(|\)|/|-|\[|\]|\,|:|%','')


Unnamed: 0,id,title,it_spec,clean_title
587,3333849701,Remote Junior Python Developer Jobs,1,remote junior python developer jobs
401,3305317615,Engenheiro(a) de Produtos e Projetos Eletrônicos,1,engenheiroa de produtos e projetos eletronicos
503,3298020363,Programador .net Júnior,1,programador .net junior
155,3324234729,Analista de Testes Junior,1,analista de testes junior
317,3276627342,Desenvolvedor Front-End Júnior,1,desenvolvedor frontend junior
203,3315663206,App Dev Architecture - Sao Paulo,1,app dev architecture sao paulo
72,3278328551,Analista de Marketing Digital Junior,0,analista de marketing digital junior
191,3295013691,Analista rpa junior/pleno,1,analista rpa juniorpleno
13,3338928763,Analista Administrativo Junior,0,analista administrativo junior
561,3297653360,Remote Golang Developer Jobs,1,remote golang developer jobs


In [53]:
# Remove stopwords and lemmatize
def remove_stopwords(string):
    # Remove stopwords
    stopwordslist = set(stopwords.words(['english', 'portuguese', 'spanish']))
    meaningful_words = [i for i in string.split() if i not in stopwordslist]
    return ' '.join(meaningful_words).lower()


jobs_labeled['clean_title'] = jobs_labeled['clean_title'].apply(lambda x: remove_stopwords(x))
jobs_labeled.sample(10)

Unnamed: 0,id,title,it_spec,clean_title
530,3312290593,Remote Data Engineer Jobs,1,remote data engineer jobs
223,3315658998,App Dev Architecture - Sao Paulo,1,app dev architecture sao paulo
24,3282179708,Analista Contábil Junior - Espanhol Avançado,0,analista contabil junior espanhol avancado
152,3314239700,Analista de Testes,1,analista testes
200,3334690232,Analyst Support Junior,1,analyst support junior
246,3333421896,Assistente Fiscal,0,assistente fiscal
303,3331700984,Desenvolvedor de Backend Júnior,1,desenvolvedor backend junior
589,3312290620,Remote Junior Software Engineer Jobs,1,remote junior software engineer jobs
568,3333852513,Remote Junior Application Developer Jobs,1,remote junior application developer jobs
232,3271369106,Assistente De Compras Junior -Hortolândia - Sp,0,assistente compras junior hortolandia sp


In [54]:
# Split data into train an test sets
X_train, X_test, y_train, y_test = train_test_split(jobs_labeled['clean_title'],jobs_labeled['it_spec'],test_size=0.5,shuffle=True)

#Word2Vec
# Word2Vec runs on tokenized sentences
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]
X_train_tok

[['desenvolvedor', 'mobile', 'androidkotlin', 'senior'],
 ['remote', 'backend', 'developer', 'jobs'],
 ['app', 'dev', 'architecture', 'sao', 'paulo'],
 ['analista', 'contas', 'pagar', 'junior', 'espanhol', 'avancado'],
 ['remote', 'fullstack', 'developer', 'jobs'],
 ['analista', 'atendimento', 'junior'],
 ['analista', 'suporte', 'junior'],
 ['inside', 'sales', 'jr'],
 ['recreacionista', 'junior', 'atibaia', 'sp'],
 ['junior', 'software', 'architect', 'mckinsey', 'digital'],
 ['assistente', 'desenvolvimento', 'java'],
 ['desenvolvedor', 'net', 'windows', 'forms', 'junior', 'nh', 'rs'],
 ['analista', 'testes', 'junior'],
 ['programador', 'junior'],
 ['desenvolvedor', 'junior', 'fullstack'],
 ['assistente', 'administrativo', 'junior'],
 ['analista', 'contas', 'pagar', 'junior', 'espanhol', 'avancado'],
 ['remote', 'junior', 'c++', 'backend', 'engineer', 'jobs'],
 ['comercial', 'especialidade', 'seguranca', 'junior'],
 ['desenvolvedor', 'junior', 'laravelcssjs'],
 ['remote', 'react', 'nati

In [55]:
#Vectorize data
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

#building Word2Vec model
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))
    def fit(self, X, y):
        return self
    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
jobs_labeled["clean_title_tok"] = [nltk.word_tokenize(i) for i in jobs_labeled['clean_title']]
model = Word2Vec(jobs_labeled["clean_title_tok"],min_count=1)     
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors)) 
modelw = MeanEmbeddingVectorizer(w2v)

# converting text to numerical data using Word2Vec
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_test_tok)

In [60]:
# Logistic regression model for job classification (tf-idf)
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  

# Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.87      0.79      0.83        92
           1       0.91      0.95      0.93       208

    accuracy                           0.90       300
   macro avg       0.89      0.87      0.88       300
weighted avg       0.90      0.90      0.90       300

Confusion Matrix: [[ 73  19]
 [ 11 197]]
AUC: 0.9601797658862876


In [63]:
#Testing classification model on unlabeled dataset
jobs_unlabeled = pd.read_csv('../job-etl/data/csv/job_20221107.csv', usecols=[0, 1], skiprows=range(1,601))  #reading the data
jobs_unlabeled['clean_title'] = jobs_unlabeled['title'].apply(lambda x: remove_stopwords(x)) #preprocess the data
X_test = jobs_unlabeled['clean_title'] 
X_vector = tfidf_vectorizer.transform(X_test) #converting X_test to vector
y_predict = lr_tfidf.predict(X_vector)      #use the trained model on X_vector
y_prob = lr_tfidf.predict_proba(X_vector)[:,1].round(2)
jobs_unlabeled['predict_prob']= y_prob
jobs_unlabeled['auto_class']= y_predict
print(jobs_unlabeled.sample(10))
final=jobs_unlabeled[['id', 'clean_title','predict_prob', 'auto_class']].reset_index(drop=True)
final.to_csv('../job-etl/data/csv/job-auto-class.csv')

             id                                              title  \
103  3315659738                   App Dev Architecture - Sao Paulo   
77   3269594547             Profissional Desenvolvedor Java Junior   
90   3297404677                        Java Engineer - Remote Work   
58   3297650937                        Remote Golang Engineer Jobs   
143  3314276150                 Assistente de Desenvolvimento Java   
14   3271068609  Cod:. J.S - Analista De Controladoria Junior -...   
146  3297652550     Trabajos remotos de Ingeniero Líder Full Stack   
52   3325658033       Analista Contábil Júnior - Espanhol Avançado   
75   3304075129                      Work From Home Java Developer   
105  3301070910  (Brazil) Junior Business Development Specialis...   

                                           clean_title  predict_prob  \
103                   app dev architecture - sao paulo          0.98   
77              profissional desenvolvedor java junior          0.99   
90           

In [64]:
# Join two tables for label comparison
merged_df = jobs_labeled.merge(jobs_unlabeled, how='outer', suffixes=('', '_auto'))
merged_df = merged_df[['id', 'title', 'clean_title','it_spec', 'auto_class', 'predict_prob']].reset_index(drop=True)
merged_df['equal'] = merged_df['it_spec'] == merged_df['auto_class']

In [66]:
merged_df.info(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 716 entries, 0 to 715
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            716 non-null    int64  
 1   title         716 non-null    object 
 2   clean_title   716 non-null    object 
 3   it_spec       600 non-null    float64
 4   auto_class    150 non-null    float64
 5   predict_prob  150 non-null    float64
 6   equal         716 non-null    bool   
dtypes: bool(1), float64(3), int64(1), object(2)
memory usage: 34.4+ KB
