In [None]:
pip install spacy

In [43]:
# remover .txt e .lab caso o .lab correspondente esteja vazio

import os
import shutil

# Set the source and destination directories
src_dir = 'E:\\Renato\\desktop-20230306\\resumes_corpus'
dest_dir = 'E:\\Renato\\desktop-20230306\\empty'

# Loop through all the files in the source directory
for file in [f for f in os.listdir(src_dir) if f.endswith(".lab")]:
    src_file = os.path.join(src_dir, file)
    dest_file = os.path.join(dest_dir, file)
    file_twin = file.split(".")[0] + ".txt"
    src_file_twin = os.path.join(src_dir, file_twin)
    dest_file_twin = os.path.join(dest_dir, file_twin)

    # Check if the file is empty
    if os.path.getsize(src_file) == 0:
        print(src_file)
        shutil.move(src_file, dest_file)
        shutil.move(src_file_twin, dest_file_twin)

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import spacy
import re
import unidecode
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from tqdm import tqdm

nlp = spacy.load('en_core_web_sm')

def createDataFrame(path, extension, label, delimiter):
    # Create an empty DataFrame
    df = pd.DataFrame()
    vazios = []

    docs_names = os.listdir(path)    
    txt_files = [d for d in docs_names if d.endswith(extension)]

    with tqdm(total=len(txt_files), file=sys.stdout, colour='red', \
        desc='Loading ') as pbar:

        # Loop through all the files in the directory
        for filename in os.listdir(path):
            if filename.endswith(extension):
                # Read the file into a DataFrame
                file_path = os.path.join(path, filename)
                if delimiter:
                    temp_df = pd.read_csv(file_path, delimiter='\t', encoding = 'ISO-8859-1', header=None)
                else:
                    temp_df = pd.read_csv(file_path, encoding = 'ISO-8859-1', header=None)
                # Add a new column with the filename
                temp_df['filename'] = filename.split(sep='.')[0]
            
                # Append the DataFrame to the main DataFrame
                df = pd.concat([df, temp_df])
                pbar.update(1)

    # Reset the index to start from 0
    df = df.reset_index(drop=True)
    df = df.rename(columns={0:label})
    df['filename']=df['filename'].astype(str)
    print(f"Vazios:  {len(vazios)}")
    return df

def preprocessing(txt):
    texts_pre = []
    texts_preprocessed = []
    spacy_stopwords = nlp.Defaults.stop_words
    stemmer = PorterStemmer()
    # convert all characters in the string to lower case
    txt = txt.lower()
    # remove non-english characters, punctuation and numbers
    txt = re.sub('[^a-zA-Z0-9]', ' ', txt) #
    txt = re.sub('http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub('RT|cc', ' ', txt)  # remove RT and cc
    txt = re.sub('#\S+', '', txt)  # remove hashtags
    txt = re.sub('@\S+', '  ', txt)  # remove mentions
    txt = re.sub('\s+', ' ', txt)  # remove extra whitespace
    txt = txt.replace('\n', ' ')
    txt = unidecode.unidecode(txt)
    doc = nlp(txt)
    tokens = [t.text.lower() for t in doc]
    text = ' '.join(tokens) 
    text_without_stopword = [word for word in text.split() if word not in spacy_stopwords]
    txt_final = [stemmer.stem(w) for w in text_without_stopword]
    return txt_final


if __name__ == '__main__':
    path = 'E:\\Renato\\desktop-20230306\\resumes_corpus'
    extension = ".txt"
    label = 'resume'
    extension_2 = ".lab"
    label_2 = 'label'
    df = createDataFrame(path, extension, label, delimiter=True)
    df2 = createDataFrame(path, extension_2, label_2, delimiter=False)
    df2 = df2.groupby('filename').agg(lambda x: ' '.join(map(str, x))).reset_index()
    df2['label'] = df2['label'].str.split().str[0]
    df3 = df.merge(df2, on='filename', how='inner')

Loading : 100%|[31m██████████[0m| 29035/29035 [07:50<00:00, 61.71it/s]
Vazios:  0
Loading : 100%|[31m██████████[0m| 29035/29035 [02:04<00:00, 233.84it/s]
Vazios:  0


In [64]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29035 entries, 0 to 29034
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   resume    29035 non-null  object
 1   filename  29035 non-null  object
 2   label     29035 non-null  object
dtypes: object(3)
memory usage: 907.3+ KB


In [None]:
df3.head()

Unnamed: 0,resume,filename,label,Resume_nlp
0,"Database Administrator <span class=""hl"">Databa...",1,Database_Administrator,"['databas', 'administr', 'span', 'class', 'hl'..."
1,"Database Administrator <span class=""hl"">Databa...",2,Database_Administrator,"['databas', 'administr', 'span', 'class', 'hl'..."
2,Oracle Database Administrator Oracle <span cla...,3,Database_Administrator,"['oracl', 'databas', 'administr', 'oracl', 'sp..."
3,Amazon Redshift Administrator and ETL Develope...,4,Database_Administrator,"['amazon', 'redshift', 'administr', 'etl', 'de..."
4,Scrum Master Scrum Master Scrum Master Richmon...,5,Database_Administrator,"['scrum', 'master', 'scrum', 'master', 'scrum'..."


In [66]:
df3['label'].value_counts()

Software_Developer        5828
Systems_Administrator     4182
Project_manager           3527
Web_Developer             3466
Database_Administrator    2784
Java_Developer            2418
Python_Developer          2311
Network_Administrator     2260
Security_Analyst          2259
Name: label, dtype: int64

In [68]:
# get unique values of column A
unique_values = df3['label'].unique()

# create an empty dataframe to store the results
result_df = pd.DataFrame(columns=df3.columns)

# loop through the unique values and append 500 rows to the result dataframe for each value
for value in unique_values:
    temp_df = df3[df3['label'] == value].iloc[:500]
    result_df = result_df.append(temp_df, ignore_index=True)

  result_df = result_df.append(temp_df, ignore_index=True)


In [69]:
result_df['label'].value_counts()

Database_Administrator    500
Systems_Administrator     500
Project_manager           500
Software_Developer        500
Web_Developer             500
Security_Analyst          500
Network_Administrator     500
Java_Developer            500
Python_Developer          500
Name: label, dtype: int64

In [70]:
result_df['Resume_nlp'] = result_df['resume'].apply(lambda w: preprocessing(w)).astype(str)

In [71]:
# vectorize text data
vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=None)
X_vect = vectorizer.fit_transform(result_df['Resume_nlp']).toarray()
#conuntvectorizer_test = vectorizer.transform(X_test).astype(float)

new_df = pd.DataFrame(X_vect, columns=vectorizer.get_feature_names_out())

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(new_df, result_df['label'], test_size = 0.2)

"""# **Random Forest Classifier**"""

RF_Model = RandomForestClassifier(random_state=42, n_estimators= 500, max_depth=8, criterion='gini')
RF_Model.fit(X_train, Y_train)

prediction=RF_Model.predict(X_test)
prediction

print("training Score: {:.2f}".format(RF_Model.score(X_train, Y_train)))
print("test Score: {:.2f}".format(RF_Model.score(X_test, Y_test)))

print("model report: %s: \n %s\n" % (RF_Model, metrics.classification_report(Y_test, prediction)))

"""Balancear somente o conjunto de treinamento"""

Hilário Tomaz
19:28
hilariooliveira

training Score: 0.84
test Score: 0.71
model report: RandomForestClassifier(max_depth=8, n_estimators=500, random_state=42): 
                         precision    recall  f1-score   support

Database_Administrator       0.84      0.66      0.74       101
        Java_Developer       0.91      0.93      0.92        90
 Network_Administrator       0.56      0.77      0.65        92
       Project_manager       0.66      0.73      0.69       117
      Python_Developer       0.99      0.90      0.94       106
      Security_Analyst       0.78      0.75      0.77       106
    Software_Developer       0.77      0.33      0.46       101
 Systems_Administrator       0.61      0.47      0.53        98
         Web_Developer       0.51      0.89      0.64        89

              accuracy                           0.71       900
             macro avg       0.74      0.71      0.71       900
          weighted avg       0.74      0.71      0.71       900




'Balancear somente o conjunto de treinamento'