# Experiment using 7 ML Algorithms

## Data Exploration

Loading data and importing necessary libararies

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('Resume.csv')
df = df[['Category','Resume_str']]
df['Category'] = df['Category'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Category    2484 non-null   category
 1   Resume_str  2484 non-null   object  
dtypes: category(1), object(1)
memory usage: 22.7+ KB


## Preprocessing

In [3]:
import string
def clean_text(series):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_texts = []

    for text in series:
        # Tokenization
        tokens = word_tokenize(text.lower())

        # Removing punctuation and numeric values
        no_punct_tokens = [token for token in tokens if token not in string.punctuation and not token.isnumeric()]

        # Removing stop words
        no_stopwords_tokens = [token for token in no_punct_tokens if token not in stop_words]

        # Lemmatization
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in no_stopwords_tokens]

        # Join tokens back into a string
        cleaned_text = ' '.join(lemmatized_tokens)
        cleaned_texts.append(cleaned_text)

    return cleaned_texts

In [4]:
df['Text'] = clean_text(df['Resume_str'])

### Dataset splitting

In [5]:
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_labels, test_labels = train_test_split(df['Resume_str'].values,
                                                                            df['Category'].values,
                                                                            test_size=0.1, # dedicate 10% of samples to validation set
                                                                            random_state=42)

### Making functions for model building, getting result and evaluate matrics

### MultinomialNB

(This section also include the building of MultinomialNB)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

In [7]:
multinomia_nb_score = model_0.score(test_sentences, test_labels)
print(f"Our baseline model achieves an accuracy of: {multinomia_nb_score*100:.2f}%")

Our baseline model achieves an accuracy of: 51.81%


In [8]:
def make_model(model):
    model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("choose_model",model ) # model the text
                ])
    model_0.fit(train_sentences, train_labels)
    return model_0

In [9]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [10]:
nb_preds = model_0.predict(test_sentences)

multinomia_nb__results = calculate_results(y_true=test_labels, y_pred=nb_preds)
multinomia_nb__results

{'accuracy': 51.80722891566265,
 'precision': 0.5393666574389466,
 'recall': 0.5180722891566265,
 'f1': 0.48272941812768694}

In [11]:
def make_result(model):
    predicts = model.predict(test_sentences)
    result = calculate_results(y_true=test_labels,y_pred=predicts)
    print(result)
    return result

In [12]:
make_result(make_model(MultinomialNB()))

{'accuracy': 51.80722891566265, 'precision': 0.5393666574389466, 'recall': 0.5180722891566265, 'f1': 0.48272941812768694}


{'accuracy': 51.80722891566265,
 'precision': 0.5393666574389466,
 'recall': 0.5180722891566265,
 'f1': 0.48272941812768694}

### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

make_result(make_model(LogisticRegression()))

{'accuracy': 65.06024096385542, 'precision': 0.6549401736148724, 'recall': 0.6506024096385542, 'f1': 0.6371566964386319}


{'accuracy': 65.06024096385542,
 'precision': 0.6549401736148724,
 'recall': 0.6506024096385542,
 'f1': 0.6371566964386319}

### Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

make_result(make_model(RandomForestClassifier()))

{'accuracy': 69.47791164658635, 'precision': 0.7120860574971135, 'recall': 0.6947791164658634, 'f1': 0.6782839481557515}


{'accuracy': 69.47791164658635,
 'precision': 0.7120860574971135,
 'recall': 0.6947791164658634,
 'f1': 0.6782839481557515}

### AdaBoost

In [15]:
#!pip freeze > requirements.txt

In [16]:
from sklearn.ensemble import AdaBoostClassifier

make_result(make_model(AdaBoostClassifier()))

{'accuracy': 19.67871485943775, 'precision': 0.16689525194796279, 'recall': 0.19678714859437751, 'f1': 0.16439785285544636}


{'accuracy': 19.67871485943775,
 'precision': 0.16689525194796279,
 'recall': 0.19678714859437751,
 'f1': 0.16439785285544636}

### Extra Trees 

In [17]:
from sklearn.ensemble import ExtraTreesClassifier

make_result(make_model(ExtraTreesClassifier()))

{'accuracy': 59.43775100401606, 'precision': 0.5812470479214535, 'recall': 0.5943775100401606, 'f1': 0.5686895726941013}


{'accuracy': 59.43775100401606,
 'precision': 0.5812470479214535,
 'recall': 0.5943775100401606,
 'f1': 0.5686895726941013}

### KNeighbors

In [18]:
from sklearn.neighbors import KNeighborsClassifier

make_result(make_model(KNeighborsClassifier(n_neighbors=24)))

{'accuracy': 57.42971887550201, 'precision': 0.6182341677103332, 'recall': 0.5742971887550201, 'f1': 0.5580096193024687}


{'accuracy': 57.42971887550201,
 'precision': 0.6182341677103332,
 'recall': 0.5742971887550201,
 'f1': 0.5580096193024687}

### Bagging (best model)

In [19]:
from sklearn.ensemble import BaggingClassifier

bagging_model = make_model(BaggingClassifier())
make_result(bagging_model)

{'accuracy': 70.68273092369478, 'precision': 0.7018700732556153, 'recall': 0.7068273092369478, 'f1': 0.6927787327294306}


{'accuracy': 70.68273092369478,
 'precision': 0.7018700732556153,
 'recall': 0.7068273092369478,
 'f1': 0.6927787327294306}

### Random Sampling

Model loading for random sampling

In [21]:
import pickle
with open('model/bagging_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [22]:
import random
choices_sample= random.choices(df.values.tolist(), k=10)
choices_sample[0][2]

"rn asst head nurse practice leader skill care planning case management home health hospice infection control injection nurse manager oncology scheduling staff development trauma triage tutoring urology experience 09/2010 12/2011 company name hired adon assistant director nursing bed long term care mentally physically disabled child year upper 's older individual job included staffing nurse monthly setting transportation outside md office visit resident monitoring nursing unit day day care documentation resident taking call needed twice month mod entire building conducted in-services nursing staff answered page day question oversee critical change resident helped decision transfer hospital speaking md.helped staff nursing needed secured home health position rescare located oak park illinois 2015. made home visit overseeing client overall health basic assessment listened new complaint change status would call client doctor needed inform change status gave injections/ infusion needed par

In [23]:
for sample in choices_sample:
    text = sample[2]
    prediction = loaded_model.predict([text])
    print(f"prediction: {prediction[0]} and original: {sample[0]} ")

prediction: ADVOCATE and original: ADVOCATE 
prediction: ARTS and original: ARTS 
prediction: INFORMATION-TECHNOLOGY and original: INFORMATION-TECHNOLOGY 
prediction: ACCOUNTANT and original: ACCOUNTANT 
prediction: AGRICULTURE and original: PUBLIC-RELATIONS 
prediction: AVIATION and original: AVIATION 
prediction: INFORMATION-TECHNOLOGY and original: ARTS 
prediction: ADVOCATE and original: ADVOCATE 
prediction: BANKING and original: FITNESS 
prediction: ARTS and original: ARTS 


Thank you