In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import spacy
import pickle

df = pd.read_csv('UpdatedResumeDataSet.csv')
df['Resume']=df['Resume'].str.replace(r'\n','')
df['Resume']=df['Resume'].str.replace(r'\r','')

In [9]:
nlp = spacy.load('en_core_web_lg')

def preprocess(text):
  doc = nlp(text)
  filtered_tokens = []
  for token in doc:
    if token.is_stop or token.is_punct or token.like_email or token.like_url:
      continue
    filtered_tokens.append(token.lemma_)
  return ' '.join(filtered_tokens).strip().lower()

df['Processed_Resume'] = df['Resume'].apply(preprocess)

In [13]:
le = LabelEncoder()
le.fit(df['Category'])
df['Category'] = le.transform(df['Category'])

tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(df['Processed_Resume'])
processed_text = tfidf.transform(df['Processed_Resume'])
X_train, X_test, y_train, y_test = train_test_split(processed_text, df['Category'], test_size=0.2, shuffle=True)

clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, average='weighted'))
print(recall_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='weighted'))

pickle.dump(tfidf, open('tfidf.pkl', 'wb'))
pickle.dump(clf, open('clf.pkl', 'wb'))

0.9481865284974094
0.9546755489760671
0.9481865284974094
0.9438657891631533


  _warn_prf(average, modifier, msg_start, len(result))


In [120]:
# Bag of Words upto 3-grams
clf = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1, 3))), ('knn', KNeighborsClassifier())])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, average='weighted'))
print(recall_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='weighted'))

0.9585492227979274
0.9788191083527871
0.9585492227979274
0.9604158469170742


In [122]:
df['resume_vec'] = df['Processed_Resume'].apply(lambda x: nlp(x).vector)

In [126]:
# Spacy word embeddings
X_train, X_test, y_train, y_test = train_test_split(df['resume_vec'], df['Category'], test_size=0.2, shuffle=True)
X_train_2d = np.stack(X_train)
X_test_2d =  np.stack(X_test)

clf = KNeighborsClassifier()
clf.fit(X_train_2d, y_train)
y_pred = clf.predict(X_test_2d)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred, average='weighted'))
print(recall_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='weighted'))

0.8704663212435233
0.8552437775235703
0.8704663212435233
0.8559849625627068
