In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("../csv/data.csv", header=None)
df.columns = ('category', 'text', 'postid')

In [73]:
df.head()

Unnamed: 0,category,text,postid
0,general labor,FERRARA CANDY *** PACKAGING ASSISTANTS *** Dek...,7405032484
1,customer service,Office Customer Service Part - Time 24 hrs/wk ...,7405099747
2,resumes / job wanted,quality tech control (west burbs) Manufacturin...,7396174532
3,resumes / job wanted,Driver Job Wanted (Chicago) CDL-Class-B Lookin...,7400075704
4,transportation,$400/DAY Class A LOCAL/REGIONAL CDL Driver (Le...,7406024714


In [74]:
id = 1
seen = {}

for idx, row in df.iterrows():
    if row.category == 'resumes / job wanted':
        df.at[idx, 'category'] = 0
    else:
        if row.category in seen:
            df.at[idx, 'category'] = seen[row.category]
        else:
            df.at[idx, 'category'] = id
            seen[row.category] = id
            id += 1

In [75]:
seen

{'general labor': 1,
 'customer service': 2,
 'transportation': 3,
 'manufacturing': 4,
 'food/beverage/hospitality': 5,
 'skilled trades/artisan': 6,
 'et cetera': 7,
 'nonprofit': 8,
 'healthcare': 9,
 'admin/office': 10,
 'architect/engineer/cad': 11,
 'sales': 12,
 'technical support': 13,
 'legal/paralegal': 14,
 'real estate': 15,
 'accounting/finance': 16,
 'business/mgmt': 17,
 'education/teaching': 18,
 'retail/wholesale': 19,
 'human resource': 20,
 'software/qa/dba/etc': 21,
 'salon/spa/fitness': 22,
 'security': 23,
 'art/media/design': 24,
 'marketing/advertising/pr': 25,
 'web/html/info design': 26,
 'systems/networking': 27,
 'writing/editing': 28}

In [76]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [77]:
job_list = df[df["category"] != 0]
resume_list = df[df["category"] == 0]

In [78]:
processed_collection_r = []
processed_collection_j = []
lemmatizer = nltk.stem.WordNetLemmatizer()

for post in resume_list.text:
    tokens = nltk.word_tokenize(post)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    tokens = [token for token in tokens if not token in stopwords.words('english') if token.isalpha()]
    joins = " ".join(tokens)
    processed_collection_r.append(joins)

for post in job_list.text:
    tokens = nltk.word_tokenize(post)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    tokens = [token for token in tokens if not token in stopwords.words('english') if token.isalpha()]
    joins = " ".join(tokens)
    processed_collection_j.append(joins)
    

In [97]:
X = processed_collection_j
y = list(job_list.category)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)

In [98]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
# vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
vectorizer.fit(x_train)
x_train_m = vectorizer.transform(x_train)
x_test_m = vectorizer.transform(x_test)

In [99]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb = MultinomialNB()

nb.fit(x_train_m, y_train)
y_pred_nb = nb.predict(x_test_m)
acc_nb = accuracy_score(y_test, y_pred_nb)
print("Accuracy: {}%".format(round(acc_nb*100,2)))

Accuracy: 58.81%


In [100]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# svm = make_pipeline(StandardScaler(with_mean=False), LinearSVC(random_state=123))
svm = LinearSVC(random_state=123, max_iter=2000)

svm.fit(x_train_m, y_train)
y_pred_svm = svm.predict(x_test_m)
acc_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy: {}%".format(round(acc_svm*100, 2)))

SVM Accuracy: 78.93%


In [108]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import cross_val_score

print(classification_report(y_test, y_pred_svm))
scores = [~np.isnan(cross_val_score(svm, X, y, cv=2))]
print("CV Accuracy: {}%".format(round(scores[~np.isnan(scores)].mean()*100, 2)))

# cm = confusion_matrix(y_test, y_pred_svm)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)#, display_labels=svm.classes_)
# plt.rcParams['figure.figsize']=(400,400)
# disp.plot()
# plt.show()

              precision    recall  f1-score   support

           1       0.71      0.66      0.68       148
           2       0.21      0.18      0.19        17
           3       0.96      0.95      0.95       327
           4       0.69      0.48      0.56        23
           5       0.80      0.87      0.83       134
           6       0.56      0.76      0.65        46
           7       0.55      0.84      0.67        32
           8       1.00      0.50      0.67         2
           9       0.81      0.72      0.76        18
          10       0.59      0.67      0.62        15
          11       0.50      1.00      0.67         1
          12       0.65      0.65      0.65        20
          13       0.00      0.00      0.00         2
          14       0.83      0.71      0.77         7
          15       0.60      0.60      0.60         5
          16       1.00      0.67      0.80         6
          17       0.00      0.00      0.00         4
          18       0.75    

  _warn_prf(average, modifier, msg_start, len(result))
Traceback (most recent call last):
  File "/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/anaconda3/lib/python3.7/site-packages/sklearn/svm/_classes.py", line 229, in fit
    accept_large_sparse=False)
  File "/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 802, in check_X_y
    estimator=estimator)
  File "/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 598, in check_array
    array = np.asarr

In [83]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=75, bootstrap=True, random_state=123)

rf.fit(x_train_m, y_train)
y_pred_rf = rf.predict(x_test_m)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("RF Accuracy: {}%".format(round(acc_rf*100, 2)))

RF Accuracy: 70.71%


In [84]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(solver='sgd', hidden_layer_sizes=(50), random_state=123, max_iter=500)
nn.fit(x_train_m, y_train)
y_pred_nn = nn.predict(x_test_m)
acc_nn = accuracy_score(y_test, y_pred_nn)
print("NN Accuracy: {}%".format(round(acc_nn*100, 2)))

NN Accuracy: 43.45%




In [85]:
resume_vec = vectorizer.transform(processed_collection_r)
y_resume = svm.predict(resume_vec)

In [86]:
categories = {v: k for k, v in seen.items()}

In [87]:
resume_list['pred_category'] = y_resume
resume_list.pred_category = resume_list.pred_category.astype('object')
idx_v = {}
for idx, row in resume_list.iterrows():
    resume_list.at[idx, 'pred_category_text'] = categories[row.pred_category]
    post = row.text
    id = row.postid
    tokens = nltk.word_tokenize(post)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    tokens = [token for token in tokens if not token in stopwords.words('english') if token.isalpha()]
    joins = " ".join(tokens)
    v = vectorizer.transform([joins])
    if row.pred_category in idx_v:
        idx_v[row.pred_category].append((v,id))
    else:
        idx_v[row.pred_category] = [(v,id)]
        
jobs_idx_v = {}
for idx, row in job_list.iterrows():
    post = row.text
    id = row.postid
    tokens = nltk.word_tokenize(post)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    tokens = [token for token in tokens if not token in stopwords.words('english') if token.isalpha()]
    joins = " ".join(tokens)
    v = vectorizer.transform([joins])
    if row.category in jobs_idx_v:
        jobs_idx_v[row.category].append((v,id))
    else:
        jobs_idx_v[row.category] = [(v,id)]
    
resume_list.to_csv("predicted_categories.csv")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_

KeyboardInterrupt: 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
first_job = x_train_m[0]
first_job_y = y_train[0]
pred_resumes = pd.read_csv("predicted_categories.csv")


In [None]:
from sklearn.metrics import jaccard_score

jobs_matching_resumes = {}

for job_category in jobs_idx_v:
    jobs = jobs_idx_v[job_category][:3]
    for job_vtext, job_id in jobs:
        v = job_vtext.toarray()
        if job_category in idx_v:
            v_text = idx_v[job_category]
            top = None
            sims = []
            for text,id in v_text:
                sim = cosine_similarity([v[0,:]],[text.toarray()[0,:]])
#                 sim = jaccard_score(v[0,:],text.toarray()[0,:], average='weighted')
#                 print(sim)
                sims.append((sim,id))

            sims.sort(key=lambda x: x[0], reverse=True)
            jobs_matching_resumes[job_id] = sims[:3]
        

In [None]:
f = open('jobs_top_resumes.csv', 'w')
f.write('job_id,job_text,res1_id,res1_text,res2_id,res2_text,res3_id,res3_text\n')
for job_id in jobs_matching_resumes:
    job_text = list(df[df['postid'] == job_id].text)[0].replace(',','')
    f.write("{},{}".format(job_id, job_text))
    for resume in jobs_matching_resumes[job_id]:
        if resume[0] > 0.04:
            resume_id = resume[1]
            resume_text = list(df[df['postid'] == resume_id].text)[0].replace(',','')
            f.write(",{},{}".format(resume_id,resume_text))
    f.write('\n')
f.close()

In [None]:
resumes_matching_jobs = {}

for category in idx_v:
    potential_list = jobs_idx_v[category]
    for resume_vtext, resume_id in idx_v[category]:
        resume_vtext = resume_vtext.toarray()
        
        sims = []
        for job_vtext, job_id in potential_list:
            job_vtext = job_vtext.toarray()
            
            sim = cosine_similarity([resume_vtext[0,:]],[job_vtext[0,:]])
            sims.append((sim[0][0],job_id))
            
        sims.sort(key=lambda x: x[0], reverse=True)
        resumes_matching_jobs[resume_id] = sims[:3]



In [None]:
f = open('resume_top_jobs.csv', 'w')
f.write('res_id,res_text,job1_id,job1_text,job2_id,job2_text,job3_id,job3_text\n')
for res_id in resumes_matching_jobs:
    res_text = list(df[df['postid'] == res_id].text)[0].replace(',','')
    f.write("{},{}".format(res_id, res_text))
    for job in resumes_matching_jobs[res_id]:
        if job[0] > 0.04:
            job_id = job[1]
            job_text = list(df[df['postid'] == job_id].text)[0].replace(',','')
            f.write(",{},{}".format(job_id,job_text))
    f.write('\n')
f.close()