In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("../csv/data.csv", header=None)
df.columns = ('category', 'text', 'postid')

In [2]:
df.head()

Unnamed: 0,category,text,postid
0,general labor,FERRARA CANDY *** PACKAGING ASSISTANTS *** Dek...,7405032484
1,customer service,Office Customer Service Part - Time 24 hrs/wk ...,7405099747
2,resumes / job wanted,quality tech control (west burbs) Manufacturin...,7396174532
3,resumes / job wanted,Driver Job Wanted (Chicago) CDL-Class-B Lookin...,7400075704
4,transportation,$400/DAY Class A LOCAL/REGIONAL CDL Driver (Le...,7406024714


In [3]:
id = 1
seen = {}

for idx, row in df.iterrows():
    if row.category == 'resumes / job wanted':
        df.at[idx, 'category'] = 0
    else:
        if row.category in seen:
            df.at[idx, 'category'] = seen[row.category]
        else:
            df.at[idx, 'category'] = id
            seen[row.category] = id
            id += 1

In [4]:
seen

{'general labor': 1,
 'customer service': 2,
 'transportation': 3,
 'manufacturing': 4,
 'food/beverage/hospitality': 5,
 'skilled trades/artisan': 6,
 'et cetera': 7,
 'nonprofit': 8,
 'healthcare': 9,
 'admin/office': 10,
 'architect/engineer/cad': 11,
 'sales': 12,
 'technical support': 13,
 'legal/paralegal': 14,
 'real estate': 15,
 'accounting/finance': 16,
 'business/mgmt': 17,
 'education/teaching': 18,
 'retail/wholesale': 19,
 'human resource': 20,
 'software/qa/dba/etc': 21,
 'salon/spa/fitness': 22,
 'security': 23,
 'art/media/design': 24,
 'marketing/advertising/pr': 25,
 'web/html/info design': 26,
 'systems/networking': 27,
 'writing/editing': 28}

In [43]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [6]:
job_list = df[df["category"] != 0]
resume_list = df[df["category"] == 0]

In [7]:
processed_collection_r = []
processed_collection_j = []
lemmatizer = nltk.stem.WordNetLemmatizer()

for post in resume_list.text:
    tokens = nltk.word_tokenize(post)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    tokens = [token for token in tokens if not token in stopwords.words('english') if token.isalpha()]
    joins = " ".join(tokens)
    processed_collection_r.append(joins)

for post in job_list.text:
    tokens = nltk.word_tokenize(post)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    tokens = [token for token in tokens if not token in stopwords.words('english') if token.isalpha()]
    joins = " ".join(tokens)
    processed_collection_j.append(joins)
    

In [8]:
X = processed_collection_j
y = list(job_list.category)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [44]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
# vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
vectorizer.fit(x_train)
x_train_m = vectorizer.transform(x_train)
x_test_m = vectorizer.transform(x_test)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb = MultinomialNB()

nb.fit(x_train_m, y_train)
y_pred_nb = nb.predict(x_test_m)
acc_nb = accuracy_score(y_test, y_pred_nb)
print("Accuracy: {}%".format(round(acc_nb*100,2)))

Accuracy: 57.86%


In [None]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# svm = make_pipeline(StandardScaler(with_mean=False), LinearSVC(random_state=123))
svm = LinearSVC(random_state=123, max_iter=2000)

svm.fit(x_train_m, y_train)
y_pred_svm = svm.predict(x_test_m)
acc_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy: {}%".format(round(acc_svm*100, 2)))

In [13]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=75, bootstrap=True, random_state=123)

rf.fit(x_train_m, y_train)
y_pred_rf = rf.predict(x_test_m)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("RF Accuracy: {}%".format(round(acc_rf*100, 2)))

RF Accuracy: 70.71%


In [67]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(solver='sgd', hidden_layer_sizes=(50), random_state=123)
nn.fit(x_train_m, y_train)
y_pred_nn = nn.predict(x_test_m)
acc_nn = accuracy_score(y_test, y_pred_nn)
print("NN Accuracy: {}%".format(round(acc_nn*100, 2)))

NN Accuracy: 77.38%




In [46]:
resume_vec = vectorizer.transform(processed_collection_r)
y_resume = svm.predict(resume_vec)

In [47]:
categories = {v: k for k, v in seen.items()}

In [48]:
resume_list['pred_category'] = y_resume
resume_list.pred_category = resume_list.pred_category.astype('object')
idx_v = {}
for idx, row in resume_list.iterrows():
    resume_list.at[idx, 'pred_category_text'] = categories[row.pred_category]
    post = row.text
    id = row.postid
    tokens = nltk.word_tokenize(post)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    tokens = [token for token in tokens if not token in stopwords.words('english') if token.isalpha()]
    joins = " ".join(tokens)
    v = vectorizer.transform([joins])
    if row.pred_category in idx_v:
        idx_v[row.pred_category].append((v,id))
    else:
        idx_v[row.pred_category] = [(v,id)]
        
jobs_idx_v = {}
for idx, row in job_list.iterrows():
    post = row.text
    id = row.postid
    tokens = nltk.word_tokenize(post)
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    tokens = [token for token in tokens if not token in stopwords.words('english') if token.isalpha()]
    joins = " ".join(tokens)
    v = vectorizer.transform([joins])
    if row.category in jobs_idx_v:
        jobs_idx_v[row.category].append((v,id))
    else:
        jobs_idx_v[row.category] = [(v,id)]
    
resume_list.to_csv("predicted_categories.csv")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [49]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
first_job = x_train_m[0]
first_job_y = y_train[0]
pred_resumes = pd.read_csv("predicted_categories.csv")


In [62]:
from sklearn.metrics import jaccard_score

jobs_matching_resumes = {}

for job_category in jobs_idx_v:
    jobs = jobs_idx_v[job_category][:3]
    for job_vtext, job_id in jobs:
        v = job_vtext.toarray()
        if job_category in idx_v:
            v_text = idx_v[job_category]
            top = None
            sims = []
            for text,id in v_text:
                sim = cosine_similarity([v[0,:]],[text.toarray()[0,:]])
#                 sim = jaccard_score(v[0,:],text.toarray()[0,:], average='weighted')
#                 print(sim)
                sims.append((sim,id))

            sims.sort(key=lambda x: x[0], reverse=True)
            jobs_matching_resumes[job_id] = sims[:3]
        

0.9975265435261506
0.9976743418004325
0.9973747156881437
0.9976428497841948
0.9975751236820474
0.997323930709795
0.9973997252643003
0.9976262035235667
0.9973143339633967
0.9972407928494514
0.9973510385986425
0.997496638065558
0.9976049290675949
0.996799027706975
0.9972199269350499
0.9971576752893685
0.9973271275279247
0.9975265435261506
0.9975223805503406
0.9972573782644238
0.9972174708893838
0.997307189957281
0.9968789048855116
0.9971940308394345
0.9972075187185384
0.9974742149304224
0.9973470955415568
0.9975831715042603
0.997323930709795
0.9974403525976462
0.9974403525976462
0.9973941374529969
0.9975709599965645
0.9975448151331086
0.9976133799058291
0.9973446513631999
0.9975113830971521
0.9973339591701089
0.9972907155610488
0.9972738532670756
0.997089776745283
0.9968391642331776
0.997566404441215
0.9973440980979065
0.9974343686403331
0.9956359578945968
0.9976853537479748
0.9975165750392864
0.9974667326049654
0.9959393982112519
0.9971803698195907
0.9973842235945634
0.9975145599433528


In [61]:
f = open('jobs_top_resumes.csv', 'w')
f.write('job_id,job_text,res1_id,res1_text,res2_id,res2_text,res3_id,res3_text\n')
for job_id in jobs_matching_resumes:
    job_text = list(df[df['postid'] == job_id].text)[0].replace(',','')
    f.write("{},{}".format(job_id, job_text))
    for resume in jobs_matching_resumes[job_id]:
        if resume[0] > 0.04:
            resume_id = resume[1]
            resume_text = list(df[df['postid'] == resume_id].text)[0].replace(',','')
            f.write(",{},{}".format(resume_id,resume_text))
    f.write('\n')
f.close()

In [31]:
resumes_matching_jobs = {}

for category in idx_v:
    potential_list = jobs_idx_v[category]
    for resume_vtext, resume_id in idx_v[category]:
        resume_vtext = resume_vtext.toarray()
        
        sims = []
        for job_vtext, job_id in potential_list:
            job_vtext = job_vtext.toarray()
            
            sim = cosine_similarity([resume_vtext[0,:]],[job_vtext[0,:]])
            sims.append((sim[0][0],job_id))
            
        sims.sort(key=lambda x: x[0], reverse=True)
        resumes_matching_jobs[resume_id] = sims[:3]



In [33]:
f = open('resume_top_jobs.csv', 'w')
f.write('res_id,res_text,job1_id,job1_text,job2_id,job2_text,job3_id,job3_text\n')
for res_id in resumes_matching_jobs:
    res_text = list(df[df['postid'] == res_id].text)[0].replace(',','')
    f.write("{},{}".format(res_id, res_text))
    for job in resumes_matching_jobs[res_id]:
        if job[0] > 0.04:
            job_id = job[1]
            job_text = list(df[df['postid'] == job_id].text)[0].replace(',','')
            f.write(",{},{}".format(job_id,job_text))
    f.write('\n')
f.close()