In [9]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [2]:
df = pd.read_csv('tech_jobs.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   advertiserurl             22000 non-null  object
 1   company                   21950 non-null  object
 2   employmenttype_jobstatus  21770 non-null  object
 3   jobdescription            22000 non-null  object
 4   jobid                     22000 non-null  object
 5   joblocation_address       21997 non-null  object
 6   jobtitle                  22000 non-null  object
 7   postdate                  22000 non-null  object
 8   shift                     21643 non-null  object
 9   site_name                 3490 non-null   object
 10  skills                    21957 non-null  object
 11  uniq_id                   22000 non-null  object
dtypes: object(12)
memory usage: 2.0+ MB


In [4]:
df.isna().sum()

advertiserurl                   0
company                        50
employmenttype_jobstatus      230
jobdescription                  0
jobid                           0
joblocation_address             3
jobtitle                        0
postdate                        0
shift                         357
site_name                   18510
skills                         43
uniq_id                         0
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
df.isna().sum()

advertiserurl               0
company                     0
employmenttype_jobstatus    0
jobdescription              0
jobid                       0
joblocation_address         0
jobtitle                    0
postdate                    0
shift                       0
site_name                   0
skills                      0
uniq_id                     0
dtype: int64

In [7]:
df['jobdescription'].head()

96     Responsible for managing one or more highly co...
97     Job Description: Seeking a Business Process An...
98     TAD PGS, INC. is currently seeking a DHMSM Ope...
99     Our client, one of the world's leading profess...
100    Our client, one of the largest banking and fin...
Name: jobdescription, dtype: object

In [10]:
tfidf = TfidfVectorizer()
df['jobdescription'] = df['jobdescription'].fillna('')
tfidf_matrix = tfidf.fit_transform(df['jobdescription'])
tfidf_matrix.shape

(3417, 34365)

In [11]:
cosine_sim = sigmoid_kernel(tfidf_matrix, tfidf_matrix)
indices = pd.Series(df.index, index= df['jobtitle']).drop_duplicates()

In [12]:
def get_recommendation(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    tech_indices = [i[0] for i in sim_scores]
    return df['jobtitle'].iloc[tech_indices]

In [17]:
get_recommendation('Information Security Engineer')

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [13]:
new_df = df[['jobtitle', 'jobdescription']]

In [14]:
new_df.to_csv('new_tech_jobs.csv')

In [16]:
pickle.dump(new_df,open('job_list.pkl','wb'))
pickle.dump(cosine_sim,open('similarity_new.pkl','wb'))