In [2]:
import pandas as pd
import fasttext as ft
import faiss
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/CS/pvp0001/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/CS/pvp0001/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
dir_name = "/raid/Praveen_Stuff/Eluvio_DS_Challenge.csv"

In [4]:
df = pd.read_csv(dir_name)
ids = list()
length = df.shape[0]
x = 1
for i in range(length):
    ids.append(x)
    x+=1
df['unique_id'] = ids

In [5]:
ids_path = "idx_processed_appended.txt"
titles_path = "title_processed_appended.txt"

In [6]:
# Train fastText Model
embedding_dim = 100
window_size = 5
min_word_count = 5
EPOCH = 25
model_name = "ft_model_em100_ws5_mwc5_ep25_preprocessed_appended.bin"
# Uncomment,if you want to train the fast text model
# ft_model = ft.train_unsupervised(input=titles_path, dim=embedding_dim, epoch=EPOCH, model="skipgram")
# Uncomment,if you want to save the fast text model
# ft_model.save_model(model_name)

In [7]:
#Load the fastText model
ft_model = ft.load_model(model_name)



In [8]:
#Get the title and other information which was stored in text file after preprocessing
desc_file = open(titles_path, 'r')
id_file = open(ids_path, 'r')
desc_Lines = desc_file.readlines()
id_lines = id_file.readlines()

In [9]:
#Create the embeddings using fastText trained model
job_id_list = list()
embedding_list = list()
i = 0
for job_id,job_desc in zip(id_lines,desc_Lines):
    embedding = ft_model.get_sentence_vector(job_desc.rstrip())
    #embedding_dict[int(job_id)] = embedding
    job_id_list.append(int(job_id))
    embedding_list.append(embedding)
    i+=1

In [10]:
# Converting to numpy arrays for easy of use
embedding_list = np.asarray(embedding_list)

In [22]:
def recommend(job_ids, embeddings, target_embedding, num_recs, vec_dim):
    '''
    Uses facebook FAISS library to perform L2 similarity search with 100-d vector embeddings
    
    job_ids: list of jobs_ids, whose indices correspond to embedding indices
    embeddings: numpy array of 100-d vectors, whose indices correspond to the indices of job_ids
    target_ids: list of target ids
    num_recs: the number of recommendations to make
    
    returns: job dictionary with key = job id and value = embedding
    '''
    target_embeddings = list()
    target_embeddings.append(target_embedding)
    print(target_embeddings)
    target_embeddings = np.asarray(target_embeddings)
    index = faiss.IndexFlatL2(vec_dim)
    index.add(embeddings)
    D, I = index.search(target_embeddings, num_recs)
    print("Finding the nearest neighbor of the following query: ")
    similar_jobs = {}
    #index is a list containing indexes corresponding to number of reccomendations requested
    for index in I:
        for element in index:
            #print("\n\nA near neighbor is at index: ")
            #print(element)
            #print("And the element at this index in the db is: ")
            #print(embeddings[element])
            similar_jobs.update({job_ids[element]: embeddings[element]})
            
    print(type(similar_jobs))
    return similar_jobs

In [None]:
# Enter the keywords to give the recommendation
keyword = "Scores killed in Pakistan clashes"

In [23]:
#Tokenize String
tokenized_words = word_tokenize(keyword)
stop_words = set(stopwords.words('english'))
#Remove Stop words
filtered_keyword = list()
for w in tokenized_words:
    if w not in stop_words: 
        filtered_keyword.append(w)
#Stemming
porter = PorterStemmer()
stem_keyword = list()
for word in filtered_keyword:
    stem_keyword.append(porter.stem(word))
    stem_keyword.append(" ")
stem_keyword = "".join(stem_keyword)
# Create the embedding of the vector
target_embedding = ft_model.get_sentence_vector(stem_keyword)
similar_job_embeddings = recommend(job_id_list, embedding_list, target_embedding, 20, 100)

[array([ 0.04412896,  0.01168365, -0.11756157, -0.04418303,  0.00869185,
       -0.05140894, -0.08847402,  0.00497872,  0.07584609,  0.04833292,
       -0.09222653, -0.14014985,  0.08008041, -0.03641176, -0.08736008,
        0.01132878, -0.05632193,  0.04506412,  0.04243913, -0.03035854,
        0.03725669,  0.00509   ,  0.09948729,  0.09680115,  0.06472278,
       -0.08910923, -0.05175205,  0.12667517,  0.00504389,  0.05516104,
       -0.07658587, -0.12319257,  0.0264945 , -0.04150046, -0.00325571,
        0.15404409, -0.03505963, -0.08807554, -0.05926043, -0.00561597,
       -0.08183727,  0.07846905,  0.08858684,  0.04709188,  0.10663982,
        0.03738658,  0.00309123,  0.06106735, -0.12260303, -0.11214973,
        0.00052229,  0.02534869,  0.14247674, -0.04385375, -0.04842799,
       -0.05981544, -0.01091424, -0.03362558, -0.10822722, -0.10855742,
        0.06873065,  0.0324939 , -0.01225883,  0.07589256,  0.11653968,
        0.14609708, -0.03977957,  0.0204637 ,  0.06170549, -0.0

In [31]:
def get_rec_desc(similar_dict, column_name, df):
    '''
    Finds the recommendation based on similar_job_emebeddings 
    
    similar_dict : Similar job embeddings
    column_name : the name of the column in the dataframe where unique id matches
    df: the dataframe of our work
    returns : target dataframe and recommendation dataframe
    '''
    final_df = pd.DataFrame()
    for target in similar_dict.keys():
        target_df = df.loc[df[column_name] == target]
        if(final_df.empty):
            final_df = df.loc[df[column_name] == target]
        else:
            final_df = pd.concat([final_df, df.loc[df[column_name] == target]])
    # Sort based on the number of up_votes
    final_df = final_df.sort_values(by=['up_votes'], ascending=False)        
    return target_df, final_df

In [32]:
target_df, recs_df = get_rec_desc(similar_job_embeddings, "unique_id", df)

In [33]:
print(recs_df)

        time_created date_created  up_votes  down_votes  \
329238    1418793563   2014-12-17       310           0   
98185     1314462605   2011-08-27       143           0   
361851    1429368168   2015-04-18        52           0   
475784    1468068204   2016-07-09        33           0   
289381    1405631866   2014-07-17        22           0   
318927    1414938530   2014-11-02        16           0   
115538    1330435758   2012-02-28        14           0   
82864     1301849212   2011-04-03        13           0   
261697    1394886754   2014-03-15        11           0   
68634     1291631778   2010-12-06         8           0   
47630     1262361411   2010-01-01         7           0   
88084     1306591651   2011-05-28         7           0   
318940    1414943261   2014-11-02         5           0   
35974     1244405962   2009-06-07         5           0   
100452    1316550508   2011-09-20         4           0   
100410    1316531337   2011-09-20         4           0 