# Recommender

In [162]:
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import DanishStemmer
from textblob import TextBlob
import lemmy
from sklearn.metrics import pairwise_kernels
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sentence_transformers import SentenceTransformer, util
import tensorflow as tf

embedder = SentenceTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

In [43]:
dataset_name = 'jobindex_cropped_bigger'

In [331]:
base = os.path.abspath('../')

def outname(name):
    return os.path.basename(dataset_name).split('.')[0] + name
    
outname_tfidf = outname('_distances_tfidf.csv')
outname_bert = outname('_distances_bert.csv')
outname_df = outname('_preprocessed_df.csv')

print('Loading TFIDF distances')
df = pd.read_csv(f'{base}/data/processed/{outname_df}', index_col=0)

print('Loading BERT distances')
tfidf = pd.read_csv( f'{base}/data/processed/{outname_tfidf}', index_col=0)

print('Loading Processed Dataframe')
## It is not in Tensor format, but rather numpy array
bert = pd.read_csv(f'{base}/data/processed/{outname_bert}', index_col=0)

print('\n All datasets loaded.')

Loading TFIDF distances
Loading BERT distances
Loading Processed Dataframe

 All datasets loaded.


In [367]:
tfidf = tfidf.reset_index()
bert = bert.reset_index()

In [404]:
## notice the corpus and title_processed
# df['corpus'][1]
df['title_processed'][1]


'ørnered lystrup-elsted dagtilbud søg pædagog børnehav aarhus kommun lystrup 2020-02-27'

In [408]:
def preprocess_text(text):
    # text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = str(text).lower().strip()

    # caveat: this might conflict with the english text
    da_stop_words = stopwords.words('danish')
    stemmer = DanishStemmer()
    lemmatizer = lemmy.load("da")

    # remove plurals
    textblob = TextBlob(text)
    singles = [stemmer.stem(word) for word in textblob.words]

    # remove danish stopwords
    no_stop_words = [word for word in singles if word not in da_stop_words]

    # join text so it can be lemmatized
    joined_text = " ".join(no_stop_words)

    # lemmatization
    final_text = lemmatizer.lemmatize("", joined_text)

    return final_text[0]


## Direct results

In [481]:
def find_direct_results(search_query):
    matching_entries = [df['title_processed'].index[df['title_processed'].str.contains(word, case=False)]
                        .values for word in search_query.split()]
    return list(set(matching_entries[0]).intersection(*matching_entries))

def print_direct_search_results(direct_results):
    print('\nDIRECT RESULTS:', len(direct_results))
    for index, result in enumerate(direct_results):
        print(result, '-',  df['title'][result])
        agg_direct_results_indexes.append(result)

## Baseline model: TFIDF

In [482]:
def recommendations(data, results):
    sorted_results = []
    recommendation_indexes = []
    for result in results:
#         print(result)
        sorted_distances = data[str(result)].sort_values().iteritems()
        
        for item_index, item_value in enumerate(sorted_distances):
            index, distance = item_value
            sorted_results.append({
                'item_index': index,
                'distance': distance,
                'title': df['title'][index]
            })
            
    agg_sorted_indexes = [x['item_index'] for x in sorted_results]
    recommendation_indexes = [x for x in agg_sorted_indexes if x not in results]
        
    return recommendation_indexes


    
# remove direct results from tfidf_results
tdfidf_recommendation_indexes = [x for x in agg_tfidf_indexes if x not in agg_direct_results_indexes]

print ('\n\n BERT')
for recommendation in recommendations(data=bert, results=direct_results)[:k]:
    print(recommendation, df['title'][recommendation])
    
print ('\n\n TFIDF')
for recommendation in recommendations(data=tfidf, results=direct_results)[:k]:
    print(recommendation, df['title'][recommendation])
    




 BERT
9490 Ambulatoriesygeplejerske søges til Affektivt Team, Børne- og Ungdomspsykiatri Odense
3433 Reservelæge søges til en vagtfri stilling i voksenteamet, Psykiatrisk Klinik for Spiseforstyrrelser, Børne- og Ungdomspsykiatrisk Afdeling, Region Midt
2637 Serviceminded sekretær til psykiatrisk akutmodtagelse for børn og voksne på Psykiatrisk Center Glostrup
1948 Specialbørnehjemmet Muslingen søger en omsorgs- og pædagogmedhjælper som fast vågen nattevagt til børn og unge med fysiske og psykiske funktionsnedsættelser
5368 Lægesekretær til Børne- og Ungdomspsykiatrisk Afdeling, Skejby


 TFIDF
5368 Lægesekretær til Børne- og Ungdomspsykiatrisk Afdeling, Skejby
3433 Reservelæge søges til en vagtfri stilling i voksenteamet, Psykiatrisk Klinik for Spiseforstyrrelser, Børne- og Ungdomspsykiatrisk Afdeling, Region Midt
1347 Sygeplejersker - Medicinsk Afdeling - Geriatrisk sengeafsnit, Hospitalsenheden Vest, Herning
1553 Sygeplejerske
6329 Introduktionslægestilling, Anæstesiologisk Afdelin

# Results

In [485]:
# results with some cities are wrong (ex, after preprocessing, Odense becomes Ode)

def display_results(data, name, k):
    print (f'\n\n{name}')
    for index, recommendation in enumerate(recommendations(data=data, results=direct_results)[:k]):
        print(recommendation, df['title'][recommendation])
    

def get_recommendations(query, k):
    direct_results = find_direct_results(preprocess_text(query))
    print_direct_search_results(direct_results)
    
    
    display_results(data=tfidf, name='TFIDF', k=k)
    display_results(data=bert, name='BERT', k=k)
        

get_recommendations(query='Lægesekretær Odense', k=5)



DIRECT RESULTS: 21
4640 - Onkologisk Afdeling R, Odense Universitetshospital OUH - søger lægesekretær
4641 - Onkologisk Afdeling R, Odense Universitetshospital OUH - søger lægesekretær
4642 - Onkologisk Afdeling R, Odense Universitetshospital OUH - søger lægesekretær
4643 - Onkologisk Afdeling R, Odense Universitetshospital OUH - søger lægesekretær
5335 - Lægesekretær til fast stilling - Øjenafdeling E
3544 - Rehabiliteringsafdelingen OUH Svendborg Sygehus søger vikar for lægesekretær
5336 - Lægesekretær til fast stilling - Øjenafdeling E
5338 - Lægesekretær til Urinvejskirurgisk afdeling L
5339 - Lægesekretær til Urinvejskirurgisk afdeling L
5848 - Koordinator/lægesekretær til Psykiatrisk Afdeling Odense, Sengeafsnit P50V og Ø
5849 - Koordinator/lægesekretær til Psykiatrisk Afdeling Odense, Sengeafsnit P50V og Ø
5348 - Lægesekretær til Ortopædkirurgisk Afdeling O - OUH Odense
5349 - Lægesekretær til Ortopædkirurgisk Afdeling O - OUH Odense
5350 - Lægesekretær til Neurokirurgisk Afdel

In [343]:
agg_tfidf_results = []
agg_tfidf_indexes = []
flat_distances = []

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['corpus'])

print(direct_results)

# depends on direct results
for result_index, result in enumerate(direct_results):
    # - performance tweak: instead of pairing the entire matrix,
    # only pair the actual direct result. 
    # - performance tweak: use n_jobs=-1 to use all the CPU cores available.
    distances = pairwise_kernels(X, X[result], n_jobs=-1, metric='cosine')
    flat_distances = np.array(distances).flatten()
    
    for index, distance in enumerate(flat_distances):       
        agg_tfidf_results.append({
        'distance': float(distance),
        'index': index,
        'result_title': df['title'][index],
        'description': df['description'][index]
        })
        
# sort by distance       
sorted_tfidf = sorted(agg_tfidf_results, key=lambda k: k['distance'], reverse=True)[:5]

# print(sorted_tfidf)

# aggregate sorted indexes
# for item in sorted_tfidf:
#     print(item['result_title'])
#     print(item['index'])
#     print(item['distance'])
# #     agg_tfidf_indexes.append(item['index'])

agg_tfidf_indexes = [x['index'] for x in sorted_tfidf]
    
# remove direct results from tfidf_results
tdfidf_recommendation_indexes = [x for x in agg_tfidf_indexes if x not in agg_direct_results_indexes]


for index, value in enumerate(tdfidf_recommendation_indexes):
    print(index, '-',  df['title'][value])
    print('similarity:', flat_distances[value])

# [print(x) for x in sorted_tfidf]

[8717]
0 - Praktikant
similarity: 0.43861383739694293
1 - Frivillig søges til Lærings- og lektiecaféen Sofiendal
similarity: 0.3952099886799837
2 - Lektiecaféen på Blågårdens bibliotek på Nørrebro søger en nye frivillige
similarity: 0.3687287446862976
3 - Ungdomsleder
similarity: 0.36863721091034357


In [486]:
query_embedding_bert = embedder.encode(search_query, convert_to_tensor=True)

import torch
# px = pd.DataFrame(query_embedding_bert.numpy())

# px.head()

stuff_x = torch.save(embedder.encode(df['corpus'], convert_to_tensor=True), 'file2.pt')
stuff_y = torch.load('file2.pt')

util.pytorch_cos_sim(query_embedding_bert, stuff_y)[0]

tensor([0.7842, 0.7597, 0.6783,  ..., 0.6991, 0.7129, 0.8068])

In [488]:
type(stuff_y)
stuff_z = torch.load('file2.pt')

In [206]:
query_embedding_bert = embedder.encode(search_query, convert_to_tensor=False)

util.pytorch_cos_sim(query_embedding_bert, bert.to_numpy())[0]

RuntimeError: Expected object of scalar type Float but got scalar type Double for argument #3 'mat2' in call to _th_addmm_out

In [215]:
title_encodding = embedder.encode(df['title_processed'][:100])

In [223]:
type(title_encodding)

numpy.ndarray

In [236]:
tfidf.reset_index().head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.110223e-16,0.985694,0.99801,0.944209,0.969883,0.926449,0.914511,0.980756,0.968036,0.948265,...,0.945846,0.996605,0.958133,0.988702,0.995991,0.997833,0.997683,0.999298,0.99527,0.973767
1,0.9856935,0.0,0.998954,0.996815,0.985291,0.98257,0.996121,0.934484,0.939784,0.994898,...,0.979247,0.998215,0.99152,0.994136,0.991894,0.999197,0.998782,0.999631,0.990436,0.99378
2,0.9980099,0.998954,0.0,0.998417,0.998179,0.996164,0.998788,0.998062,0.998843,0.998406,...,0.997004,0.995851,0.998667,0.99908,0.997468,0.997032,0.997169,0.999142,0.997013,0.997613
3,0.9442089,0.996815,0.998417,0.0,0.963142,0.875405,0.970257,0.979621,0.925585,0.962968,...,0.959437,0.9973,0.973085,0.996138,0.996988,0.98585,0.998157,0.999442,0.996446,0.981257
4,0.9698833,0.985291,0.998179,0.963142,0.0,0.886379,0.996703,0.991449,0.986418,0.990128,...,0.997757,0.996894,0.999002,0.999311,0.993111,0.997778,0.99788,0.999358,0.991872,0.927339


In [362]:
for item in tfidf['4'].sort_values().head().iteritems():
    print(item)

(0.9859759941119464, 0.0)
(0.972418363557518, 0.7825049292146624)
(0.9873618983766532, 0.8177851578058495)
(0.931070415295759, 0.8201493512996451)
(0.9904168478008406, 0.8249228435040399)


In [268]:
tfidf = tfidf.reset_index()
bert = tfidf.reset_index()

8717
4045
0.0 8711 Call centermedarbejder - 28 timer - primært dag og aften
0.7420769978446451 2445 Social og- sundhedsassistent til D5 Sydvestjysk Sygehus, Esbjerg
0.7485641007115025 4050 Psykiatrisk Afdeling Esbjerg afsnit E, social- og sundhedsassistent, fast stilling.
0.7499849640317027 6735 Handicaphjælper - Esbjerg V
0.7558654560380074 4488 Peer-medarbejder søges til Lokalpsykiatri Varde, Psykiatrisk Afdeling Esbjerg


[None, None, None, None, None]

In [361]:
tfidf_agg_distances = []

print('stuff', tfidf[str(8717)].sort_values())

for item in direct_results:
    print(item)
#     print(df['title'][item])
    for value in tfidf[str(item)].sort_values().iteritems():
        
        # break
        tfidf_agg_distances.append({
            'index:': item,
            'distance': value
        })

print(tfidf_agg_distances[:10])
# for value in  tfidf[str(4045)].sort_values().head().iteritems():
#     ix, dist = value
#     print(df['title'][ix])

stuff 0
0.984755    0.000000
0.991554    0.561386
0.983282    0.604790
0.984402    0.631271
0.992901    0.631363
              ...   
0.989780    0.998822
0.997971    0.998829
0.997983    0.998836
0.998058    0.998880
0.998146    0.998930
Name: 8717, Length: 10000, dtype: float64
1349
3433
1548
7663
7664
5368
[{'index:': 1349, 'distance': (0.9729187908918416, 0.0)}, {'index:': 1349, 'distance': (0.976080194114999, 0.13524105628781802)}, {'index:': 1349, 'distance': (0.994599003592646, 0.20184004721366253)}, {'index:': 1349, 'distance': (0.995500100616644, 0.2649130464782875)}, {'index:': 1349, 'distance': (0.99280050227444, 0.4324744244879083)}, {'index:': 1349, 'distance': (0.931070415295759, 0.5707951257806185)}, {'index:': 1349, 'distance': (0.9855324104392258, 0.5934591765523491)}, {'index:': 1349, 'distance': (0.9700067801112836, 0.6006131795106848)}, {'index:': 1349, 'distance': (0.9688063474315912, 0.6260314475580637)}, {'index:': 1349, 'distance': (0.9873618983766532, 0.6277139

In [249]:
df['title'][4050]
df['title'][6735]
df['title'][4488]

'Peer-medarbejder søges til Lokalpsykiatri Varde, Psykiatrisk Afdeling Esbjerg'

In [264]:
bert.reset_index()['8711'].sort_values(ascending=True)

8711    2.980232e-07
7290    3.115213e-02
7587    3.682375e-02
2998    3.785247e-02
3401    4.086298e-02
            ...     
197     2.356173e-01
4046    2.389286e-01
3753    2.422984e-01
4034    2.478661e-01
5204    2.630115e-01
Name: 8711, Length: 10000, dtype: float64

In [261]:
df['title'][197]

'Western guitar'

[3, 1, 2, 0]

In [439]:
obj = [{'name': 'John', 'age': 45}, {'name': 'Jane', 'age': 39}]

sorted_obj = sorted(obj, key=lambda x: x['age'], reverse=False)

sorted_obj

[{'name': 'Jane', 'age': 39}, {'name': 'John', 'age': 45}]