[ ] Campo vs campo

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import scipy

In [2]:
def read_file():
    df = pd.read_csv("../../../data/processed/politicas_lema.csv")
    df['index'] = range(len(df))
    return df

def bow(df):

    df['Bag_of_words'] = ''
    columns = ['problema_lema', 'solucion_lema', 'meta_lema', 'resultado_lema']

    list_words = []
    for index, row in df.iterrows():
        words = ''
        for col in columns:
            words += words + ' ' + row[col]
        list_words.append(words)
    df['Bag_of_words'] = list_words

    return df[['index', 'titulo_caso','Bag_of_words', 'problema_lema', 'solucion_lema', 'meta_lema', 'resultado_lema', 'resumen_lema']]

def compute_vectors(df, target):
    count_binary = CountVectorizer(binary=True)
    count_binary_matrix = count_binary.fit_transform(df[target])
    
    count = CountVectorizer()
    count_matrix = count.fit_transform(df[target])

    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df[target])
    
    tfidf_uni_bi = TfidfVectorizer(ngram_range=(1,2))
    tfidf_matrix_uni_bi = tfidf.fit_transform(df[target])
    
    vect = [count_binary_matrix, count_matrix, tfidf_matrix, tfidf_matrix_uni_bi]
    
    return vect

def recommend(df, vect, title, index_var, title_list, metric = 'cosine_sim', alpha = 1, k_primeros=3):
    recommended = []
    indices = pd.Series(df[index_var])
    idx = indices[indices == title].index[0]
    
    matrix = []
    similarity_threshold = 0
    if metric == 'cosine_sim':
        matrix = cosine_similarity(vect, vect)
        matrix_arr = matrix[idx]
        matrix_arr = np.delete(matrix_arr, idx)
        similarity_threshold = np.median(matrix_arr) + alpha * matrix_arr.std()
        
    elif metric == 'euclidean':
        matrix = pairwise_distances(vect, vect, metric='euclidean')
    elif metric == 'manhattan':
        matrix = pairwise_distances(vect, vect, metric='manhattan')
    elif metric == 'jaccard':
        matrix = pairwise_distances(vect.todense(), vect.todense(), metric='jaccard')
    elif metric == 'hamming':
        matrix = pairwise_distances(vect.todense(), vect.todense(), metric='hamming')
    
    score_series = pd.Series(matrix[idx]).sort_values(ascending = False)
    score_series = score_series.drop(idx)
    
    top_10_indices = list(score_series.iloc[0:k_primeros].index)
    
    for i in top_10_indices:
        if score_series[i] >= similarity_threshold:
            recommended.append(title_list[i])
        
    return recommended, list(score_series[0:k_primeros])

def process_recommendations(df, vect_ref, metric, index_var, alpha, k_primeros):

    list_policy_index = []
    list_policy = []
    list_recommendation_index = []
    list_recommendation = []
    list_score = []
    
    for policy in df[index_var]:
        #policy = df['titulo_caso'][9]
        print(f"TOP 10 RECOMMENDATIONS FOR POLICY: [{str(policy[:80])}...]:")
        i = 1
        for recommendation, score in zip(*recommend(df, vect_ref, policy, index_var, list(df[index_var]), metric, alpha, k_primeros)):
            list_policy_index.append(df.loc[df['titulo_caso'] == policy]['index'].values[0])
            list_policy.append(policy)
            list_recommendation_index.append(df.loc[df['titulo_caso'] == recommendation]['index'].values[0])
            list_recommendation.append(recommendation)
            list_score.append(score)
            print(f"\t[{i:<2}] Title: {recommendation[:30]:<30}... Similarity Score: {str(round(score, 8)):<12}")
            i = i+1
        print("\n")
        
    rec_df = pd.DataFrame()
    rec_df['policy_index'] = list_policy_index
    rec_df['policy'] = list_policy
    rec_df['recommendation_index'] = list_recommendation_index
    rec_df['recommendation'] = list_recommendation
    rec_df['score'] = list_score
    
    return rec_df

<p></p>
<div>
<img src="quarts.png" width="500"/>
</div>

In [3]:
df = read_file()
df = bow(df)

target_list = ['problema_lema', 'solucion_lema', 'meta_lema', 'resultado_lema', 'Bag_of_words', 'resumen_lema']
target = target_list[5]
vect = compute_vectors(df, target)

vect_ref = vect[3]
metric = 'cosine_sim'
index_var = 'titulo_caso'
alpha = 0.6745
rec_df = process_recommendations(df, vect_ref, metric, index_var, alpha, k_primeros=3)
rec_df.to_csv(f"../../../data/model_ready/recommendations_{target[:-5]}.csv", index=False, encoding='utf8')

TOP 10 RECOMMENDATIONS FOR POLICY: [Juan Manuel Carreras anuncia paquete por más de 3200 mdp para SLP ante contingen...]:
	[1 ] Title: El Programa de Apoyo Económico... Similarity Score: 0.18467429  
	[2 ] Title: Coronavirus: Senado aprueba pr... Similarity Score: 0.13240626  
	[3 ] Title: Programa de Seguridad Alimenta... Similarity Score: 0.10484993  


TOP 10 RECOMMENDATIONS FOR POLICY: [Uber Eats apoya a las mipymes de Aguascalientes...]:
	[1 ] Title: https://www.ocregister.com/202... Similarity Score: 0.17746169  
	[2 ] Title: ¿Qué actividades se han suspen... Similarity Score: 0.15131481  
	[3 ] Title: Jalisco ordena el cierre de pl... Similarity Score: 0.10192467  


TOP 10 RECOMMENDATIONS FOR POLICY: [Control and Prevention for Solid Waste Management Workers and Employers...]:
	[1 ] Title: ¿Qué actividades se han suspen... Similarity Score: 0.29111844  
	[2 ] Title: Intensifica Salum medidas cont... Similarity Score: 0.14650888  
	[3 ] Title: Aprueba Cabildo de Oaxaca de J... S