In [12]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# Load the researcher roster and the article catalogue.
# NOTE(review): these are absolute, user-specific paths; the cell only runs on a machine
# that has this exact folder layout.
researchers_df = pd.read_csv('C:/Users/sarah/Downloads/researchers.csv')
articles_df = pd.read_csv('C:/Users/sarah/Downloads/articles.csv')

# Preprocess data: flatten the ', '-separated areas of expertise into one space-separated string
researchers_df['Combined_Areas'] = researchers_df['Areas'].str.replace(', ', ' ', regex=False)

# Load pre-trained Sentence-BERT model
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# The encoder expects text. A missing abstract is read by pandas as a float NaN, so blank
# it out and coerce every entry to str before encoding.
abstract_texts = articles_df['ABSTRACT TEXT'].fillna('').astype(str).tolist()

# Encode researchers' areas of expertise and article abstracts
areas_embeddings = model.encode(researchers_df['Combined_Areas'].tolist(), convert_to_tensor=True)
abstracts_embeddings = model.encode(abstract_texts, convert_to_tensor=True)

# Cosine similarity of every researcher (rows) against every article abstract (columns)
similarity_matrix = util.pytorch_cos_sim(areas_embeddings, abstracts_embeddings)

# List that will collect one match record (a dict) per researcher
matches = []






  from tqdm.autonotebook import tqdm, trange


In [13]:
# For each researcher, select the single best-matching article (highest cosine similarity).
# similarity_matrix has one row per researcher and one column per article, so argmax along
# dim=1 yields one article position per researcher.
best_article_positions = similarity_matrix.argmax(dim=1).tolist()

# Start from an empty list so that re-running this cell cannot append duplicate rows.
matches = []
# Pair rows by position rather than by index label, so the lookup does not depend on
# researchers_df carrying a default 0..n-1 index.
for (_, researcher), article_pos in zip(researchers_df.iterrows(), best_article_positions):
    # articles_df is the frame whose abstracts were encoded into similarity_matrix's columns.
    matched_article = articles_df.iloc[article_pos]
    matches.append({'Name': researcher['Name '],  # column key is written with a trailing space
        'Contact': researcher['Contact'],
        'Article Title': matched_article['ARTICLE TITLE'],
        'Article DOI': matched_article['DOI'],
        'Article Abstract': matched_article['ABSTRACT TEXT']})

In [14]:
# Tabulate the collected match records: one row per record, one column per dict key.
matches_df = pd.DataFrame(data=matches)
# Write pandas' text rendering of the table to the cell output.
print(str(matches_df))

                  Name                               Contact  \
0    Stephanie Leiser                     schmidts@umich.edu   
1     Samuel Bagenstos                    sambagen@umich.edu   
2         Michael Barr                      msbarr@umich.edu   
3         Jenna Bednar                     jbednar@umich.edu   
4       Kamissa Camara                     kcamara@umich.edu   
..                 ...                                   ...   
596         Junwei Xia                    jxia@mays.tamu.edu   
597   Kathleen Herbohn          k.herbohn@business.uq.edu.au   
598     John Gallemore  john_gallemore@kenan-flagler.unc.edu   
599       Edward Mayde                 Edward_Maydew@unc.edu   
600    Richard Agbanyo       agbanyo.richard@upsamail.edu.gh   

                                         Article Title  \
0    Determinants of Debt Concentration at the Stat...   
1    Preserving the Public Interest in Highway Publ...   
2    Determinants of Debt Concentration at the Stat...   

In [15]:
# Save the match table as CSV; index=False leaves the row index out of the file.
algo2_csv_path = 'C:/Users/sarah/Downloads/algo2.csv'
matches_df.to_csv(algo2_csv_path, index=False)

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import tensorflow_hub as hub

# Read the researcher roster and the article catalogue
researchers = pd.read_csv('C:/Users/sarah/Downloads/researchers.csv')
articles = pd.read_csv('C:/Users/sarah/Downloads/articles.csv')

# Lower-case both text fields; abstracts are coerced to str before lower-casing
researchers['Areas'] = researchers['Areas'].str.lower()
articles['ABSTRACT TEXT'] = articles['ABSTRACT TEXT'].astype(str).str.lower()

# Flatten the ', '-separated areas into one space-separated string per researcher
researchers['Combined_Areas'] = researchers['Areas'].map(lambda areas: areas.replace(', ', ' '))


# Load Universal Sentence Encoder
tfhub_handle_encoder = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.load(tfhub_handle_encoder)

# Embed the combined areas and the abstracts with the encoder.
# NOTE(review): within this cell these embeddings are not used by the similarity
# computed below, which is built from TF-IDF vectors only.
combined_areas = researchers['Combined_Areas'].tolist()
areas_embeddings = embed(combined_areas)
article_abstracts = articles['ABSTRACT TEXT'].tolist()
article_embeddings = embed(article_abstracts)

# TF-IDF: the vocabulary is fitted on the researchers' areas, then the abstracts are
# projected onto that same vocabulary
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(researchers['Areas'])
article_vectors = tfidf_vectorizer.transform(articles['ABSTRACT TEXT'])

# Cosine similarity between the TF-IDF vectors of the areas (rows) and of the abstracts (columns)
similarity_matrix = cosine_similarity(tfidf_matrix, article_vectors)

# Position of the highest-scoring article for each researcher row
match = similarity_matrix.argmax(axis=1)

# Build one record per researcher, pairing it with its best-scoring article
matches = []
for (_, researcher), article_pos in zip(researchers.iterrows(), match):
    matched_article = articles.iloc[article_pos]
    record = {
        'Name': researcher['Name '],
        'Contact': researcher['Contact'],
        'Article Title': matched_article['ARTICLE TITLE'],
        'Article DOI': matched_article['DOI'],
        'Article Abstract': matched_article['ABSTRACT TEXT'],
    }
    matches.append(record)


In [9]:
# Tabulate the collected match records: one row per record, one column per dict key.
df = pd.DataFrame(data=matches)
# Write pandas' text rendering of the table to the cell output.
print(str(df))

                  Name                               Contact  \
0    Stephanie Leiser                     schmidts@umich.edu   
1     Samuel Bagenstos                    sambagen@umich.edu   
2         Michael Barr                      msbarr@umich.edu   
3         Jenna Bednar                     jbednar@umich.edu   
4       Kamissa Camara                     kcamara@umich.edu   
..                 ...                                   ...   
596         Junwei Xia                    jxia@mays.tamu.edu   
597   Kathleen Herbohn          k.herbohn@business.uq.edu.au   
598     John Gallemore  john_gallemore@kenan-flagler.unc.edu   
599       Edward Mayde                 Edward_Maydew@unc.edu   
600    Richard Agbanyo       agbanyo.richard@upsamail.edu.gh   

                                         Article Title  \
0       The Political Economy of Land Finance in China   
1    Why Lash Yourself to the Mast? The Case of the...   
2    Measuring the Financial Position of Municipali...   

In [10]:
# Save the match table as CSV; index=False leaves the row index out of the file.
algo3_csv_path = 'C:/Users/sarah/Downloads/algo3.csv'
df.to_csv(algo3_csv_path, index=False)