In [289]:
import numpy as np
import pandas as pd
import ftfy as fy
import nltk
import string
import py_stringsimjoin as ssj
import py_stringmatching as sm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  
from langdetect import detect

# Check title and news data (Ivan's addition)
Here we explore the filtered titles and news articles. Primarily we check for encoding issues.

Load the pre-filtered data

In [290]:
df_clean_test = pd.read_csv('./data/usnewspaper-Jun-Aug_cleaned.csv')

In [291]:
df_clean_test.head()

Unnamed: 0,publishdate,src,title,news
0,2020-06-01,https://www.prnewswire.com/news-releases/,a natural partnership: popsockets & burt`s bees collaborate on a new product,popgrip lips x burt`s bees will provide consumers with a two-in-one product that features the cl...
1,2020-06-01,http://www.reuters.com/,keep your distance: people queue for school and ikea in england,"warrington, england (reuters) - thousands of people across england queued up for school and ikea..."
2,2020-06-01,http://www.aljazeera.com/,mapping us cities where george floyd protests have erupted,"demonstrations have erupted in dozens of us cities after george floyd, an unarmed black man, die..."
3,2020-06-01,https://www.dallasnews.com/,enjoy the dallas museum of art’s exploration of home from the comfort of yours,"what does it mean for a house to be a home?\n\nfor one thing, it means that a structure built of..."
4,2020-06-01,https://www.dallasnews.com/,"target opens some dallas stores as protesters move overnight, causing damage to west end and upt...",the protests that have spread around the country are closing stores at a time when retailers are...


## Remove empty NaN rows

In [292]:
df_clean_test.isna().sum()
df_clean_test = df_clean_test.dropna()
df_clean_test.isna().sum()

publishdate    0
src            0
title          0
news           0
dtype: int64

Replace line breaks with empty spaces from news and titles if they exist.

In [293]:
df_clean_test['news'] = df_clean_test['news'].str.replace('\n\n',' ')
df_clean_test['title'] = df_clean_test['title'].str.replace('\n\n',' ')

## Remove non english articles (function)

In [294]:
def detect_language(article):
    language = detect(article)
    return language

## Import Drug Name List
Here we will use the "common name" or "generic name" to make comparisons"

In [321]:
drug_names = pd.read_csv('./data/cleaned_files/drug_names_additional_column.csv')
drug_names.head(5)

Unnamed: 0,name,common_name,name_type,full_common_name
0,Apache,fentanyl,street_name,fentanyl
1,Birria,fentanyl,street_name,fentanyl
2,Blonde,fentanyl,street_name,fentanyl
3,Blue Diamond,fentanyl,street_name,fentanyl
4,Blue Dolphin,fentanyl,street_name,fentanyl


## Prepare Data for Join

In [322]:
# Use only the common_name column for the dataframe, can be expanded to keep the rest of the columns.
drug_names = pd.DataFrame(set(drug_names['common_name']),columns=['common_name'])

# Generate keys
drug_names['keyleft'] = list(range(len(drug_names['common_name'])))
df_clean_test['keyright'] = list(range(len(df_clean_test['news'])))

## Execute the Join - (Change method and threshold here)

In [323]:
# Choose join method
JCjoin = ssj.join.cosine_join.cosine_join
# Create the alphabetic tokenizer
alphabet_tok_set = sm.AlphabeticTokenizer(return_set=True)
# Execute the Join
match_B = JCjoin(drug_names, df_clean_test, 'keyleft', 'keyright', 'common_name', 'news', alphabet_tok_set, 0.05, 
                                l_out_attrs=['common_name'], r_out_attrs=['publishdate','src','title','news'], n_jobs=-1);

## Sort by similarity coefficient value and also remove non-english articles based on the title (faster than looking at the whole article)

In [324]:
sorted_df = match_B.sort_values(by=['_sim_score'])
sorted_df = sorted_df[sorted_df['r_title'].map(detect_language)=='en']

## Export to CSV and Excel format. It may not work if size is too large

In [325]:
sorted_df.to_excel('matched_output.xlsx',index=False)

In [326]:
sorted_df.to_csv('matched_output.csv',index=False)