In [1]:
import os
import re

import ftfy
import numpy as np
import pandas as pd
from langdetect import detect

In [2]:
# Machine-specific checkout of the project. The relative paths used by the
# notebook (e.g. './data/...') are resolved against the working directory.
PROJECT_ROOT = 'C:\\Users\\rmartinez4\\Box\\Personal Git\\Drug-Epidemic-Tracking-With-Social-Media'

# Only switch directories when that checkout actually exists. On any other
# machine an unconditional chdir raises FileNotFoundError before anything runs;
# here the notebook instead keeps the directory it was launched from.
if os.path.isdir(PROJECT_ROOT):
    os.chdir(PROJECT_ROOT)
else:
    print('Project root not found, keeping working directory:', os.getcwd())

In [3]:
# Relative path of the raw news CSV; it is resolved against the current working
# directory. The output path used when saving is derived from this value.
NEWS = './data/usnewspaper-Jun-Aug.csv'

### read data

In [4]:
# Load the raw CSV located at NEWS into a DataFrame.
df = pd.read_csv(NEWS)

In [5]:
# Retain just these four columns; every other column is discarded.
df = df.loc[:, ['publishdate', 'src', 'title', 'news']]

In [6]:
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,publishdate,src,title,news
0,2020-06-01,https://www.prnewswire.com/news-releases/,A Natural Partnership: PopSockets & Burt`s Bee...,PopGrip Lips x Burt`s Bees will provide consum...
1,2020-06-01,http://www.pionline.com,Continued market volatility contributes to 27....,
2,2020-06-01,http://www.denverpost.com/,"Comments on: Mass gatherings, erosion of trust...",
3,2020-06-01,http://www.denverpost.com/,Comments on: How I got hooked on sports: My “W...,
4,2020-06-01,https://www.post-gazette.com/,Gene Collier: Are we bored enough yet to bet o...,


In [7]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

publishdate    object
src            object
title          object
news           object
dtype: object

In [8]:
# Count missing values per column.
df.isna().sum()

publishdate        0
src                0
title            833
news           52953
dtype: int64

### remove rows with a missing (NaN) 'title' or 'news'

In [9]:
# Keep a row only when both text fields are present; `df` itself is not modified.
df_clean = df.loc[df[['news', 'title']].notna().all(axis=1)]

In [10]:
# Re-count missing values per column on the filtered frame to verify the drop.
df_clean.isna().sum()

publishdate    0
src            0
title          0
news           0
dtype: int64

### remove non-English articles (language is detected from the title)

In [11]:
def detect_language(article):
    """Return the language code that langdetect assigns to ``article``.

    Parameters
    ----------
    article : str
        Text to classify.

    Returns
    -------
    str
        The code returned by ``langdetect.detect`` (e.g. ``'en'``), or
        ``'unknown'`` when langdetect cannot extract any features from the
        text (for example an empty string or digits/punctuation only).
    """
    # Function-scope imports keep this cell self-contained; langdetect is
    # already a dependency of the notebook.
    from langdetect import DetectorFactory
    from langdetect.lang_detect_exception import LangDetectException

    # langdetect is non-deterministic by default; fixing the seed makes the
    # result for a given text reproducible across runs.
    DetectorFactory.seed = 0

    try:
        return detect(article)
    except LangDetectException:
        # A single undetectable text must not abort a pass over the whole corpus.
        return 'unknown'

In [12]:
%%time
df_clean = df_clean[df_clean['title'].map(detect_language)=='en']

### repair mis-encoded text with ftfy (e.g. UTF-8 that was decoded as ISO-8859-1/latin1)

In [13]:
%%time

t_list, n_list = [], []
idx = 0
for t, n in zip(df_clean['title'], df_clean['news']):
    
    try:
        t_value = ftfy.fix_text(t)
    except:
        t_value = t
    
    try:
        n_value = ftfy.fix_text(n)
    except:
        n_value = n
        
    t_list.append(t_value)
    n_list.append(n_value)
    
#     print(idx, end=' ')
    idx+=1
    
df_clean['title'] = t_list
df_clean['news'] = n_list

### remove duplicates

In [14]:
# Keep the first occurrence of each distinct article body and discard repeats.
df_clean = df_clean[~df_clean.duplicated(subset=['news'])]

### convert text to lower case

In [15]:
# Lower-case both text columns. assign() returns a new frame rather than
# writing into the de-duplicated result, which avoids the SettingWithCopyWarning
# that column assignment on a filtered frame can raise.
df_clean = df_clean.assign(
    title=df_clean['title'].str.lower(),
    news=df_clean['news'].str.lower(),
)

### keep only articles relevant to drugs, addiction, or the epidemic

In [16]:
"""
sources:
    - https://www.drugabuse.gov/publications/media-guide/glossary
    - https://www.cdc.gov/drugoverdose/opioids/terms.html
    
"""

# Search terms for the relevance filter; presumably taken from the
# drugabuse.gov glossary listed in the sources note above (TODO confirm).
# Casing does not matter here: every term is lower-cased when the search
# pattern is built. Entries are a mix of single words, acronyms and phrases.
relev_NIH_words = ['Abstinence','Addiction','Agonist','Amphetamine','Anabolic',
                         'androgenic','steroids','Analgesics','Anesthetic','Antagonist','Barbiturate',
                         'Basal','ganglia','Benzodiazepine','Brainstem','Buprenorphine',
                         'Cannabidiol','CBD','Cannabinoid','receptor','Cannabinoids','Cannabis',
                         'Cardiovascular system','Central nervous system','CNS','Cerebellum',
                         'Cerebral cortex','Cerebral hemispheres','Cerebrum','CNS depressants',
                          'Cognition','Cognitive-behavioral therapy','CBT','Comorbidity',
                          'Contingency management','Craving','Dependence','Detoxification',
                          'Dopamine','Drug abuse','Drugged driving','Electronic cigarette',
                          'Flashback','Hallucinations','Hippocampus','Hypothalamus',
                          'Illicit','Impulsivity','Injection drug use','IDU','Intranasal',
                          'Limbic system','Mental disorder','Methadone','Motivational Enhancement Therapy',
                          'Naloxone','Naltrexone','Neonatal abstinence syndrome','NAS','Neurobiology',
                          'Neuron','nerve cell','Neurotransmitter','Norepinephrine','Nucleus accumbens',
                          'Opioid','opioids','Opioid receptors','Overdose','Paranoia','Partial agonist',
                          'Pharmacodynamics','Pharmacokinetics','Pharmacotherapy','Prefrontal cortex',
                          'Prescription drug misuse','Psychedelic drug','Psychoactive',
                          'Psychosis','Psychotherapeutics','Psychotropic','Receptor','Recovery',
                          'Relapse','Remission','Reward','Reward system','brain reward system',
                          'Risk factors','Route of administration','Self-medication','Serotonin',
                          'Stigma','Substance use disorder','sud','thc','Tolerance','Vaping','vape',
                          'Ventral striatum','Ventral tegmental area','Withdrawal','therapy'
]


# Search terms for the relevance filter (CDC opioid terminology). Casing does
# not matter: every term is lower-cased when the search pattern is built.
# 'milligram' was previously misspelled 'miligram', so that entry could never
# match the text "morphine milligram equivalents".
relev_CDC_words = [
    'Acute', 'pain', 'Analgesics', 'Analog', 'Benzodiazepines',
    'Chronic pain', 'Drug misuse', 'Drug addiction', 'Extended-release',
    'long-acting', 'er', 'la', 'opioids', 'Fentanyl', 'Heroin', 'Illicit drugs',
    'Immediate-release opioids', 'medication', 'assisted', 'treatment', 'mat',
    'morphine', 'milligram', 'equivalents', 'mme', 'Naloxone', 'Narcotic drugs',
    'nonmedical use', 'Non-opioid therapy', 'Non-pharmacologic therapy', 'opiates',
    'Opioid analgesics', 'Natural opioid analgesics', 'Semi-synthetic opioid analgesics',
    'Methadone', 'Synthetic opioid analgesics', 'synthetic', 'analgesics',
    'Opioid use disorder', 'disorder', 'oud', 'dependence', 'Physical dependence',
    'Prescription drug monitoring programs', 'prescription', 'monitor',
    'Tolerance',
]


other_relev_words = ['health','addicted','epidemic']

# Lower-case before de-duplicating so that e.g. 'Receptor' and 'receptor'
# collapse into a single entry; sorting keeps the generated pattern identical
# from run to run (set iteration order is not stable).
words_list = sorted(
    {term.lower() for term in relev_NIH_words + relev_CDC_words + other_relev_words}
)

# Match whole terms only. A bare 'a|b|c' alternation does substring matching,
# so short entries such as 'er', 'la' or 'mat' hit almost every article
# ('other', 'last', 'material') and the filter keeps nearly everything.
# re.escape guards against regex metacharacters inside a term. Inflected forms
# are matched only when listed explicitly (as 'opioid'/'opioids' already are).
bag_of_words = r'\b(?:' + '|'.join(re.escape(term) for term in words_list) + r')\b'

# Both text columns are lower-cased before this cell, so a case-sensitive
# search is sufficient. na=False treats a missing value as "no match".
title_matches = df_clean['title'].str.contains(bag_of_words, regex=True, na=False)
news_matches = df_clean['news'].str.contains(bag_of_words, regex=True, na=False)

# An article is relevant when its title or its body mentions at least one term.
total_matches = (title_matches | news_matches).values

df_clean = df_clean[total_matches]

### extract source name from URL (src)

In [17]:
def extract_source_name(url):
    """Return the short site name for an article URL.

    The host is taken from the URL, the 'www.' / 'www-1.' markers are removed,
    and the first dot-separated label is returned, e.g.
    'https://www.prnewswire.com/news-releases/' -> 'prnewswire'.

    URLs without a scheme ('example.com/path') are handled as well; indexing
    the third '/'-separated piece would raise IndexError or pick a path
    segment for them.
    """
    # Drop the scheme when present (and any leading slashes of a
    # protocol-relative URL), then keep everything before the first '/'.
    host = url.split('://', 1)[-1].lstrip('/').split('/', 1)[0]
    return host.replace('www.', '').replace('www-1.', '').split('.')[0]


# assign() returns a new frame instead of adding a column to the filtered
# frame, which avoids pandas' SettingWithCopyWarning.
df_clean = df_clean.assign(src_name=df_clean['src'].map(extract_source_name))

### save to file

In [18]:
# Derive the output path from the input path by swapping the '.csv' suffix for
# '_cleaned.csv', then write the frame without its index column.
cleaned_csv_path = NEWS.replace('.csv', '_cleaned.csv')
df_clean.to_csv(cleaned_csv_path, index=False)