In [None]:
import requests
import urllib.parse
from bs4 import BeautifulSoup
from datetime import datetime

import pandas as pd
import os

In [None]:
# Keyword groups used to build the search query.
# Terms inside one group are alternatives (OR-ed together); the groups
# themselves are combined with AND by generate_search_query.
# Every term should appear only once per group: a repeated term just adds a
# redundant clause to the generated query.
keyword_lists = {
    'relationship': [
        'relationship building',
        'relationship-building',
        'relationship maintenance',
        'peer relationship',
        'meaningful',
        'genuine',
        'closer',
        'interpersonal relationship',
        'personal relationship',
        'friendship',
        'intimate relationship',
        'close relationship',
        'closeness',
        'relationship',
        'relatedness',
        'social connection',
        'companionship',
        'social bond',
        'co-presence',
        'romantic relationship',
        'inner circle',
        'social capital',
        'bonding',
        'peer interaction',
        'social engagement',
        'social penetration',
        'online relationship',
        'self-disclosure',
        'emotional connection',
        'empathy',
        'empathetic',
        'trust building',
        'intimacy',
        'social presence',
        'psychological safety',
        'mutual understanding',
    ],
    'design': [
        'design',
        'designing',
        'designer',
        'designed',
        'prototype',
        'prototyping',
        'deploy',
        'deployed',
        'deployment',
        'experiment',
    ],
    'platform': [
        'online conversation',
        'online communication',
        'social media',
        'social network',
        'social networking site',
        'SNS',
        'messaging',
        'digital message',
        'communication technology',
        'communication technologies',
        'networking platform',
        'digital community',
        'digital communities',
        'virtual community',
        'virtual communities',
        'online platform',
        'online chat',
        'private chat',
        'social platform',
        'group chat',
        'microblogging',
        'online interaction',
        'Facebook',
        'Twitter',
        'Instagram',
        'Snapchat',
        'TikTok',
        'WhatsApp',
        'WeChat',
        'Telegram',
        'Discord',
    ],
}


In [None]:
def generate_search_query(lists):
    """Build a boolean search query from groups of keywords.

    Each keyword becomes ``("term" OR "terms")`` (the term plus a naive
    plural formed by appending ``s``). Keywords of one group are joined with
    ``OR``, the groups are joined with ``AND``, and the result is wrapped in
    one more pair of parentheses.

    Args:
        lists: mapping of group name -> iterable of keyword strings.
            Empty groups are ignored; keywords repeated within a group are
            emitted once, in first-seen order.

    Returns:
        The query string; ``'()'`` when no group contributes a keyword.
    """
    group_clauses = []
    for terms in lists.values():
        # dict.fromkeys drops repeated keywords while preserving their order,
        # so a duplicated entry does not produce a redundant OR clause.
        unique_terms = list(dict.fromkeys(terms))
        if not unique_terms:
            continue
        alternatives = " OR ".join(f'("{term}" OR "{term}s")' for term in unique_terms)
        group_clauses.append("(" + alternatives + ")")

    return "(" + " AND ".join(group_clauses) + ")"


# Build the boolean query from the keyword groups and print it for inspection.
search_query = generate_search_query(lists=keyword_lists)
print(search_query)

In [None]:
def convert_query_to_url_encoded_string(query):
    """Percent-encode ``query`` for use inside a search URL.

    The query is passed through ``urllib.parse.quote`` and then three escapes
    are rewritten: ``%2A`` back to ``*``, and both ``%20`` (space) and ``%2B``
    (literal plus) to ``+``.
    """
    substitutions = (
        ('%2A', '*'),
        ('%20', '+'),
        ('%2B', '+'),
    )
    encoded = urllib.parse.quote(query)
    for escaped, plain in substitutions:
        encoded = encoded.replace(escaped, plain)
    return encoded


# URL-encode the search query and print the encoded form for inspection.
encoded_query = convert_query_to_url_encoded_string(query=search_query)
print(encoded_query)

In [None]:
# Test URL: search the Abstract field only, so the link can be opened by hand
# to check the query.
url = (
    'https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl'
    '&AllField=' 'Abstract' '%3A' f'{encoded_query}'
    '&ContentItemType=research-article&startPage=&PublisherRaw=PUB27'
)
print(url)

In [None]:
# Scrape ACM Digital Library search results for every field in `ls_fields`.
# Each parsed hit is appended to `data` as a dict; `df` is the DataFrame built
# from `data` with duplicate titles removed (the same paper can match in more
# than one search field).
data = []

ls_fields = ['Keyword', 'Title', 'Abstract']

ACM_BASE_URL = 'https://dl.acm.org'
# Filters shared by the hit-count request AND every result-page request, so the
# page count derived from the former matches the listing served by the latter.
SEARCH_FILTERS = '&ContentItemType=research-article&PublisherRaw=PUB27'
PAGE_SIZE = 50
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the cell


def _search_url(field, page_params):
    """Return the doSearch URL for `field` and the encoded query, ending with `page_params`."""
    return (ACM_BASE_URL + '/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl'
            + '&AllField=' + field + '%3A' + encoded_query + SEARCH_FILTERS + page_params)


def _fetch_soup(target_url):
    """Download `target_url` and parse it with lxml; raises requests.RequestException on failure."""
    response = requests.get(target_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


for field in ls_fields:
    print(field)
    try:
        count_page = _fetch_soup(_search_url(field, '&startPage='))
    except requests.RequestException as e:
        print(f'Skipping field {field}: search request failed ({e})')
        continue

    hits = count_page.find('span', attrs={'class': 'hitsLength'})
    try:
        num_res = int(hits.text.replace(',', '').strip())
    except (AttributeError, ValueError):
        # No hit counter on the page (or not a number): nothing reliable to paginate over.
        print(f'Skipping field {field}: could not read the number of results')
        continue

    # Ceiling division: an exact multiple of PAGE_SIZE must not request an extra, empty page.
    num_pages = (num_res + PAGE_SIZE - 1) // PAGE_SIZE
    print("N: ", num_res)
    print("num_pages: ", num_pages)

    for i in range(num_pages):
        print(i)
        try:
            listing = _fetch_soup(_search_url(field, '&pageSize=' + str(PAGE_SIZE) + '&startPage=' + str(i)))
        except requests.RequestException as e:
            print(f'Skipping page {i} of field {field}: {e}')
            continue

        skipped = 0
        for entry in listing.find_all("div", attrs={"class": "issue-item__content"}):
            try:
                title = entry.find('h5', attrs={'class': 'issue-item__title'})
                author = entry.find('ul', attrs={'class': 'rlist--inline'})
                cite = entry.find('span', attrs={'class': 'citation'})
                conf = entry.find('span', attrs={'class': 'epub-section__title'})
                years = entry.find('span', attrs={'class': 'dot-separator'})
                article_url = ACM_BASE_URL + entry.a['href']

                # Prefer the full abstract from the article page; fall back to the
                # snippet shown in the result listing when it is unavailable.
                abst = None
                try:
                    full_abstract = _fetch_soup(article_url).find('div', attrs={'class': 'abstractInFull'})
                    if full_abstract is not None:
                        abst = full_abstract.find('p')
                except requests.RequestException as e:
                    print(f'Could not fetch {article_url}: {e}')
                if abst is None:
                    abst = entry.find('div', attrs={'class': 'issue-item__abstract'})

                data.append({"title": title.text.replace('[PDF]', ''),
                             "url": article_url,
                             'authors': author.text.replace('\n', ''),
                             'abstract': abst.text.replace('\n', ''),
                             'conference': conf.text,
                             'citation': cite.text,
                             # assumes the text looks like '<Month> <Year>, ...' — TODO confirm
                             'year': years.text.split(',')[0].split(' ')[1],
                             'search_field': field})
            except (AttributeError, IndexError, KeyError, TypeError):
                # An expected element is missing from this entry: skip it, but keep count
                # so dropped records are visible instead of silently lost.
                skipped += 1
                continue
        if skipped:
            print(f'  skipped {skipped} entries with missing fields on page {i}')
    print()

df = pd.DataFrame(data)
# An empty frame has no 'title' column, so drop_duplicates('title') would raise KeyError.
if not df.empty:
    df = df.drop_duplicates('title')


In [None]:
# Number of scraped records, before duplicate titles are dropped.
len(data)

In [None]:
# Number of rows in the DataFrame built from the scraped records.
len(df)

In [None]:
# Display the results table.
df

In [None]:
# Spot-check: abstract text of the first row (by position).
df['abstract'].iloc[0]

In [None]:
# Save the results to a CSV file named after today's date.
# If today's file already exists, merge with it and drop duplicate titles
# instead of blindly appending: re-running the notebook on the same day would
# otherwise write every row (and its index value) a second time.
today_date = datetime.now().strftime('%Y-%m-%d')
filename = f'ACM_data_{today_date}.csv'

if os.path.exists(filename):
    try:
        # dtype=str keeps previously saved values as written (e.g. the year column).
        previous = pd.read_csv(filename, index_col=0, dtype=str)
    except pd.errors.EmptyDataError:
        previous = pd.DataFrame()
    combined = pd.concat([previous, df], ignore_index=True)
    if 'title' in combined.columns:
        combined = combined.drop_duplicates('title')
    combined.to_csv(filename, header=True, index=True)
else:
    df.to_csv(filename, header=True, index=True)
print(f'Data saved to {filename}')
