In [None]:
# importing libraries

import googlesearch as g
import time
import random
import requests
from urllib.parse import urlparse

In [None]:
# Mount Google Drive at /content/drive so the "<keyword> Article Links.txt"
# files under '/content/drive/My Drive/' can be read and rewritten by the
# readFile / writeFile functions in this notebook.
# NOTE(review): presumably this only works inside a Google Colab runtime
# (google.colab is imported here) -- confirm before running elsewhere.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Capture the current date and time once, formatted as the two header lines
# that writeFile puts at the top of every txt file.

from datetime import datetime

now = datetime.now()  # timestamp of when this cell was executed
dt = now.strftime("%d/%m/%Y %H:%M:%S")  # "dd/mm/YYYY HH:MM:SS"
dt_string = str(dt)  # strftime already returns a str; kept under its original name
dt_list = dt_string.split()  # [date part, time part]

date = f"Date: {dt_list[0]}\n"  # first header line
# NOTE: this rebinds the name `time`, hiding the `time` module imported in the
# first cell. writeFile reads this global, so the name is kept as-is.
time = f"Time: {dt_list[1]}\n"  # second header line

In [None]:
# Search topics, one string per topic.
# Each entry is title-cased because it is used twice: as part of the google
# search term and as the prefix of the txt file name ("<keyword> Article Links.txt").

keywords = [
    "Corporate Health",
    "Top News",
    "Menstrual Health",
    "Athletes",
    "Research",
]

In [None]:
# First-time setup: fill each keyword's (empty) txt file with a first round of links.
# NOTE: populateOriginalFile is defined in a later cell of this notebook, so that
# cell (and the helper cells it relies on) must have been run before this one.

for topic in keywords:
  populateOriginalFile(topic)

In [None]:
# Regular update: for each keyword, merge newly found links into the list
# already stored in its txt file.
# NOTE: readAndReplace is defined in a later cell of this notebook, so that
# cell (and the helper cells it relies on) must have been run before this one.

for topic in keywords:
  readAndReplace(topic)

In [None]:
# function to create the original list of links and add it to the empty txt file
# input: string topic to be used in keyword google search
# output: none

def populateOriginalFile(topic):
  """Search for `topic`, turn the links into one string and write it to the topic's txt file."""
  links_text = listToString(generateLinks(topic))
  writeFile(topic, links_text)

In [None]:
# function to refresh a topic's txt file: read the stored links, merge in
# newly found ones, and write the combined list back
# input: string topic to be used in keyword google search
# output: none

def readAndReplace(topic):
    """Read the topic's current links, add new ones via addLinks, and rewrite the txt file."""
    existing_links = readFile(topic)
    merged_links = addLinks(topic, existing_links)
    writeFile(topic, listToString(merged_links))

In [None]:
# function to generate the initial list of articles
# input: string keyword used for google search
# output: list of url links, at most one per domain, in random order

def generateLinks(keyword):
  """Search google for "women's health <keyword>" and keep the first link seen per domain."""
  query = "women's health " + keyword
  results = list(g.search(query, stop=1000, lang='en'))  # materialise all hits first

  # keep only the first link from each domain so the same source is not repeated
  seen_hosts = set()
  picked = []
  for link in results:
    host = urlparse(link).netloc
    if host in seen_hosts:
      continue
    seen_hosts.add(host)
    picked.append(link)

  random.shuffle(picked)  # randomise order so the list is not in search-rank order
  return picked

In [None]:
# function to convert the list of urls to a string format so it can be added to the txt file
# input: list url_list of url strings
# output: string url_string with the urls separated by ", "

def listToString(url_list):
    """Join the urls with ", " so the text still looks like a list.

    An empty list gives an empty string. str.join replaces the previous
    repeated string concatenation and trailing-separator slice; the result
    is the same.
    """
    return ", ".join(url_list)

In [None]:
# function to read txt files, store the contents, and convert from a string to list format so the list can be updated
# input: string title to locate the correct txt file
# output: list url_current which is list of urls contained in the txt file

def readFile(title):
  """Return the links stored in '<title> Article Links.txt' on the mounted drive.

  The file is expected to hold a date line, a time line, and then the links
  separated by ", " (the layout produced by writeFile).

  input: string title to locate the correct txt file
  output: list of url strings; an empty list when there are no links after
          the two header lines (previously this case returned [""])
  raises: whatever open() raises when the file cannot be opened
  """
  with open('/content/drive/My Drive/' + title + ' Article Links.txt', 'r') as f:
    # skip the date and time header lines; the default keeps a file with
    # fewer than two lines from raising StopIteration
    next(f, None)
    next(f, None)
    url_read = f.read()  # everything after the header, as one string
  # "".split(", ") yields [""], so drop empty entries; strip() removes a
  # trailing newline or stray spaces that would otherwise stick to a url
  url_current = [url.strip() for url in url_read.split(", ")]
  return [url for url in url_current if url]

In [None]:
# function to take in the current list of articles, add new links if not already listed
# input: string keyword, list url_old which is the current list of articles in the txt file
# output: list url_all which is the updated list of old and new articles, no repeats

def addLinks(keyword, url_old):
  """Merge newly searched links for `keyword` into the existing links.

  input: string keyword used in the google search,
         list url_old of links currently stored for that keyword
  output: a new, shuffled list holding every link from url_old plus each
          newly found link whose domain is not already represented.
          url_old itself is left untouched.
  """
  # copy first: the previous `url_all = url_old` aliased the caller's list,
  # so appending and shuffling silently modified the argument
  url_all = list(url_old)
  url_new = list(g.search("women's health " + keyword, stop=1000, lang='en'))  # fresh search results

  # domains already represented in the current list of links
  unique_domains = {urlparse(url).netloc for url in url_all}

  # A link that is already in url_all necessarily has its domain in
  # unique_domains, so the domain test alone also rejects exact duplicates
  # (no separate linear `in url_all` scan is needed).
  for url in url_new:
    domain = urlparse(url).netloc
    if domain not in unique_domains:
      unique_domains.add(domain)
      url_all.append(url)

  random.shuffle(url_all)  # shuffle so the links are not in any particular order
  return url_all

In [None]:
# function to overwrite a txt file with the date/time header and the updated links
# input: string title to locate the txt file, string content which is the updated links to populate the txt file
# output: none

def writeFile(title, content):
  """Replace the contents of '<title> Article Links.txt' with date, time and `content`.

  Reads the notebook-level globals `date` and `time` (the preformatted
  header lines). Opening in 'w' mode already empties the file, and the
  `with` block closes it, so no explicit truncate/close is needed.
  """
  path = '/content/drive/My Drive/' + title + ' Article Links.txt'
  with open(path, 'w') as f:
    # header lines first, then the links string
    for chunk in (date, time, content):
      f.write(chunk)