# Get existing entries from DB

In [2]:
import boto3
import dotenv

# Load variables from a local .env file into the process environment
# (presumably where the AWS credentials used by boto3 come from — confirm).
dotenv.load_dotenv()

client = boto3.client('dynamodb')


tableName = 'theFieldInclusiveLanguageToolLabelling'

# A single Scan call returns at most one page of results (up to 1 MB).
# Keep requesting pages while DynamoDB reports a LastEvaluatedKey so that
# tables larger than one page are not silently truncated.
response = client.scan(
    TableName=tableName,
)
items = list(response['Items'])

while 'LastEvaluatedKey' in response:
    response = client.scan(
        TableName=tableName,
        ExclusiveStartKey=response['LastEvaluatedKey'],
    )
    items.extend(response['Items'])

print(f"{len(items)} entries")

# Maps each category to the set of sentences already stored for it.
existing = {}

for item in items:
    sentence = item['sentence']['S']
    category = item['category']['S']
    existing.setdefault(category, set()).add(sentence)

for category, stored in existing.items():
    print(f"Got {len(stored)} for category '{category}'")

2022-07-07 16:39:44,137 INFO Found credentials in environment variables.
849 entries
Got 35 for category 'Special needs'
Got 35 for category 'autistic'
Got 35 for category 'autism'
Got 35 for category 'bipolar'
Got 35 for category 'High Functioning'
Got 35 for category 'Low functioning'
Got 35 for category 'ADHD'
Got 35 for category 'deaf to'
Got 35 for category 'aspergers'
Got 35 for category 'blind eye'
Got 35 for category 'upstanding'
Got 35 for category 'blindly'
Got 35 for category 'dumb'
Got 15 for category 'attention deficit hyperactivity disorder'
Got 35 for category 'psychotic'
Got 35 for category 'High-functioning'
Got 35 for category 'handicapped'
Got 35 for category 'blind'
Got 35 for category 'crippled'
Got 23 for category 'attention deficit disorder'
Got 35 for category 'OCD'
Got 35 for category 'blinded by'
Got 35 for category 'low-functioning'
Got 35 for category 'Incapacitated'
Got 1 for category 'blind-eye'
Got 35 for category 'cripple'
Got 5 for category 'ADD'


# Scrape new entries

In [3]:
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
import time
from bs4 import BeautifulSoup
import json
import re

# Search terms to scrape, one per line. Each term must appear only once:
# the scraping loop issues one search per line, so a repeated term is
# fetched and processed again for no benefit.
keywords = """deaf to
High Functioning
High-functioning
Low functioning
low-functioning
handicapped
bipolar
blind-eye
blind eye
blinded by
upstanding
Incapacitated
Special needs
aspergers
blind
dumb
psychotic
ADD
attention deficit disorder
ADHD
attention deficit hyperactivity disorder
OCD
crippled
cripple
blindly
autistic
autism
disability
disabled
paraplegic
paraplegia
quadriplegia
quadriplegic
differently abled
differently-abled
neurotypical
bipolar disorder
deaf and dumb"""

# Scraping limits applied per keyword.
MAX_SENTENCES = 60      # stop scrolling / truncate once this many sentences are held
MAX_IDLE_SCROLLS = 10   # give up after this many consecutive scrolls with nothing new

options = webdriver.FirefoxOptions()
options.headless = True
driver = webdriver.Firefox(
    options=options,
    executable_path=GeckoDriverManager().install())

# Maps each keyword to the list of sentences kept for it (at most MAX_SENTENCES).
data = {}

# Total number of sentences kept across all keywords.
overall_count = 0

try:
    # dict.fromkeys drops repeated keywords while keeping their order, so a
    # term listed twice is only scraped once.
    for keyword in dict.fromkeys(keywords.split('\n')):
        data[keyword] = []
        driver.get('https://www.quora.com/search?q="' + keyword + '"&type=answer')
        time.sleep(2)

        last_len = 0
        no_new_posts_retrial_count = 0
        # Work on a copy: adding scraped sentences to the set stored in
        # `existing` would corrupt the DB snapshot and make re-runs of this
        # cell start from a different state.
        sentences = set(existing.get(keyword, ()))

        print(f"starting with keyword '{keyword}', already have {len(sentences)} sentences")

        # All-uppercase keywords (acronyms) are matched exactly; others also
        # match their lower-case and Title Case forms. The keyword is escaped
        # so it is always treated as literal text, and the pattern is compiled
        # once per keyword instead of once per sentence.
        if keyword.isupper():
            variants = [keyword]
        else:
            variants = list(dict.fromkeys(
                [keyword, keyword.lower(), keyword.title()]))
        pattern = re.compile('|'.join(re.escape(variant) for variant in variants))

        while True:
            # Scroll to the bottom so the page loads more results.
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")

            time.sleep(1)

            # page_source is already text, so it is parsed directly.
            soup = BeautifulSoup(driver.page_source, "html.parser")

            post_texts = [
                post.get_text()
                for post in soup.find_all('span', {'class': 'qt_truncated_inline'})
            ]

            for text in post_texts:
                # Skip empty / whitespace-only posts.
                if text == '' or text.isspace():
                    continue
                for sentence in re.split('[.\n!?]', text):
                    if pattern.search(sentence):
                        sentences.add(sentence)

            if last_len != len(sentences):
                last_len = len(sentences)
                no_new_posts_retrial_count = 0
            else:
                no_new_posts_retrial_count += 1
                if no_new_posts_retrial_count >= MAX_IDLE_SCROLLS:
                    break

            if len(sentences) >= MAX_SENTENCES:
                break

        print(f"have {len(sentences)} sentences for '{keyword}'")
        # NOTE: set iteration order is arbitrary, so which sentences survive
        # the truncation is not deterministic.
        data[keyword] = list(sentences)[:MAX_SENTENCES]
        # Count what was actually kept; a keyword may end with fewer than
        # MAX_SENTENCES sentences.
        overall_count += len(data[keyword])
finally:
    # Always shut the browser down, even if scraping fails part-way.
    driver.quit()

with open('data.json', 'w') as file:
    json.dump(obj=data, fp=file)









[WDM] - Current firefox version is 102.0


2022-07-07 16:39:54,043 INFO Current firefox version is 102.0


[WDM] - Get LATEST geckodriver version for 102.0 firefox


2022-07-07 16:39:54,044 INFO Get LATEST geckodriver version for 102.0 firefox


[WDM] - Driver [C:\Users\reece\.wdm\drivers\geckodriver\win64\v0.31.0\geckodriver.exe] found in cache


2022-07-07 16:39:54,470 INFO Driver [C:\Users\reece\.wdm\drivers\geckodriver\win64\v0.31.0\geckodriver.exe] found in cache


  driver = webdriver.Firefox(


starting with keyword 'deaf to', already have 35 sentences
have 62 sentences for 'deaf to'
starting with keyword 'High Functioning', already have 35 sentences
have 63 sentences for 'High Functioning'
starting with keyword 'High-functioning', already have 35 sentences
have 52 sentences for 'High-functioning'
starting with keyword 'Low functioning', already have 35 sentences
have 60 sentences for 'Low functioning'
starting with keyword 'low-functioning', already have 35 sentences
have 37 sentences for 'low-functioning'
starting with keyword 'handicapped', already have 35 sentences
have 63 sentences for 'handicapped'
starting with keyword 'bipolar', already have 35 sentences
have 82 sentences for 'bipolar'
starting with keyword 'blind-eye', already have 1 sentences
have 1 sentences for 'blind-eye'
starting with keyword 'blind eye', already have 35 sentences
have 67 sentences for 'blind eye'
starting with keyword 'blinded by', already have 35 sentences
have 64 sentences for 'blinded by'
st