In [None]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
import re
import unicodedata

In [None]:
# Helper functions

def reject_outliers(data, m=2):
    """Keep only the samples lying within ``m`` standard deviations of the mean.

    Parameters
    ----------
    data : numpy.ndarray
        Numeric samples; must support element-wise arithmetic and
        boolean-mask indexing.
    m : float, default 2
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    The samples whose absolute deviation from the mean is strictly below
    ``m * std``. ``data`` itself is returned unchanged when it is empty or
    when its standard deviation is zero.
    """
    # Nothing to filter; also avoids NumPy's "mean of empty slice" warning
    if np.size(data) == 0:
        return data

    spread = np.std(data)
    # With zero spread (constant or single-element data) the strict '<'
    # below would reject every sample, so keep them all instead
    if spread == 0:
        return data

    return data[abs(data - np.mean(data)) < m * spread]

# Criando Database de idiomas

In [None]:
# This cell scrapes the languages available for typing and keeps the 10 most relevant ones for the study

response = get('https://www.keyhero.com/typing-statistics/', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page
response.raise_for_status()
html_soup = BeautifulSoup(response.text,'html.parser')
languages_table = html_soup.find('table', class_ = 'table table-striped table-condensed table-bordered')
if languages_table is None:
  raise ValueError('languages table not found on the typing-statistics page')

# Collect one record per language and build the DataFrame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop
language_records = []
for language in languages_table.find_all('tr')[1:]:  # first row is skipped (presumably the table header)
  name = language.a.text
  num_quotes = language.find(class_='text-right').text
  link = language.a['href']
  language_records.append({'Name': name, 'Number of Quotes' : int(num_quotes), 'Link' : link})

df_lang = pd.DataFrame(language_records, columns = ['Name', 'Number of Quotes', 'Link'])

# Sort by the number of quotes available for each language, in descending order
df_lang = df_lang.sort_values('Number of Quotes',  ascending=False)

# Keep the 10 languages with the most published quotes
df_lang = df_lang.iloc[:10]

In [None]:
# Show the selected languages, then persist them (index included) next to the notebook
langs_csv_path = "langs.csv"
print(df_lang)
df_lang.to_csv(langs_csv_path)

                             Name Number of Quotes        Link
3                         English            22433   /quotes/0
24              Español (Spanish)             1979   /quotes/2
8               Français (French)             1102   /quotes/1
16               Türkçe (Turkish)              583  /quotes/21
1               Svenska (Swedish)              304   /quotes/5
15         Português (Portuguese)              241   /quotes/3
11               Deutsch (German)              209   /quotes/6
19                Polish (Polski)              197  /quotes/16
12             Italian (Italiano)              186   /quotes/4
21  Bahasa Indonesia (Indonesian)              156   /quotes/7


# Extraindo Citações de cada um dos Idiomas

In [None]:
# For each of the 10 languages, collect the quote ids listed on the first 3 "best" pages
# (the original note expects 60 quotes per language)

# Records are accumulated in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop
quote_records = []

for _, lang in df_lang.iterrows():
  lang_link = lang['Link']
  lang_name = lang['Name']
  # The scraped links start with '/', so drop it to avoid a '//' in the URL
  url = 'https://www.keyhero.com/' + lang_link.lstrip('/') + '?best&page='
  for i in range(1, 4):
    response = get(url + str(i), timeout=30)
    # Fail loudly on an HTTP error instead of silently collecting no quotes
    response.raise_for_status()
    html_soup = BeautifulSoup(response.text, 'html.parser')
    for quote in html_soup.find_all('div' , class_ = 'rating-block'):
      # The quote index is the suffix of the block id ("rating<idx>")
      m = re.match(r"^rating(\w+)", quote.get('id', ''))
      if m is None:
        # Block without a usable id: nothing to record
        continue
      quote_records.append({'Idx': int(m[1]), 'Lang' : lang_name})
    # Wait 1 to 3 seconds between requests
    sleep(randint(1,3))

df_quotes = pd.DataFrame(quote_records, columns = ['Idx', 'Lang'])


In [None]:
# Show the collected quote ids, then persist them (index included) next to the notebook
quotes_csv_path = "quotes.csv"
print(df_quotes)
df_quotes.to_csv(quotes_csv_path)

       Idx                           Lang
0    17695                        English
1    15052                        English
2    11520                        English
3     4764                        English
4    14667                        English
..     ...                            ...
595  13604  Bahasa Indonesia (Indonesian)
596  13291  Bahasa Indonesia (Indonesian)
597  13292  Bahasa Indonesia (Indonesian)
598  13601  Bahasa Indonesia (Indonesian)
599  13605  Bahasa Indonesia (Indonesian)

[600 rows x 2 columns]


# Extraindo mais dados sobre cada citação

In [None]:
# Request the page of a quote given the quote index
def request_quote_page(index):
    """Download and parse the typing-test page of a single quote.

    Parameters
    ----------
    index : str or int
        Quote index used in the page URL; it is converted with ``str``.

    Returns
    -------
    BeautifulSoup
        The parsed HTML of the quote page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # str() lets callers pass the index as an integer as well as a string
    url = 'https://www.keyhero.com/online-typing-test/' + str(index) + '/'
    response = get(url, timeout=30)
    # Fail here with a clear HTTP error rather than while parsing an error page
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html.parser')

In [None]:
def _parse_percent(text):
    """Convert a percentage cell such as '98.5%' to a float, ignoring surrounding whitespace."""
    return float(text.strip().rstrip('%'))


def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN (without a NumPy warning) when there is nothing to average."""
    return np.mean(values) if len(values) else np.nan


def scrap_quote_page(html_soup):
    """Extract the quote text, its title and average typing statistics from a parsed quote page.

    Parameters
    ----------
    html_soup : BeautifulSoup
        Parsed quote page, as returned by ``request_quote_page``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``title``, ``quote``,
        ``average_wpm_top``, ``average_acc_top``, ``average_wpm_new`` and
        ``average_acc_new``. An average is NaN when no score is available.
    """
    # Quote text: it is not inside 'quoteinfo' but in the node right after it;
    # strip() removes the surrounding line breaks and padding
    quote_container = html_soup.find('div', class_ = 'quoteinfo')
    quote = quote_container.next_sibling.strip()
    title = html_soup.find('h3').text

    # Statistics tables
    statistics_container = html_soup.find_all('table')
    wpm_top, acc_top = [], []
    wpm_new, acc_new = [], []

    # First table: best scores. Data rows carry right-aligned cells (wpm, then accuracy)
    for row in statistics_container[0].find_all('tr'):
        cells = row.find_all('td', class_ = 'text-right')
        if cells:
            wpm_top.append(float(cells[0].text))
            acc_top.append(_parse_percent(cells[1].text))

    # Second table: most recent scores. wpm and accuracy are the 2nd and 3rd cells
    for row in statistics_container[1].find_all('tr'):
        cells = row.find_all('td')
        if cells:
            wpm_new.append(float(cells[1].text))
            acc_new.append(_parse_percent(cells[2].text))

    # Drop wpm outliers (possible bots with a much higher score)
    wpm_top = reject_outliers(np.array(wpm_top))
    wpm_new = reject_outliers(np.array(wpm_new))

    # Averages
    wpm_top_average = _mean_or_nan(wpm_top)
    acc_top_average = _mean_or_nan(acc_top)
    wpm_new_average = _mean_or_nan(wpm_new)
    acc_new_average = _mean_or_nan(acc_new)

    new_row = pd.DataFrame(
        data = [[title, quote, wpm_top_average, acc_top_average, wpm_new_average, acc_new_average]],
        columns = ['title', 'quote', 'average_wpm_top', 'average_acc_top', 'average_wpm_new', 'average_acc_new'],
    )
    return new_row

In [None]:
# Build the quotes dataset
dataset_columns = ['title', 'quote', 'average_wpm_top', 'average_acc_top', 'average_wpm_new', 'average_acc_new']

# One single-row frame per quote is collected and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic inside a loop
quote_rows = []
counter = 0
for counter, index in enumerate(df_quotes['Idx'], start=1):
    # Progress indicator: number of quotes requested so far
    print(counter)
    html_soup = request_quote_page(str(index))
    quote_rows.append(scrap_quote_page(html_soup))
    # Wait 1 to 3 seconds between one request and the next
    sleep(randint(1,3))

if quote_rows:
    dataset = pd.concat(quote_rows, ignore_index = True)
else:
    # No quote ids to process: keep an empty frame with the expected columns
    dataset = pd.DataFrame(columns = dataset_columns)

# Rows were produced in df_quotes order, so the language labels align by index
dataset['idioma'] = df_quotes['Lang']

In [None]:
# Show the scraped dataset, then persist it (index included) next to the notebook
dataset_csv_path = 'dataset.csv'
print(dataset)
dataset.to_csv(dataset_csv_path)

                                                 title  ...                         idioma
0                 How important you are. - Fred Rogers  ...                        English
1                                Alexander Graham Bell  ...                        English
2                                Forgive - Brian Weiss  ...                        English
3                              Martin Luther King, Jr.  ...                        English
4                                     Power - Bob Ross  ...                        English
..                                                 ...  ...                            ...
595           Bunga Kebun Tanjong - Muhammad Nasir Age  ...  Bahasa Indonesia (Indonesian)
596  Suamiku Jatuh Cinta Pada Jam Dinding - Arswend...  ...  Bahasa Indonesia (Indonesian)
597  Suamiku Jatuh Cinta Pada Jam Dinding - Arswend...  ...  Bahasa Indonesia (Indonesian)
598                   Tandan Sawit - Nafi'ah Al-Ma'rab  ...  Bahasa Indonesia (Indonesian)

# Extração de informações textuais das Citações

* Quantidade de caracteres (tamanho do texto)
* Tamanho médio das palavras
* Quantidade de letras maiúsculas
* Quantidade de caracteres não alfanuméricos (expandir o conceito de alfanumérico para aceitar caracteres de outras línguas, como o turco)
* Quantidade de acentos

In [None]:
# Reload the scraped dataset from disk; the first CSV column is used as the index
scraped_csv_path = "dataset.csv"
dataset_inicial = pd.read_csv(scraped_csv_path, index_col = 0)
print(dataset_inicial)

                                                 title  ...                         idioma
0                 How important you are. - Fred Rogers  ...                        English
1                                Alexander Graham Bell  ...                        English
2                                Forgive - Brian Weiss  ...                        English
3                              Martin Luther King, Jr.  ...                        English
4                                     Power - Bob Ross  ...                        English
..                                                 ...  ...                            ...
595           Bunga Kebun Tanjong - Muhammad Nasir Age  ...  Bahasa Indonesia (Indonesian)
596  Suamiku Jatuh Cinta Pada Jam Dinding - Arswend...  ...  Bahasa Indonesia (Indonesian)
597  Suamiku Jatuh Cinta Pada Jam Dinding - Arswend...  ...  Bahasa Indonesia (Indonesian)
598                   Tandan Sawit - Nafi'ah Al-Ma'rab  ...  Bahasa Indonesia (Indonesian)

In [None]:
def info_from_quote(quote):
  """Compute simple textual features of a quote.

  Parameters
  ----------
  quote : str
      The quote text. ``None`` or a float NaN (what pandas yields for an
      empty CSV cell) is treated as an empty string.

  Returns
  -------
  tuple
      ``(length, avg_word_len, upper_case, not_alpha, accent)``:
      number of characters, average word length (0.0 when the text has no
      words), number of upper-case characters, number of characters that are
      neither word characters nor whitespace, and number of combining marks
      in the NFKD-normalized text.
  """
  # NaN is the only float that differs from itself
  if quote is None or (isinstance(quote, float) and quote != quote):
    quote = ''

  # A word may contain one inner apostrophe (e.g. "don't"); \w is Unicode-aware for str patterns
  words = re.findall(r"\w+(?:'\w)?\w*", quote)
  # NFKD decomposition separates base letters from their combining accents
  nfkd_form = unicodedata.normalize('NFKD', quote)

  length = len(quote)
  # Guard against texts without any word (empty or punctuation-only)
  avg_word_len = sum(len(w) for w in words) / len(words) if words else 0.0
  upper_case = sum(1 for c in quote if c.isupper())
  not_alpha = len(re.findall(r'[^\w\s]', quote))
  accent = sum(1 for c in nfkd_form if unicodedata.combining(c) != 0)

  return length, avg_word_len, upper_case, not_alpha, accent

In [None]:
# Compute the textual features once per quote: one
# (length, avg_word_len, upper_case, not_alpha, accent) tuple per row
feature_rows = [info_from_quote(text) for text in dataset_inicial['quote']]

# Transpose the per-quote tuples into one list per feature
length, avg_word_len, upper_case, not_alpha, accent = (
    [row[position] for row in feature_rows] for position in range(5)
)

# Attach each feature as a new column, in the same order as the returned tuple
for column_name, column_values in (
    ('length', length),
    ('avg_word_len', avg_word_len),
    ('upper_case', upper_case),
    ('not_alpha', not_alpha),
    ('accent', accent),
):
    dataset_inicial[column_name] = column_values

# Show the enriched dataset and persist it (index included)
print(dataset_inicial)
dataset_inicial.to_csv("dataset_final.csv")

                                                 title  ... accent
0                 How important you are. - Fred Rogers  ...      0
1                                Alexander Graham Bell  ...      0
2                                Forgive - Brian Weiss  ...      0
3                              Martin Luther King, Jr.  ...      0
4                                     Power - Bob Ross  ...      0
..                                                 ...  ...    ...
595           Bunga Kebun Tanjong - Muhammad Nasir Age  ...      0
596  Suamiku Jatuh Cinta Pada Jam Dinding - Arswend...  ...      0
597  Suamiku Jatuh Cinta Pada Jam Dinding - Arswend...  ...      0
598                   Tandan Sawit - Nafi'ah Al-Ma'rab  ...      0
599           Bunga Kebun Tanjong - Muhammad Nasir Age  ...      0

[600 rows x 12 columns]
