In [75]:
import langdetect
import nltk
import gutenberg
from gutenberg.query import get_etexts
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
import re
from os import listdir
from os.path import isfile, join
import pandas as pd
caps = "([A-Z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

In [16]:
def get_language_stopword(input_text):
    input_words = nltk.wordpunct_tokenize(input_text)

    likelihood = {}
    for language in nltk.corpus.stopwords._fileids:
        likelihood[language] = len(set(input_words) & set(nltk.corpus.stopwords.words(language)))
    language = sorted(likelihood, key=likelihood.get, reverse=True)[0]
    likelihood = {k:v for k, v in likelihood.items() if v}
    return(language, likelihood)

In [152]:
def get_language_google(input_text):
    """
    This function uses Google's package 'langdetect' to detect the language
    of a sentence.
    """
    likelihood = {}
    for item in langdetect.detect_langs(input_text):
        likelihood[item.lang] = item.prob
    return(likelihood)

In [18]:
eng_test_string = "This is an English test string for the function."
span_test_string = "Esta es una cadena de prueba en inglés para la función." 
port_test_string = "Esta é uma string de teste em inglês para a função."

In [19]:
eng_x, eng_y = get_language_stopword(eng_test_string)
span_x, span_y = get_language_stopword(span_test_string)
port_x, port_y = get_language_stopword(port_test_string)

In [20]:
print(eng_y)
print(eng_x)
print(span_y)
print(span_x)
print(port_y)
print(port_x)

{'dutch': 1, 'norwegian': 1, 'english': 4, 'danish': 1, 'portuguese': 1, 'german': 1}
english
{'italian': 2, 'danish': 2, 'norwegian': 2, 'portuguese': 2, 'spanish': 6, 'dutch': 2, 'turkish': 2, 'hungarian': 1, 'finnish': 1, 'french': 4, 'swedish': 2, 'german': 1}
spanish
{'spanish': 3, 'italian': 1, 'turkish': 1, 'norwegian': 1, 'french': 1, 'english': 1, 'dutch': 1, 'danish': 1, 'hungarian': 2, 'portuguese': 5, 'swedish': 1}
portuguese


In [21]:
eng_google = get_language_google(eng_test_string)
span_google = get_language_google(span_test_string)
port_google = get_language_google(port_test_string)

In [22]:
print(eng_google)
print(span_google)
print(port_google)

en
es
pt


In [23]:
def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: 
        text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + caps + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

In [214]:
print(get_etexts('title', 'Moby Dick'))

frozenset()


In [215]:
from gutenberg.acquire import get_metadata_cache

ImportError: cannot import name 'get_metadata_cache'

In [36]:
from gutenberg.query import get_metadata

In [216]:
get_metadata('title', 52336)

frozenset()

In [71]:
data_path = '../data/'
files = [f for f in listdir(data_path) if isfile(join(data_path, f))]

In [66]:
d = {}
for item in langdetect.detect_langs(span_test_string+eng_test_string):
    d[item.lang] = item.prob

In [217]:
text = strip_headers(load_etext(52336)).strip()

INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us
INFO:requests.packages.urllib3.connectionpool:Starting new HTTP connection (1): www.gutenberg.lib.md.us


In [80]:
text[0:100]

"MOBY DICK; OR THE WHALE\n\nBy Herman Melville\n\n\n\n\nOriginal Transcriber's Notes:\n\nThis text is a combin"

In [84]:
def my_encoder(self, my_string):
    try:
        return unicode(my_string).encode()
    except UnicodeDecodeError:
        return 'DecodeError'.encode()

In [189]:
with open('../data/candide.txt') as f:
    content = f.readlines()

In [96]:
all_text = " ".join(content)

In [219]:
sentences = split_into_sentences(text)

In [223]:
cleaned_sentences = []
for sentence in sentences:
    words = sentence.strip().split(" ")
    if len(words) >= 2:
        cleaned_sentences.append(" ".join(words))

In [224]:
df = pd.DataFrame(cleaned_sentences, columns=['Sentences'])

In [225]:
df['google_language'] = df['Sentences'].apply(lambda x: get_language_google(x))

In [230]:
df['google_language'][0]

{'fi': 0.9999972065764102}

In [195]:
((139*6)/5)*1.2

200.16

In [199]:
test = pd.read_csv('../Code/sentence_langs.csv')

In [208]:
test = test[list(test.columns[1:])]

In [211]:
test.head()

Unnamed: 0,sentence,num_words,google_language,book_name
0,"?The Project Gutenberg EBook of Candide, by Vo...",42,{'en': 0.9999959904510038},candide
1,"You may copy it, give it away or re-use it und...",28,{'en': 0.9999967884658534},candide
2,"If you are not located in the United States, y...",27,{'en': 0.9999968148627227},candide
3,Title: Candide Author: Voltaire Translat...,93,"{'et': 0.7142846803585882, 'fi': 0.28571493117...",candide
4,VOLTAIRE Suom.,7,"{'pt': 0.14285637648722269, 'fi': 0.8571400997...",candide


In [213]:
test['google_lanuga']

sentence           Oy Weilin & G??s kirjapainossa.
num_words                                        5
google_language         {'fi': 0.9999965275396452}
book_name                                  candide
Name: 6, dtype: object