In [1]:
# Import libraries
import stanza
import pandas as pd
import string
import re
from tqdm import tqdm
import spacy

In [2]:
stanza.download('en')
nlp = stanza.Pipeline('en')
spacyToken = spacy.load("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 9.60MB/s]
2020-08-15 12:35:46 INFO: Downloading default packages for language: en (English)...
2020-08-15 12:35:47 INFO: File exists: C:\Users\shoeb\stanza_resources\en\default.zip.
2020-08-15 12:35:50 INFO: Finished downloading models and saved to C:\Users\shoeb\stanza_resources.
2020-08-15 12:35:50 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-08-15 12:35:51 INFO: Use device: gpu
2020-08-15 12:35:51 INFO: Loading: tokenize
2020-08-15 12:35:56 INFO: Loading: pos
2020-08-15 12:35:57 INFO: Loading: lemma
2020-08-15 12:35:57 INFO: Loading: depparse
2020-08-15 12:35:58 INFO: Loading: sentiment
2020-08-15 12:35:59 INFO: Loading: ner
2020-08-15 12:36:00 IN

In [3]:
df = pd.read_csv("./data/english.txt", sep="\t", header=None, names=["id", "sentence"])
# df.tail()

Unnamed: 0,id,sentence
9153,9996,The wounded soldiers were taken to a local hos...
9154,9997,"On Thursday, the BART Board is scheduled to co..."
9155,9998,The other problem is the lack of organs.
9156,9999,Analysts are calling for earnings of 59 cents ...
9157,10000,It's a thinking experience.


In [4]:
def tokenize(text):
    return [token.text for token in spacyToken.tokenizer(text)]

In [5]:
def shortSentence(text, size):
    tokens = text[:size]
    sentence = ""
    for txt in tokens:
        sentence = sentence + txt + " "
    return sentence

In [7]:
shortText = {'sentence':[]}
for i in tqdm(range(len(df))):
    text = tokenize(df['sentence'][i])
    text = shortSentence(text, 60)
    shortText['sentence'].append(text.rstrip())
shortText = pd.DataFrame(shortText)
shortText.to_csv(r'./data/eng-shortText.txt', sep='\t', index = False)

100%|██████████| 9158/9158 [00:01<00:00, 5310.28it/s]


In [9]:
df = pd.read_csv("./data/eng-shortText.txt", sep="\t", header=None, names=["sentence"])
# df.tail()

Unnamed: 0,sentence
9154,The wounded soldiers were taken to a local hos...
9155,"On Thursday , the BART Board is scheduled to c..."
9156,The other problem is the lack of organs .
9157,Analysts are calling for earnings of 59 cents ...
9158,It 's a thinking experience .


In [14]:
token_lens = []
a = 0
for txt in df.sentence:
  tokens = tokenize(txt)
  b = len(tokens)
  if b > a:
    a = b
  token_lens.append(len(tokens))
print("Maximum number of tokens in the dataset:", a)

Maximum number of tokens in the dataset: 61


In [15]:
def processLemmatization(text):
    word = ""
    lemma = ""
    punc= set(string.punctuation)
    doc = nlp(text)
    for sent in doc.sentences:
        for wrd in sent.words:
            c = str(wrd.text)
            d = str(wrd.lemma)
            if any(char in punc for char in d)==True:
                word = word.lstrip()
                lemma = lemma.lstrip()
                word += c + " "
                lemma += d + " "
            else:
                word += c + " "
                lemma += d + " "
    return word, lemma

In [16]:
parsed_text = {'sentence':[], 'lemma':[]}
for i in tqdm(range(len(df))):
    word, lemma = processLemmatization(df['sentence'][i])
    #extract text and lemma
    parsed_text['sentence'].append(word.rstrip())
    parsed_text['lemma'].append(lemma.rstrip())
text = pd.DataFrame(parsed_text)
# text.to_csv(r'./data/eng-lem.txt', sep='\t', index = False)
text.to_csv(r'./data/eng-lem.txt', sep='\t', index = False)

100%|██████████| 9159/9159 [12:26<00:00, 12.27it/s]


In [17]:
df = pd.read_csv("./data/eng-lem.txt", sep="\t", header=None, names=["sentence"])
# df.tail()

Unnamed: 0,sentence
"The wounded soldiers were taken to a local hospital , the official said .",the wounded soldier be take to a local hospita...
"On Thursday , the BART Board is scheduled to consider proposals to raise fees and charge for parking at stations throughout the system in Berkeley and Oakland .","on Thursday , the BART Board be schedule to co..."
The other problem is the lack of organs .,the other problem be the lack of organ .
"Analysts are calling for earnings of 59 cents a share . After the bell , Genentech will reveal its first - quarter results .",analyst be call for earning of 59 cent a share...
It 's a thinking experience .,it be a thinking experience .
