# Data pre-processing using Stanza Lemmatization

In [1]:
# Import libraries
import stanza
import pandas as pd
import string
import re
from tqdm import tqdm
import spacy

In [2]:
stanza.download('en')
nlp = stanza.Pipeline('en')
spacyToken = spacy.load("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 6.57MB/s]
2020-08-17 10:46:06 INFO: Downloading default packages for language: en (English)...
2020-08-17 10:46:06 INFO: File exists: C:\Users\shoeb\stanza_resources\en\default.zip.
2020-08-17 10:46:09 INFO: Finished downloading models and saved to C:\Users\shoeb\stanza_resources.
2020-08-17 10:46:09 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-08-17 10:46:11 INFO: Use device: gpu
2020-08-17 10:46:11 INFO: Loading: tokenize
2020-08-17 10:46:13 INFO: Loading: pos
2020-08-17 10:46:14 INFO: Loading: lemma
2020-08-17 10:46:14 INFO: Loading: depparse
2020-08-17 10:46:15 INFO: Loading: sentiment
2020-08-17 10:46:15 INFO: Loading: ner
2020-08-17 10:46:16 IN

> Dataset consists on 10,000 sentences. Each line has an id and a sentences.

In [3]:
df = pd.read_csv("./data/english-10k.txt", sep="\t", header=None, names=["id", "sentence"])
df.tail()

Unnamed: 0,id,sentence
9153,9996,The wounded soldiers were taken to a local hos...
9154,9997,"On Thursday, the BART Board is scheduled to co..."
9155,9998,The other problem is the lack of organs.
9156,9999,Analysts are calling for earnings of 59 cents ...
9157,10000,It's a thinking experience.


> A function to tokenize a sentence using spacy tokenizer. The function takes a sentence as an input argument

In [4]:
def tokenize(text):
    return [token.text for token in spacyToken.tokenizer(text)]

> A function to make a sentence shorter. This function takes the sentence and number of tokens a user desires in the sentence.

In [5]:
def shortSentence(text, size):
    tokens = text[:size]
    sentence = ""
    for txt in tokens:
        sentence = sentence + txt + " "
    return sentence

> The sentences in the dataset is tokenized and then made shorter. The number of tokens in each sentences is chosen 60.

In [6]:
shortText = {'sentence':[]}
for i in tqdm(range(len(df))):
    text = tokenize(df['sentence'][i])
    text = shortSentence(text, 60)
    shortText['sentence'].append(text.rstrip())
shortText = pd.DataFrame(shortText)
shortText.to_csv(r'./data/eng-shortText.txt', sep='\t', index = False)

100%|██████████| 9158/9158 [00:01<00:00, 5050.20it/s]


In [7]:
df = pd.read_csv("./data/eng-shortText.txt", sep="\t", header=None, names=["sentence"])
df.tail()

Unnamed: 0,sentence
9154,The wounded soldiers were taken to a local hos...
9155,"On Thursday , the BART Board is scheduled to c..."
9156,The other problem is the lack of organs .
9157,Analysts are calling for earnings of 59 cents ...
9158,It 's a thinking experience .


In [8]:
token_lens = []
a = 0
for txt in df.sentence:
  tokens = tokenize(txt)
  b = len(tokens)
  if b > a:
    a = b
  token_lens.append(len(tokens))
print("Maximum number of tokens in a sentence:", a)

Maximum number of tokens in a sentence: 61


> This is a function that will process the lemmatization using stanfordnlp stanza library.

In [9]:
def processLemmatization(text):
    word = ""
    lemma = ""
    punc= set(string.punctuation)
    doc = nlp(text)
    for sent in doc.sentences:
        for wrd in sent.words:
            c = str(wrd.text)
            d = str(wrd.lemma)
            if any(char in punc for char in d)==True:
                word = word.lstrip()
                lemma = lemma.lstrip()
                word += c + " "
                lemma += d + " "
            else:
                word += c + " "
                lemma += d + " "
    return word, lemma

> The new dataset with 60 tokens in each sentence will be used as input. Each sentence will be passed and create the lemmatized dataset.

In [10]:
parsed_text = {'sentence':[], 'lemma':[]}
for i in tqdm(range(len(df))):
    word, lemma = processLemmatization(df['sentence'][i])
    #extract text and lemma
    parsed_text['sentence'].append(word.rstrip())
    parsed_text['lemma'].append(lemma.rstrip())
text = pd.DataFrame(parsed_text)
# text.to_csv(r'./data/eng-lem.txt', sep='\t', index = False)
text.to_csv(r'./data/eng-lem.txt', sep='\t', index = False)

100%|██████████| 9159/9159 [12:13<00:00, 12.49it/s]


In [11]:
df = pd.read_csv("./data/eng-lem.txt", sep="\t", header=None, names=["sentence", "lemma"])
df.tail()

Unnamed: 0,sentence,lemma
9155,The wounded soldiers were taken to a local hos...,the wounded soldier be take to a local hospita...
9156,"On Thursday , the BART Board is scheduled to c...","on Thursday , the BART Board be schedule to co..."
9157,The other problem is the lack of organs .,the other problem be the lack of organ .
9158,Analysts are calling for earnings of 59 cents ...,analyst be call for earning of 59 cent a share...
9159,It 's a thinking experience .,it be a thinking experience .
