# Data preparation and preprocessing of Test Data for evaluation

In [1]:
import pandas as pd
import csv
import numpy as np
from tqdm import tqdm
import re
import sys
sys.path.append("..")
from utils import parsingCorpus, writeFile, readFile, text_cleaner
print("Libraries imported!")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 9.38MB/s]
2020-08-29 23:28:12 INFO: Downloading default packages for language: en (English)...
2020-08-29 23:28:12 INFO: File exists: C:\Users\shoeb\stanza_resources\en\default.zip.
2020-08-29 23:28:15 INFO: Finished downloading models and saved to C:\Users\shoeb\stanza_resources.
2020-08-29 23:28:15 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-08-29 23:28:16 INFO: Use device: gpu
2020-08-29 23:28:16 INFO: Loading: tokenize
2020-08-29 23:28:18 INFO: Loading: pos
2020-08-29 23:28:19 INFO: Loading: lemma
2020-08-29 23:28:19 INFO: Loading: depparse
2020-08-29 23:28:20 INFO: Loading: sentiment
2020-08-29 23:28:21 INFO: Loading: ner
2020-08-29 23:28:21 IN

In [2]:
df = pd.read_csv("./data/dataset/dataset_coca.txt", sep='\t', usecols=[0,1], header=0, quoting=csv.QUOTE_NONE, encoding='utf-8')
print("Data imported!")

Data imported!


In [3]:
# Remove empty rows
df['Section'].replace('', np.nan, inplace=True)
df['section'].replace('', np.nan, inplace=True)
df.dropna(subset=['Section', 'section'], axis=0, inplace=True)
print("Removed empty rows")

Removed empty rows


In [4]:
dataset = np.array_split(df, 12)

In [5]:
parsingCorpus(dataset[0], "sentence", "lemma")

100%|██████████| 358496/358496 [01:54<00:00, 3142.53it/s]


Done sentences & lemma!


In [6]:
# Read the dataset with joined sentences and lemmas
sentence = pd.DataFrame(readFile("sentence.txt"), columns=["sentence"])
lemma = pd.DataFrame(readFile("lemma.txt"), columns=["lemma"])

In [7]:
# Remove empty rows
sentence['sentence'].replace('', np.nan, inplace=True)
lemma['lemma'].replace('', np.nan, inplace=True)
sentence.dropna(subset=['sentence'], axis=0, inplace=True)
lemma.dropna(subset=['lemma'], axis=0, inplace=True)
print("Removed empty rows")

Removed empty rows


In [8]:
sentence = sentence["sentence"].apply(text_cleaner)
lemma = lemma["lemma"].apply(text_cleaner)

In [9]:
sentence.to_csv("./cleaned/sentence.txt", sep='\t', index=False, encoding='utf-8')
lemma.to_csv("./cleaned/lemma.txt", sep='\t', index=False, encoding='utf-8')

In [10]:
# Import the processed dataset
sentence = pd.read_csv("./data/cleaned/sentence.txt", names=["InputText"], quoting=csv.QUOTE_NONE, encoding='utf-8')
lemma = pd.read_csv("./data/cleaned/lemma.txt", names=["OutputText"], quoting=csv.QUOTE_NONE, encoding='utf-8')

In [12]:
 df = pd.concat([sentence, lemma], axis=1, sort=False)

In [14]:
df.shape

(11991, 2)

In [15]:
# Remove sentences shorter than 5 words
df = df[df['InputText'].str.split().str.len().gt(5)]
df = df[df['InputText'].str.split().str.len().lt(50)]

In [16]:
df.shape

(10220, 2)

In [17]:
df.to_csv("./data/test/test-sample.txt", sep='\t', index=False, encoding='utf-8')