# Notebook: Clean sentences

This notebook loads in the raw Reuters sentence pool and performs some basic clean-up and filtering operations:

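- separate sentences into tokens (words and punctuation) with spaCy
- count the number of tokens per sentence
- filter out sentences with fewer than 5 or more than 50 tokens (for BERT)
- store the remaining sentences in a .txt file (one sentence per line) and the full table in a .json file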

In [1]:
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load spaCy's small English pipeline; it is used below to tokenize each sentence.
nlp = spacy.load(name='en_core_web_sm')

In [3]:
# Read the raw sentence pool; the first CSV column becomes the row index.
sent_df = pd.read_csv(
    filepath_or_buffer='reuters_sentence_pool.csv',
    index_col=0,
)

In [5]:
# Run every raw sentence through the spaCy pipeline and keep the resulting
# object per row (this is the slow step of the notebook).
sent_df['sent_nlp'] = sent_df['sentences'].map(nlp)

In [6]:
# Sentence length = len() of the parsed sentence, i.e. its number of spaCy
# tokens (punctuation tokens are included in the count).
sent_df['sent_len'] = sent_df['sent_nlp'].map(len)

In [8]:
# Keep only sentences whose length lies in the closed interval [5, 50];
# `between` is inclusive on both ends.
sent_df = sent_df.loc[sent_df['sent_len'].between(5, 50)]

In [12]:
# Expose the sentence column under the name 'text' for the exports below.
sent_df = sent_df.rename(columns={'sentences': 'text'})

In [13]:
# Display the filtered frame (sentence column now named 'text') as a sanity check.
sent_df

Unnamed: 0,text,source,article_id,sent_nlp,sent_len
0,"Wall Street's main indexes fell on Wednesday,...",Reuters,0,"( , Wall, Street, 's, main, indexes, fell, on,...",44
1,The techheavy Nasdaq logged a decline of over ...,Reuters,0,"(The, techheavy, Nasdaq, logged, a, decline, o...",16
2,Minutes of the Fed's March 1516 meeting showed...,Reuters,0,"(Minutes, of, the, Fed, 's, March, 1516, meeti...",28
3,Wall Street's main indexes already had been so...,Reuters,0,"(Wall, Street, 's, main, indexes, already, had...",42
5,The Dow Jones Industrial Average fell 144.67 p...,Reuters,0,"(The, Dow, Jones, Industrial, Average, fell, 1...",44
...,...,...,...,...,...
119827,"""They are clearly doing a good job making bran...",Reuters,17296,"("", They, are, clearly, doing, a, good, job, m...",41
119828,"""Operating income fell by $84 million at the A...",Reuters,17296,"("", Operating, income, fell, by, $, 84, millio...",34
119829,"At Disney's theme parks, higher guest spending...",Reuters,17296,"(At, Disney, 's, theme, parks, ,, higher, gues...",27
119831,The unit is expected to post an operating loss...,Reuters,17296,"(The, unit, is, expected, to, post, an, operat...",34


In [20]:
# Export the whole frame as JSON. orient='table' writes a schema block next to
# the data; default_handler=str stringifies values pandas cannot serialize
# natively (presumably the spaCy objects held in 'sent_nlp').
sent_df.to_json(
    path_or_buf='reuters_sentences.json',
    orient='table',
    default_handler=str,
)

In [27]:
# Write the cleaned sentences as plain text, one sentence per line.
#
# This deliberately bypasses DataFrame/Series.to_csv: the CSV writer applies
# quoting rules, so any sentence containing a double quote would be written
# wrapped in quotes with its inner quotes doubled ("""They are ..."), which
# corrupts the text for downstream consumers of the .txt file.
with open('reuters_sentences.txt', 'w', encoding='utf-8') as txt_file:
    for sentence in sent_df['text']:
        # Flatten embedded line breaks so each sentence stays on exactly one line.
        one_line = (
            str(sentence)
            .replace('\r\n', ' ')
            .replace('\r', ' ')
            .replace('\n', ' ')
        )
        txt_file.write(one_line + '\n')