#### Installations and imports

In [1]:
#!pip install spacy

In [2]:
#!python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_lg

In [1]:
import pandas as pd
import nltk
import spacy
import csv

In [2]:
#nlp = spacy.load('en_core_web_sm')
nlp = spacy.load('en_core_web_lg')

In [13]:
news = pd.read_csv('../files/sentences/c01_all_articles_raw.csv', index_col=0)

In [14]:
news.head(100)

Unnamed: 0,url,headline,raw_article
0,https://www.investing.com/news/stock-market-ne...,Wall Street ends lower after Fed minutes,"By Lewis Krauskopf, Bansari Mayur Kamdar and P..."
1,https://www.investing.com/news/stock-market-ne...,"Aeropostale shares dip 5%, amid plans to elimi...","Investing.com -- Aeropostale Inc (N:ARO), a ..."
2,https://www.investing.com/news/stock-market-ne...,MetLife plans to split off U.S. retail busines...,"(Reuters) - MetLife Inc (N:MET), the largest U..."
3,https://www.investing.com/news/stock-market-ne...,Shares in CSX fall 2% after revenue miss in fo...,Investing.com -- Shares in CSX Corporation (O:...
4,https://www.investing.com/news/stock-market-ne...,Shire says internal synergy goals from Baxalta...,(Reuters) - The chief executive of rare diseas...
...,...,...,...
95,https://www.investing.com/news/stock-market-ne...,"Wall Street loses gains as health stocks, oil ...",By Abhiram Nandakumar(Reuters) - Wall Street g...
96,https://www.investing.com/news/stock-market-ne...,Greece stocks lower at close of trade; Athens ...,Investing.com – Greece stocks were lower after...
97,https://www.investing.com/news/stock-market-ne...,Israel stocks higher at close of trade; TA 25 ...,Investing.com – Israel stocks were higher afte...
98,https://www.investing.com/news/stock-market-ne...,"Detroit's auto industry is changed, but not as...",By David Shepardson and Paul IngrassiaDETROIT ...


In [15]:
print(len(news), 'articles')

20856 articles


In [16]:
# inspect missing values

news.isna().sum()

url               0
headline          0
raw_article    3372
dtype: int64

In [17]:
# drop missing values

news = news.dropna()

#### Some generic cleaning of text...

In [18]:
def clean_up(raw_article):
    if type(raw_article) == "str":
        raw_article = raw_article.replace("–", "-")   # replace the longer slash in investing.com articles
    return raw_article

In [19]:
news['raw_article'] = news['raw_article'].apply(clean_up)

#### Extract Source (e.g., Reuters, Investing.com, IBT ...)

In [20]:
def extract_source(raw_article):
    first_sent = raw_article.split('-')[0]
    author = "Unknown"
    if "Reuters" in first_sent:
        author = "Reuters"
    elif "Investing.com" in first_sent:
        author = "Investing.com"
    elif "IBT" in first_sent:
        author = "IBT"
    return author

In [21]:
news['source'] = news['raw_article'].apply(extract_source)

In [22]:
news['source'].value_counts()

Investing.com    9348
Reuters          7043
IBT               922
Unknown           171
Name: source, dtype: int64

#### Seperate article string

In [23]:
def extract_article(raw_article):
    return "".join(raw_article.split('-')[1:])

In [24]:
news['article'] = news['raw_article'].apply(extract_article)

In [25]:
# filter out only reuters articles

news = news[news.source == 'Reuters']

In [26]:
news['doc'] = news['article'].apply(nlp)

In [27]:
def splitup_sentences(doc):
    return list(doc.sents)

In [28]:
news['sentences'] = news['doc'].apply(splitup_sentences)

#### Store sentences in their own dataset

In [29]:
sent_dict = {}

sent_dict['sentences'] = []
sent_dict['source'] = []
sent_dict['article_id'] = []

for i, row in news.iterrows():
    for sentence in row['sentences']:
        if len(sentence) >= 5 and len(sentence) <= 50:
            sent_dict['sentences'].append(str(sentence))
            sent_dict['source'].append(row['source'])
            sent_dict['article_id'].append(i)

In [30]:
sent_df = pd.DataFrame(sent_dict)

In [31]:
sent_df['sentences'][50:100]

50    The state said VW's proposed fix was "incomple...
51    It also said the proposal could not be impleme...
52    California sent VW a confidential letter offer...
53    The state said it will continue its investigat...
54    then, Volkswagen has had constructive discussi...
55    VW has admitted using software that circumvent...
56    The state did not assess any immediate penalti...
57    VW CEO Matthias Muller is meeting with U.S. En...
58    EPA said in a statement it agrees with Califor...
59     EPA has conveyed this to the company previously.
60    Connecticut Attorney General George Jepsen cal...
61    "The time for empty apologies and hollow pledg...
62    "VW officials have expressed optimism they wil...
63    They face a separate Feb. 2 deadline to submit...
64    Separately, Tennessee Gov. Bill Haslam said he...
65    "We obviously have a keen interest in getting ...
66    The company said on Tuesday the two units will...
67    Norfolk said the move will affect manageme

In [32]:
# clean up some general noise

def clean_noisy_sents(sentence):
    sentence.replace("(Reuters)", "")
    sentence.replace("(, Reuters, )", "")
    sentence.replace('\"\"\"', r'\"')
    sentence.replace('\"\"', r'\"')
    sentence.replace(",,", ",")
    sentence.replace(",,,", ",")
    sentence.replace(",  ,", ",")
    sentence.strip()
    return sentence

In [33]:
sent_df['sentences'].apply(clean_noisy_sents)

0          Wall Street's main indexes fell on Wednesday,...
1         The techheavy Nasdaq logged a decline of over ...
2         Minutes of the Fed's March 1516 meeting showed...
3         Wall Street's main indexes already had been so...
4         The Dow Jones Industrial Average fell 144.67 p...
                                ...                        
110201    "They are clearly doing a good job making bran...
110202    "Operating income fell by $84 million at the A...
110203    At Disney's theme parks, higher guest spending...
110204    The unit is expected to post an operating loss...
110205    He said the loss will be comparable to the uni...
Name: sentences, Length: 110206, dtype: object

In [34]:
sent_df.head(5)

Unnamed: 0,sentences,source,article_id
0,"Wall Street's main indexes fell on Wednesday,...",Reuters,0
1,The techheavy Nasdaq logged a decline of over ...,Reuters,0
2,Minutes of the Fed's March 1516 meeting showed...,Reuters,0
3,Wall Street's main indexes already had been so...,Reuters,0
4,The Dow Jones Industrial Average fell 144.67 p...,Reuters,0


In [35]:
len(sent_df)

110206

In [38]:
# save the extracted sentences dataframe

sent_df.to_csv('../files/sentences/c02_reuters_sentence_pool.csv')

### Sampling

- toDO: make this into a notebook for itself

In [30]:
# save a subset of 200 instances for experiments

sent_sample = sent_df.sample(n=200, random_state=42)

In [31]:
sent_sample.head()

Unnamed: 0,sentences,source,article_id
36565,WPP has made a big bet on data with its Kantar...,Reuters,7986
5061,There is no indication that Fiat Chrysler inte...,Reuters,1130
43041,"As a group, their capex plans are down by $21....",Reuters,9479
98691,"This not only costs Microsoft in lost revenue,...",Reuters,15269
92439,"""We think the 801 multiplier is disingenuous, ...",Reuters,14762


In [32]:
#sent_sample['sentences'].to_csv('reuters_sample_03.txt', index=False, header=None, sep='\n', quoting=csv.QUOTE_NONE, quotechar="",  escapechar="")

In [33]:
# save sample to csv file

sent_sample.to_csv('reuters_sample200.csv')

In [34]:
len(sent_df)

110206

In [35]:
# take previous sample out of the database
sent_df2 = sent_df.drop(sent_sample.index)

In [36]:
# get a new sample

sent_sample2 = sent_df2.sample(n=800, random_state=42)

In [37]:
sent_sample2.to_csv('reuters_sample800_r2.csv')