In [24]:
import pandas as pd
import numpy as np
import re

In [25]:
dj = pd.read_csv(r"D:\machine learning intern\stocknews\stocknews\upload_DJIA_table.csv")
news = pd.read_csv(r"D:\machine learning intern\stocknews\stocknews\RedditNews.csv")

In [26]:
dj.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,7/1/2016,17924.24023,18002.38086,17916.91016,17949.36914,82160000,17949.36914
1,6/30/2016,17712.75977,17930.60938,17711.80078,17929.99023,133030000,17929.99023
2,6/29/2016,17456.01953,17704.50977,17456.01953,17694.67969,106380000,17694.67969
3,6/28/2016,17190.50977,17409.7207,17190.50977,17409.7207,112190000,17409.7207
4,6/27/2016,17355.21094,17355.21094,17063.08008,17140.24023,138740000,17140.24023


In [27]:
news.head()

Unnamed: 0,Date,News
0,7/1/2016,A 117-year-old woman in Mexico City finally re...
1,7/1/2016,IMF chief backs Athens as permanent Olympic host
2,7/1/2016,"The president of France says if Brexit won, so..."
3,7/1/2016,British Man Who Must Give Police 24 Hours' Not...
4,7/1/2016,100+ Nobel laureates urge Greenpeace to stop o...


In [28]:
dj.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
Adj Close    0
dtype: int64

In [29]:
news.isnull().sum()

Date    0
News    0
dtype: int64

In [30]:
print(dj.shape)
print(news.shape)

(1989, 7)
(73608, 2)


In [31]:
# Compare the number of unique dates. We want matching values.
print(len(set(dj.Date)))
print(len(set(news.Date)))

1989
2943


In [32]:
# Remove the extra dates that are in news
news = news[news.Date.isin(dj.Date)]

In [33]:
print(len(set(dj.Date)))
print(len(set(news.Date)))

1989
1989


In [34]:
# Calculate the difference in opening prices between the following and current day.
# The model will try to predict how much the Open value will change beased on the news.
dj = dj.set_index('Date').diff(periods=1)
dj['Date'] = dj.index
dj = dj.reset_index(drop=True)
# Remove unneeded features
dj = dj.drop(['High','Low','Close','Volume','Adj Close'], 1)

In [35]:
dj.head()

Unnamed: 0,Open,Date
0,,7/1/2016
1,-211.48046,6/30/2016
2,-256.74024,6/29/2016
3,-265.50976,6/28/2016
4,164.70117,6/27/2016


In [36]:
# Remove top row since it has a null value.
dj = dj[dj.Open.notnull()]

In [37]:
# Check if there are any more null values.
dj.isnull().sum()

Open    0
Date    0
dtype: int64

In [38]:
# Create a list of the opening prices and their corresponding daily headlines from the news
price = []
headlines = []

for row in dj.iterrows():
    daily_headlines = []
    date = row[1]['Date']
    price.append(row[1]['Open'])
    for row_ in news[news.Date==date].iterrows():
        daily_headlines.append(row_[1]['News'])
    
    # Track progress
    headlines.append(daily_headlines)
    if len(price) % 500 == 0:
        print(len(price))

500
1000
1500


In [39]:
# Compare lengths to ensure they are the same
print(len(price))
print(len(headlines))

1988
1988


In [40]:
# Compare the number of headlines for each day
print(max(len(i) for i in headlines))
print(min(len(i) for i in headlines))

25
22


In [41]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
    "mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
    "who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [42]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'0,0', '00', text) 
    text = re.sub(r'[_"\-;%()|.,+&=*%.,!?:#@\[\]]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\$', ' $ ', text)
    text = re.sub(r'u s ', ' united states ', text)
    text = re.sub(r'u n ', ' united nations ', text)
    text = re.sub(r'u k ', ' united kingdom ', text)
    text = re.sub(r'j k ', ' jk ', text)
    text = re.sub(r' s ', ' ', text)
    text = re.sub(r' yr ', ' year ', text)
    text = re.sub(r' l g b t ', ' lgbt ', text)
    text = re.sub(r'0km ', '0 km ', text)
      
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [43]:
# Clean the headlines
clean_headlines = []

for daily_headlines in headlines:
    clean_daily_headlines = []
    for headline in daily_headlines:
        clean_daily_headlines.append(clean_text(headline))
    clean_headlines.append(clean_daily_headlines)

NameError: name 'stopwords' is not defined