In [100]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

## Data Processing

In [94]:
# Load the labelled dataset; later cells read its 'Text' and 'Sentiment' columns
DATA_PATH = 'stock_data.csv'
df = pd.read_csv(DATA_PATH)

In [11]:
# Preview the first rows of the raw data before any cleaning
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [12]:
# Check the class balance: count the rows for each sentiment label (1 = positive, -1 = negative)
df['Sentiment'].value_counts() # the saved output shows more positive (3685) than negative (2106) rows

Sentiment
 1    3685
-1    2106
Name: count, dtype: int64

In [23]:
# Fetch the NLTK stop-word corpus, then list the English entries so we can see
# which words would be stripped from the text during cleaning
nltk.download('stopwords')
english_stop_words = stopwords.words('english')
print(english_stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gareth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [86]:
# Text-cleaning helpers used to normalise each sentence before vectorisation

# Stop words that signify direction; they are kept in the text instead of
# being removed along with the other stop words.
_KEPT_DIRECTION_WORDS = frozenset(['above', 'below', 'up', 'down', 'over', 'under'])

# Lazily-built default stop-word set, so the NLTK corpus is read once rather
# than once per processed row.
_default_stop_words_cache = None


def _default_stop_words():
    """Return the NLTK English stop words minus the direction words (built once)."""
    global _default_stop_words_cache
    if _default_stop_words_cache is None:
        _default_stop_words_cache = (
            frozenset(stopwords.words('english')) - _KEPT_DIRECTION_WORDS
        )
    return _default_stop_words_cache


def process_text(text, stop_words=None):
    """Normalise one sentence into lower-case, space-separated words.

    Steps:
      1. lower-case the text;
      2. replace every character that is not an ASCII letter (digits,
         punctuation, symbols) with a space;
      3. drop single-character tokens wherever they occur, including at the
         start/end of the text and in consecutive runs such as "x y z";
      4. drop stop words;
      5. join the remaining words with single spaces (no leading or trailing
         whitespace).

    Parameters
    ----------
    text : str
        The raw sentence. Any non-string value (for example None or a float
        NaN standing in for missing text) yields an empty string.
    stop_words : iterable of str, optional
        Lower-case words to remove, used exactly as given. When omitted, the
        NLTK English stop words are used, except for the direction words
        'above', 'below', 'up', 'down', 'over' and 'under', which are kept.

    Returns
    -------
    str
        The cleaned sentence; may be empty.
    """
    if not isinstance(text, str):
        return ''

    blocked = _default_stop_words() if stop_words is None else frozenset(stop_words)

    # After lower-casing, anything outside a-z is punctuation, a digit or a
    # symbol; turning it into a space also splits words such as "near-monopoly".
    letters_only = re.sub('[^a-z]', ' ', text.lower())

    # Working on tokens (rather than whitespace-anchored regexes) removes every
    # single-letter word and every stop word regardless of its position.
    kept = [
        token
        for token in letters_only.split()
        if len(token) > 1 and token not in blocked
    ]

    return ' '.join(kept)

In [91]:
# Smoke-test the cleaner on one sentence containing an apostrophe, hyphens and stop words.
# NOTE(review): the saved output of this cell contains the word 'under', which does not
# occur in this string, so it was presumably produced by an earlier version of the
# cell - re-run to refresh it.
tx = "user I'd be afraid to short AMZN - and they are looking like a near-monopoly in eBooks and infrastructure-as-a-service"
process_text(tx)

'user afraid short amzn under looking like near monopoly ebooks infrastructure service'

In [95]:
# Clean every sentence; note this overwrites the raw 'Text' column in place
df['Text'] = df['Text'].apply(process_text)

In [98]:
# Preview the first rows again to check the cleaned 'Text' column
df.head()

Unnamed: 0,Text,Sentiment
0,kickers watchlist xide tit soq pnk cpw bpz aj ...,1
1,user aap movie return fea geed indicator trade...,1
2,user afraid short amzn looking like near monop...,1
3,mnta over,1
4,oi over,1


In [101]:
# Features are the cleaned sentences; the target is the sentiment label
X, y = df['Text'], df['Sentiment']

In [104]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)
# Show the size of each partition as a sanity check
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4632,), (1159,), (4632,), (1159,))

### Word Embedding

word2vec