In [34]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy import stats
import category_encoders as cat
from sklearn.preprocessing import OneHotEncoder

### Providing New Column Headers

In [35]:
cols = ['unit_id', 'golden', 'state', 'trusted_judgements', 'latest_judgement', 'sentiment', 'confidence',
        'date', 'id', 'query', 'sentiment_gold', 'text']

In [36]:
rawData = open("Datasets/train_apple.csv", encoding = 'latin-1').read()
rawData[0:500]

'_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,sentiment,sentiment:confidence,date,id,query,sentiment_gold,text\n623495513,TRUE,golden,10,,3,0.6264,Mon Dec 01 19:30:03 +0000 2014,5.40E+17,#AAPL OR @Apple,"3\nnot_relevant",#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx\n623495514,TRUE,golden,12,,3,0.8129,Mon Dec 01 19:43:51 +0000 2014,5.40E+17,#AAPL OR @Apple,"3\n1","RT @JPDesloges: Why AAPL Stock Had a Mini-Flash Crash Today $AAPL #aapl\nhttp://t.co/hGFcjYa0E9"\n6'

### Reading the file

In [37]:
pd.set_option('display.max_colwidth', 100)

trainCol = pd.read_csv('Datasets/train_apple.csv',names = cols, encoding='latin-1', header = None)
testCol = pd.read_csv('Datasets/test_apple.csv', names = cols, encoding='latin-1')

In [38]:
x_mod = trainCol

In [39]:
x_mod[0:3]

Unnamed: 0,unit_id,golden,state,trusted_judgements,latest_judgement,sentiment,confidence,date,id,query,sentiment_gold,text
0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,sentiment,sentiment:confidence,date,id,query,sentiment_gold,text
1,623495513,TRUE,golden,10,,3,0.6264,Mon Dec 01 19:30:03 +0000 2014,5.40E+17,#AAPL OR @Apple,3\nnot_relevant,#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx
2,623495514,TRUE,golden,12,,3,0.8129,Mon Dec 01 19:43:51 +0000 2014,5.40E+17,#AAPL OR @Apple,3\n1,RT @JPDesloges: Why AAPL Stock Had a Mini-Flash Crash Today $AAPL #aapl\nhttp://t.co/hGFcjYa0E9


### Dropping columns which are not required

In [40]:
x_mod = x_mod.drop(['unit_id', 'golden', 'state', 'trusted_judgements', 'latest_judgement', 'confidence',
      'date', 'id', 'sentiment_gold', 'query','sentiment'], axis = 1)

In [41]:
x_mod.shape

(3199, 1)

In [42]:
x_mod.drop([0])

Unnamed: 0,text
1,#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx
2,RT @JPDesloges: Why AAPL Stock Had a Mini-Flash Crash Today $AAPL #aapl\nhttp://t.co/hGFcjYa0E9
3,My cat only chews @apple cords. Such an #AppleSnob.
4,"I agree with @jimcramer that the #IndividualInvestor should own not trade #Apple #AAPL, it's ext..."
5,Nobody expects the Spanish Inquisition #AAPL
...,...
3194,"The 10 biggest differences between #Mac and #PC. Yep, I'll stick with my Mac! #apple http://t.c..."
3195,New Hearthstone Expansion 'Goblins vs Gnomes' Officially Launches [iOS Blog] http://t.co/LGJGwH1...
3196,"Happy Monday! My camera on my fancy @Apple #iPhone6Plus suddenly stopped working this weekend, s..."
3197,Apple Inc. and Facebook Clash; Neither Is Wrong http://t.co/ELe0YAQdQO #AAPL


In [47]:
x_mod.iloc[3]

text    My cat only chews @apple cords. Such an #AppleSnob.
Name: 3, dtype: object

In [48]:
import re

findings01 = re.findall('\S+',str(x_mod.iloc[3]))

In [49]:
findings01

['text',
 'My',
 'cat',
 'only',
 'chews',
 '@apple',
 'cords.',
 'Such',
 'an',
 '#AppleSnob.',
 'Name:',
 '3,',
 'dtype:',
 'object']

### Remove Punctuation

In [50]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [51]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

x_mod['text_clean'] = x_mod['text'].apply(lambda x: remove_punct(x))

x_mod.head()

Unnamed: 0,text,text_clean
0,text,text
1,#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx,AAPLThe 10 best Steve Jobs emails everhttptco82G1kL94tx
2,RT @JPDesloges: Why AAPL Stock Had a Mini-Flash Crash Today $AAPL #aapl\nhttp://t.co/hGFcjYa0E9,RT JPDesloges Why AAPL Stock Had a MiniFlash Crash Today AAPL aapl\nhttptcohGFcjYa0E9
3,My cat only chews @apple cords. Such an #AppleSnob.,My cat only chews apple cords Such an AppleSnob
4,"I agree with @jimcramer that the #IndividualInvestor should own not trade #Apple #AAPL, it's ext...",I agree with jimcramer that the IndividualInvestor should own not trade Apple AAPL its extended ...


### Tokenization 

In [52]:
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

x_mod['text_tokenized'] = x_mod['text_clean'].apply(lambda x: tokenize(x.lower()))

x_mod.head()

Unnamed: 0,text,text_clean,text_tokenized
0,text,text,[text]
1,#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx,AAPLThe 10 best Steve Jobs emails everhttptco82G1kL94tx,"[aaplthe, 10, best, steve, jobs, emails, everhttptco82g1kl94tx]"
2,RT @JPDesloges: Why AAPL Stock Had a Mini-Flash Crash Today $AAPL #aapl\nhttp://t.co/hGFcjYa0E9,RT JPDesloges Why AAPL Stock Had a MiniFlash Crash Today AAPL aapl\nhttptcohGFcjYa0E9,"[rt, jpdesloges, why, aapl, stock, had, a, miniflash, crash, today, aapl, aapl, httptcohgfcjya0e9]"
3,My cat only chews @apple cords. Such an #AppleSnob.,My cat only chews apple cords Such an AppleSnob,"[my, cat, only, chews, apple, cords, such, an, applesnob]"
4,"I agree with @jimcramer that the #IndividualInvestor should own not trade #Apple #AAPL, it's ext...",I agree with jimcramer that the IndividualInvestor should own not trade Apple AAPL its extended ...,"[i, agree, with, jimcramer, that, the, individualinvestor, should, own, not, trade, apple, aapl,..."


### Removing Stopwords

In [53]:
import nltk

stopword = nltk.corpus.stopwords.words('english')

In [54]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

x_mod['text_nostop'] = x_mod['text_tokenized'].apply(lambda x: remove_stopwords(x))

x_mod.head()

Unnamed: 0,text,text_clean,text_tokenized,text_nostop
0,text,text,[text],[text]
1,#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx,AAPLThe 10 best Steve Jobs emails everhttptco82G1kL94tx,"[aaplthe, 10, best, steve, jobs, emails, everhttptco82g1kl94tx]","[aaplthe, 10, best, steve, jobs, emails, everhttptco82g1kl94tx]"
2,RT @JPDesloges: Why AAPL Stock Had a Mini-Flash Crash Today $AAPL #aapl\nhttp://t.co/hGFcjYa0E9,RT JPDesloges Why AAPL Stock Had a MiniFlash Crash Today AAPL aapl\nhttptcohGFcjYa0E9,"[rt, jpdesloges, why, aapl, stock, had, a, miniflash, crash, today, aapl, aapl, httptcohgfcjya0e9]","[rt, jpdesloges, aapl, stock, miniflash, crash, today, aapl, aapl, httptcohgfcjya0e9]"
3,My cat only chews @apple cords. Such an #AppleSnob.,My cat only chews apple cords Such an AppleSnob,"[my, cat, only, chews, apple, cords, such, an, applesnob]","[cat, chews, apple, cords, applesnob]"
4,"I agree with @jimcramer that the #IndividualInvestor should own not trade #Apple #AAPL, it's ext...",I agree with jimcramer that the IndividualInvestor should own not trade Apple AAPL its extended ...,"[i, agree, with, jimcramer, that, the, individualinvestor, should, own, not, trade, apple, aapl,...","[agree, jimcramer, individualinvestor, trade, apple, aapl, extended, todays, pullback, good, see]"
