<a href="https://colab.research.google.com/github/stevengregori92/LearnWord2Vec/blob/main/Financial_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install umap-learn
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
import os
import pandas as pd

from tqdm.auto import tqdm
from nltk.tokenize import word_tokenize
from gensim.models import FastText

#Prepare Corpus

In [23]:
!gdown https://drive.google.com/uc?id=1oKcKWMyX4WZPNUo-tOhxn8mwkxaRxMqR
!unzip /content/stock-market.zip

Downloading...
From: https://drive.google.com/uc?id=1oKcKWMyX4WZPNUo-tOhxn8mwkxaRxMqR
To: /content/stock-market.zip
  0% 0.00/206k [00:00<?, ?B/s]100% 206k/206k [00:00<00:00, 31.9MB/s]
Archive:  /content/stock-market.zip
replace stock_data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [24]:
df = pd.read_csv('stock_data.csv')
df

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
...,...,...
5786,Industry body CII said #discoms are likely to ...,-1
5787,"#Gold prices slip below Rs 46,000 as #investor...",-1
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1


In [25]:
sentences = [word_tokenize(Text.lower()) for Text in tqdm(df.Text)]
sentences[:5]

  0%|          | 0/5791 [00:00<?, ?it/s]

[['kickers',
  'on',
  'my',
  'watchlist',
  'xide',
  'tit',
  'soq',
  'pnk',
  'cpw',
  'bpz',
  'aj',
  'trade',
  'method',
  '1',
  'or',
  'method',
  '2',
  ',',
  'see',
  'prev',
  'posts'],
 ['user',
  ':',
  'aap',
  'movie',
  '.',
  '55',
  '%',
  'return',
  'for',
  'the',
  'fea/geed',
  'indicator',
  'just',
  '15',
  'trades',
  'for',
  'the',
  'year',
  '.',
  'awesome',
  '.'],
 ['user',
  'i',
  "'d",
  'be',
  'afraid',
  'to',
  'short',
  'amzn',
  '-',
  'they',
  'are',
  'looking',
  'like',
  'a',
  'near-monopoly',
  'in',
  'ebooks',
  'and',
  'infrastructure-as-a-service'],
 ['mnta', 'over', '12.00'],
 ['oi', 'over', '21.37']]

#Train FastText Model

In [26]:
model = FastText(sentences, vector_size=128, window=5, min_count=3, workers=4, epochs=1000, sg=0, hs=0)

#Save

In [27]:
os.makedirs("model/fasttext/", exist_ok=True)

In [28]:
model.save('model/fasttext/financial_statement.fasttext')

#Load

In [29]:
model = FastText.load('model/fasttext/financial_statement.fasttext')

#Model Information

In [30]:
fasttext = model.wv

In [31]:
fasttext.index_to_key

['.',
 ',',
 'the',
 'to',
 ':',
 'a',
 'on',
 'in',
 'of',
 'for',
 'aap',
 'and',
 'is',
 '!',
 '-',
 'https',
 'user',
 'it',
 'i',
 'at',
 'this',
 '%',
 '?',
 'short',
 'up',
 '...',
 'will',
 'from',
 'with',
 'over',
 'here',
 'today',
 ')',
 'be',
 "'s",
 '#',
 'that',
 'as',
 'out',
 'volume',
 'day',
 'are',
 'like',
 'but',
 '(',
 'long',
 'if',
 'now',
 'not',
 '&',
 'you',
 'has',
 'good',
 'stock',
 'more',
 'my',
 'some',
 'goog',
 'above',
 'new',
 'bac',
 'watch',
 'stop',
 'have',
 'still',
 'down',
 'nice',
 'we',
 'back',
 'buy',
 'next',
 'move',
 'after',
 'higher',
 'by',
 'off',
 'coronavirus',
 'so',
 'just',
 'see',
 "n't",
 'no',
 'an',
 'market',
 'ong',
 '....',
 'one',
 'sensex',
 'triangle',
 'was',
 'time',
 'week',
 'or',
 'trade',
 'stocks',
 'all',
 'close',
 'nifty',
 'do',
 ';',
 'its',
 'weekly',
 'could',
 '..',
 '@',
 'looking',
 'break',
 'big',
 'support',
 'breakout',
 'go',
 'again',
 'going',
 'nfx',
 'bullish',
 'last',
 'looks',
 'green',


In [32]:
fasttext.vectors

array([[ 1.4296336 , -1.3984237 , -1.3092039 , ..., -0.34313765,
         0.59211916,  2.5691922 ],
       [-1.6349065 , -2.4268675 ,  1.8750215 , ..., -0.19044943,
        -2.066842  ,  2.1274717 ],
       [ 0.09221601, -0.10267656,  2.2214844 , ...,  0.44309106,
         2.0613873 ,  0.8491145 ],
       ...,
       [-0.06407233, -2.0669057 , -2.9809477 , ..., -1.4433708 ,
         1.2479845 , -1.5194942 ],
       [ 0.03043609, -0.47189644,  0.49435937, ...,  0.6068292 ,
         0.16924256,  2.6082683 ],
       [-0.00414666, -0.3759518 ,  1.9122752 , ...,  1.6064292 ,
        -3.1580055 ,  0.10691867]], dtype=float32)

In [33]:
fasttext.vector_size

128

In [34]:
fasttext['like']

array([ 2.9218674 , -1.2368832 , -1.745289  ,  0.23669863,  4.051867  ,
        2.2277646 ,  1.0276012 ,  3.412448  ,  0.36770007, -2.1418076 ,
        2.7500014 ,  2.5693922 ,  0.8889851 ,  2.0212824 ,  2.4408553 ,
        2.8243487 , -1.53605   ,  0.2709373 , -1.0164429 ,  2.238883  ,
        0.31928083,  3.41229   , -4.019627  , -2.7521472 ,  2.522693  ,
        4.456901  ,  0.20838545, -3.0237463 , -1.0642396 , -3.559357  ,
        1.4130145 ,  1.4753456 , -0.80191714, -2.2517717 , -1.5980053 ,
       -0.3120053 ,  1.3962473 , -4.1019135 , -0.09790934,  1.4266835 ,
        1.5762168 ,  2.595643  , -2.242879  ,  0.60116005, -2.1719337 ,
        0.43942878, -0.87141997, -1.4810735 , -3.1534727 ,  0.87419283,
       -0.06062881, -6.3172045 , -3.0467188 ,  1.092666  , -3.6290987 ,
        1.9120644 , -0.34927776,  2.6241014 , -0.22999547, -0.6672199 ,
        0.38152555, -1.5476975 , -1.0260456 , -0.7975958 ,  1.9225408 ,
       -0.7281983 , -0.60203207, -3.358653  ,  3.9328222 , -2.53

#Sanity Check

similar word

In [35]:
fasttext.similar_by_word('stock', topn=5)

[('share', 0.3295898735523224),
 ('right', 0.3203554153442383),
 ('soon', 0.31961461901664734),
 ('gone', 0.302438884973526),
 ('that', 0.29957103729248047)]

higher order visualization

In [36]:
from umap import UMAP
import numpy as np
import pandas as pd
import plotly.express as px

In [37]:
X = UMAP().fit_transform(fasttext.vectors)

In [38]:
df = pd.DataFrame(X, columns = ['umap1', 'umap2'])
df['text'] = fasttext.index_to_key

In [39]:
df

Unnamed: 0,umap1,umap2,text
0,1.823165,2.746266,.
1,1.999296,2.827330,","
2,2.000035,2.435148,the
3,1.863061,2.686481,to
4,3.943307,1.375477,:
...,...,...,...
3267,4.595441,-0.449213,purchases
3268,1.146605,0.676178,doubled
3269,1.565851,-1.127857,ops
3270,1.093157,-2.184446,86


In [40]:
fig = px.scatter(df, x='umap1', y='umap2', text='text')
fig.update_traces(textposition='top center')
fig.update_layout(
    height = 800,
    title_text = 'Reduced FastText Visualization'
)
fig.show()