In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from tqdm import tqdm
# Show every column when a DataFrame is displayed (None removes the limit).
pd.options.display.max_columns = None

In [None]:
# Load the stock price history; later cells use its date, open, close and
# Name (ticker) columns.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/sandp500/all_stocks_5yr.csv")
df.iloc[:100]

In [None]:
# Per-company reference data; later cells use its Symbol, Name and Sector columns.
company_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/sp-500-companies-with-financial-information/financials.csv"
)

company_df.head(n=5)

In [None]:
# Inner-join prices to company data on the ticker (df.Name <-> company_df.Symbol).
# Both frames carry a "Name" column, so the result exposes them as Name_x
# (ticker, from df) and Name_y (from company_df).
merged_df = pd.merge(df, company_df, left_on="Name", right_on="Symbol")
merged_df.head(n=5)

In [None]:
# Missing-value count per column after the merge.
merged_df.isna().sum()

In [None]:
# Missing-value count per column in the raw price data, for comparison.
df.isna().sum()

## Sentence-Level Data Preparation

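Each trading date is treated as one "sentence": the tickers recorded on that date, ordered by their relative open-to-close move (largest first).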

In [None]:
# Distinct dates in the merged data, in order of first appearance.
DATES = merged_df["date"].unique()

In [None]:
# Relative intraday move for every row: (close - open) / open.
merged_df["close_minus_open"] = (
    merged_df["close"].sub(merged_df["open"]).div(merged_df["open"])
)
merged_df.head(n=10)

In [None]:
# Rows for the first date in DATES, largest relative move first.
data_1day = (
    merged_df.loc[merged_df["date"].eq(DATES[0])]
    .sort_values(by="close_minus_open", ascending=False)
)
data_1day

In [None]:
# Space-separated tickers for that day, in the sorted order above.
sent_1day = " ".join(data_1day["Name_x"].tolist())
sent_1day

In [None]:
# Build one "sentence" per date: that day's tickers joined by spaces, ordered
# from the largest to the smallest relative open-to-close move.
#
# A single groupby pass replaces re-scanning the whole frame with a boolean
# mask once per date. sort=False yields the groups in order of first
# appearance, which is the same order merged_df.date.unique() produces.
SENTENCES = []

for date, data_1day in tqdm(merged_df.groupby("date", sort=False)):
    data_1day = data_1day.sort_values("close_minus_open", ascending=False)
    sent_1day = " ".join(data_1day.Name_x)

    SENTENCES.append(sent_1day)

## Model Training

In [None]:
import gensim.models

In [None]:
# Tokenise every sentence on whitespace: one list of tickers per date.
corpus = list(map(str.split, SENTENCES))
corpus

In [None]:
# Train 150-dimensional ticker embeddings with 100 passes over the corpus and
# a maximum context distance of 10 tokens.
#
# Later cells hard-code tickers taken from the similarity queries, so the
# training run needs to be repeatable. gensim's default multi-threaded
# training is not deterministic; workers=1 removes the thread-ordering jitter
# (at the cost of slower training). seed=1 is gensim's default, written out
# explicitly. NOTE(review): the gensim docs also mention fixing PYTHONHASHSEED
# for reproducibility across interpreter launches - confirm whether that is
# needed for the installed version.
model = gensim.models.Word2Vec(
    sentences=corpus,
    vector_size=150,
    epochs=100,
    window=10,
    workers=1,
    seed=1,
)

In [None]:
# Persist the trained model in the notebook's output directory.
model.save("/kaggle/working/model")
# To reload it later: new_model = gensim.models.Word2Vec.load("/kaggle/working/model")

In [None]:
# Ten tickers whose embeddings lie closest to JNJ's
# (similarity learned from the daily orderings).

model.wv.most_similar(positive=["JNJ"], topn=10)

In [None]:
# Least similar stocks to JNJ (based on daily behaviour)
#
# Results come back sorted from most to least similar, so the last ten entries
# are the least similar ones - but only if the whole vocabulary was requested.
# Asking for len(model.wv) neighbours guarantees that, whatever the number of
# tickers the model actually learned (a fixed 505 would not).

model.wv.similar_by_word("JNJ", topn=len(model.wv))[-10:]

## Validation

In [None]:
import matplotlib.pyplot as plt

In [None]:
## Most similar to JNJ from Stock2Vec

# Opening-price history of JNJ next to PM. PM is hard-coded here, presumably
# taken from the "most similar" query above - re-check after retraining.
#
# Each ticker is plotted against its OWN dates: reusing JNJ's dates for the
# other series fails (or misaligns) whenever the two histories differ in
# length. Dates are converted to datetimes so the x-axis is a real time axis
# rather than one categorical tick per date string.
fig, ax = plt.subplots()
for ticker in ("JNJ", "PM"):
    history = df[df.Name == ticker]
    ax.plot(pd.to_datetime(history.date), history.open, label=ticker)

ax.set_xlabel("date")
ax.set_ylabel("open")
ax.legend()
plt.show()

In [None]:
## Least similar to JNJ from Stock2Vec

# Opening-price history of JNJ next to APA. APA is hard-coded here, presumably
# taken from the "least similar" query above - re-check after retraining.
#
# Each ticker is plotted against its OWN dates: reusing JNJ's dates for the
# other series fails (or misaligns) whenever the two histories differ in
# length. Dates are converted to datetimes so the x-axis is a real time axis
# rather than one categorical tick per date string.
fig, ax = plt.subplots()
for ticker in ("JNJ", "APA"):
    history = df[df.Name == ticker]
    ax.plot(pd.to_datetime(history.date), history.open, label=ticker)

ax.set_xlabel("date")
ax.set_ylabel("open")
ax.legend()
plt.show()

## Visualisation of Stock2Vec (This part is from a website)

In [None]:
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
import plotly.express as px

from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


In [None]:
def reduce_dimensions(model, num_dimensions = 2):
    """Project the model's word vectors into ``num_dimensions`` with t-SNE.

    Parameters
    ----------
    model
        Trained model exposing ``model.wv.vectors`` (the embedding matrix) and
        ``model.wv.index_to_key`` (the matching vocabulary).
    num_dimensions : int, default 2
        Number of output axes (2D, 3D, etc). Any value accepted by ``TSNE``
        works; previously everything except 2 and 3 silently returned ``None``.
        NOTE(review): scikit-learn's default Barnes-Hut t-SNE is documented as
        supporting fewer than 4 components - confirm before requesting more.

    Returns
    -------
    tuple
        One list of coordinates per output axis, followed by the numpy array
        of labels: ``(x_vals, y_vals, labels)`` for 2 dimensions,
        ``(x_vals, y_vals, z_vals, labels)`` for 3, and so on.
    """
    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE; random_state is pinned so the layout is repeatable
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    embedded = tsne.fit_transform(vectors)

    # one coordinate list per axis, in axis order (x, y, z, ...)
    axis_vals = [[point[axis] for point in embedded] for axis in range(num_dimensions)]
    return (*axis_vals, labels)

In [None]:
x_vals, y_vals, labels = reduce_dimensions(model)

# Look up company name and sector for every embedded ticker in one indexed
# pass instead of scanning company_df once per label.
# - drop_duplicates keeps the first record per symbol, matching the previous
#   "first match" behaviour.
# - reindex returns rows in label order and yields NaN (instead of raising)
#   for a ticker that has no company record.
company_info = (
    company_df.drop_duplicates("Symbol")
    .set_index("Symbol")
    .reindex(labels)
)

display_df = pd.DataFrame({
    "x": x_vals,
    "y": y_vals,
    "symbols": labels,
    "names": company_info["Name"].to_numpy(),
    "sectors": company_info["Sector"].to_numpy(),
})
display_df.head()

In [None]:
# Interactive 2-D scatter of the projected embeddings, one colour per sector;
# hovering over a point shows its symbol, company name and sector.
fig = px.scatter(
    display_df,
    x="x",
    y="y",
    color="sectors",
    hover_data=['symbols', 'names', "sectors"],
)
fig.show()

# Keep a standalone HTML copy in the notebook's output directory.
fig.write_html('/kaggle/working/stock-embedding-plot-2d.html')
# Unused options left for reference: symbol="symbols", size='Price/Earnings'

In [None]:
x_vals, y_vals, z_vals, labels = reduce_dimensions(model, num_dimensions=3)

# Look up company name and sector for every embedded ticker in one indexed
# pass instead of scanning company_df once per label.
# - drop_duplicates keeps the first record per symbol, matching the previous
#   "first match" behaviour.
# - reindex returns rows in label order and yields NaN (instead of raising)
#   for a ticker that has no company record.
company_info = (
    company_df.drop_duplicates("Symbol")
    .set_index("Symbol")
    .reindex(labels)
)

display_df = pd.DataFrame({
    "x": x_vals,
    "y": y_vals,
    "z": z_vals,
    "size": 5,  # constant marker size, broadcast to every row
    "symbols": labels,
    "names": company_info["Name"].to_numpy(),
    "sectors": company_info["Sector"].to_numpy(),
})
display_df.head()

In [None]:
# Interactive 3-D scatter of the projected embeddings, one colour per sector;
# marker size comes from the constant "size" column.
fig = px.scatter_3d(
    display_df,
    x="x",
    y="y",
    z='z',
    color="sectors",
    hover_data=['symbols', 'names', "sectors"],
    size='size',
)
fig.show()

# Keep a standalone HTML copy in the notebook's output directory.
fig.write_html('/kaggle/working/stock-embedding-plot-3d.html')
# Unused options left for reference: symbol="symbols", size='Price/Earnings'