In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import nltk
nltk.download('stopwords')
# from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sinaz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('../datasets/stock_news.csv')
df.head(3)

Unnamed: 0,stock,title,text,date,time,am_pm
0,AAPL,Morning Bid: Dollar surges after central bank ...,A look at the day ahead in U.S. and global mar...,2024-03-22,06:08,AM
1,AAPL,"Evercore says Apple sell-off is overdone, sees...",Apple (NASDAQ:AAPL) stock remains one of the v...,2024-03-11,16:54,PM
2,AAPL,US House passes bill to force ByteDance to div...,By David ShepardsonWASHINGTON (Reuters) -The U...,2024-03-13,06:01,AM


In [48]:
df.tail(3)

Unnamed: 0,stock,title,text,date,time,am_pm
4252,TSLA,Xiaomi takes aim at Tesla in Chinese auto mark...,By Sarah Wu and Yelin MoBEIJING (Reuters) - Ch...,2024-03-27,19:01,PM
4253,TSLA,Tesla appears unlikely to nix US suit alleging...,By Daniel Wiessner(Reuters) - A federal judge ...,2024-03-28,16:57,PM
4254,TSLA,"With China EV launch, Xiaomi's 'Thor' takes on...",By Sarah WuBEIJING (Reuters) -He was called Ch...,2024-03-29,07:19,AM


In [4]:
df_subset = pd.DataFrame()
df_subset['content'] = df['title'] + ' ' + df['text']
df_subset.head(3)

Unnamed: 0,content
0,Morning Bid: Dollar surges after central bank ...
1,"Evercore says Apple sell-off is overdone, sees..."
2,US House passes bill to force ByteDance to div...


In [53]:
apple_df = df_subset[:9]
tsla_df = df_subset[-9:]

In [60]:
#strip_handles removes personal information such as twitter handles, which don't
#contribute to emotion in the tweet. preserve_case=False converts everything to lowercase.
# tweeter = TweetTokenizer(strip_handles=True,preserve_case=False)
from nltk.tokenize import word_tokenize
# pre_trained_model_ckpt = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(pre_trained_model_ckpt)
mystopwords = set(stopwords.words("english"))

#Function to tokenize tweets, remove stopwords and numbers. 
#Keeping punctuations and emoticon symbols could be relevant for this task!
def preprocess_corpus(texts):
    def remove_stops_digits(tokens):
        #Nested function that removes stopwords and digits from a list of tokens
        return [token for token in tokens if token not in mystopwords and not token.isdigit()]
    #This return statement below uses the above function to process twitter tokenizer output further. 
    # return [remove_stops_digits(tweeter.tokenize(content)) for content in texts]
    return [remove_stops_digits(word_tokenize(content)) for content in texts]

#df_subset contains only the three categories we chose. 
mydata = preprocess_corpus(df_subset['content'])
# mydata = preprocess_corpus(tsla_df['content'])
# mydata = preprocess_corpus(apple_df['content'])


In [65]:
#Split data into train and test, following the usual process
# train_data, test_data = train_test_split(mydata,random_state=1234)

#prepare training data in doc2vec format:
train_doc2vec = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(mydata)]
#Train a doc2vec model to learn tweet representations. Use only training data!!
model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm =1, epochs=5)
model.build_vocab(train_doc2vec)
model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)
model.save("d2v.model")
print("Model Saved")

Model Saved


In [66]:
#Infer the feature representation for training and test data using the trained model
model= Doc2Vec.load("d2v.model")
#infer in multiple steps to get a stable representation. 
data_vector =  [model.infer_vector(list_of_tokens) for list_of_tokens in mydata]

import numpy as np
data_vector = np.array(data_vector)

In [68]:
import altair as alt
from sklearn.decomposition import PCA

def pca_2d(paragraph_matrix):
    pca = PCA(n_components=2)
    reduced_dims = pca.fit_transform(paragraph_matrix)
    print(pca.explained_variance_)
    
    print(f"2-component PCA, explains {sum(pca.explained_variance_):.2f}% of variance")
    df = pd.DataFrame(reduced_dims, columns=["x", "y"])
    df["component"] = np.arange(len(df))  # Assign a component number to each data point
    return df

example_2d = pca_2d(data_vector)
chart = alt.Chart(example_2d).mark_point().encode(
    x="x",
    y="y",
    color="component:N"  # Encode the component as a nominal color channel
)
chart.save('results/9999.html')

[23.800198 20.55851 ]
2-component PCA, explains 44.36% of variance
