Pre-text  processing to remove number of features:
1) convert all text to lowercase
2) remove any whitespace
3) remove stop-word
4) stem all words
5) lemmatize all words

In [21]:
#Load libraries
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import spacy
import textblob

In [22]:
#load data set into Pandas
df_corpus = pd.read_csv("C:/Users/Ryanw/Desktop/TFM/GoldmanSachs_2018_2022.csv")


In [None]:
#Remove Mentions '@' and "#" and URLs from tweets
import re

#Function to remove mentions
def remove_regExp(text):
    text = re.sub('@[\w]+','',text) #remove users
    text = re.sub(r"\S*https?:\S*",'', text) #remove URLs
    text = re.sub('#[\w]+','',text) #remove hastags
    return text

#Call function to clean data
df_corpus['content_clean'] = df_corpus['content_clean'].apply(lambda x: remove_regExp(str(x)))

In [23]:
# 3 Data pipline, remove white spaces, puctuation, stopwords and or currency and lemmatize words

# load spacy english large dictionary
from textblob import Word  

nlp = spacy.load('en_core_web_sm')

#Function to remove punc, stopwords, spaces, currency
def clean_pipeline(text):
    text = nlp(text)
    output = []
    for token in text:
        if not token.is_punct and not token.is_stop and not token.is_space and not token.is_currency :         
           output.append((token.lemma_.lower()))
    else: return ' '.join(output)

#Apply cleaning pipeline to content_clean
df_corpus['content_clean'] = df_corpus['content'].apply(lambda x: clean_pipeline(x))


In [25]:
#Determine the language of the tweet
#from langdetect import detect

#Function for language detection
#def language(text):
       #try:
           #lang = detect(text)
           #lang = [str(lang).split(':')[0][0:]]
           #return lang
       #except: None
    
#Assign languages to the DF
#df_corpus['Tweet_Language'] = df_corpus['content_clean'].apply(lambda x: language(x))

In [26]:
#Remove whitespaces
df_corpus['content_clean'] = df_corpus['content_clean'].apply(lambda x: x.strip())

#Add Tweet length
df_corpus['Tweet_LEN']= df_corpus['content_clean'].apply(lambda x: len(x))

#drop all blank rows
df_corpus = df_corpus[df_corpus['lang'] =='en']

In [27]:
#spellcheck
from textblob import Word

def spell_Check(text):
    output =[]
    words = text.split()

    for word in words:
        spell = Word(word).correct()
        output.append(spell)
    return ' '.join(output)

df_corpus['content_clean']= df_corpus['content_clean'].apply(lambda x: spell_Check(x)) 

In [28]:
#Wordnet sentiment analysis
from nltk.corpus import wordnet as wnet
from nltk.corpus import sentiwordnet as swnet


# TextBlob sentiment analysis

In [29]:
#Textblob sentiment analysis
from textblob import TextBlob

#make df text

def stirng_setting(text):
    transform = str(text)
    return transform

#Convert content_clean to string format
df_corpus['content_clean'] = df_corpus['content_clean'].apply(stirng_setting)

#sentiment classification: positive, neutral, negative
def sent_class(num):
    if num> 0:
        return "Positive"
    elif num < 0:
        return "Negative"
    else: return "Neutral"

#Subjectivity classification: Objective, subjective 
def sub_class(num):
    if num > 0:
        return "Subjective"
    else: return "Objective"


#sentiment calculation
def senti_calculation(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None


#Subjective or Objective
def subj_calculation(text):
    try:
        return TextBlob(text).subjectivity
    except:
        return None

#Add sentimet score to DF
df_corpus['TextBlob_Sentiment_Score'] = df_corpus['content_clean'].apply(senti_calculation)

#Add sentiment classiification to text
df_corpus['TextBlob_Score_Class'] = df_corpus.TextBlob_Sentiment_Score.apply(sent_class)

#add subjectivity score to dataframe
df_corpus["TextBlob_Subj_Score"] = df_corpus['content_clean'].apply(subj_calculation)

#Add sentiment classiification to text
df_corpus['TextBlob_Subj_Class'] = df_corpus.TextBlob_Subj_Score.apply(sub_class)

# VADER sentiment analysis testing

In [30]:
# VADER Sentiment analysis - Better analysis for social media as it has more related to puncuation and emojis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#initiate instance
vader = SentimentIntensityAnalyzer()

#VADER sentiment analysis
df_corpus['VADER_Score'] = df_corpus.content_clean.apply(lambda x: vader.polarity_scores(x))

#Vader sentiment score:
df_corpus['VADER_Score'] = df_corpus['VADER_Score'].apply(lambda d: d['compound'])

#Vader sentiment classification:
df_corpus['Vader_Score_Class'] = df_corpus.VADER_Score.apply(sent_class)


# Topic Modeling

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#vecotrize content_clean
corpus = df_corpus.content_clean.tolist()
vec = CountVectorizer()
matrix = vec.fit_transform(corpus)

#initialize LDA
lda = LatentDirichletAllocation(n_components = 2)
lda.fit(matrix)

lda.components_

array([[ 54.12143901,  86.10843367,   0.50993485, ...,   1.43608826,
          1.49582685,   1.49582685],
       [  0.87856099, 160.89156633,   2.49006515, ...,   0.56391174,
          0.50417315,   0.50417315]])

In [32]:
vec.vocabulary_

{'we': 18898,
 'indoctrinate': 9531,
 'think': 17507,
 'people': 13258,
 'need': 12189,
 'wheel': 19015,
 'moment': 11806,
 'call': 3410,
 'expert': 6956,
 'drive': 6123,
 'pump': 14129,
 'cliff': 4033,
 'assess': 2025,
 'assure': 2051,
 'indigenous': 9522,
 'bit': 2725,
 'feel': 7230,
 'powerless': 13734,
 'able': 1148,
 'remove': 14682,
 'up': 18383,
 'cap': 3474,
 'race': 14263,
 'reductions': 14526,
 'nonsense': 12364,
 'obviously': 12531,
 'work': 19239,
 'shit': 15826,
 'bag': 2286,
 'like': 10739,
 'is': 9946,
 'regardless': 14574,
 'politically': 13607,
 'current': 5076,
 'system': 17137,
 'divide': 5929,
 'amp': 1654,
 'conquer': 4480,
 'all': 1526,
 'target': 17210,
 '142': 183,
 '97': 1089,
 'met': 11544,
 '118': 128,
 '65': 901,
 '107': 78,
 '39': 695,
 'now': 12449,
 'if': 9283,
 'stock': 16658,
 'short': 15848,
 'week': 18942,
 '35': 660,
 'or': 12739,
 'anniversary': 1729,
 'black': 2751,
 'monday': 11813,
 '1987': 334,
 'possible': 13698,
 'red': 14508,
 'october': 1255

In [33]:
features = vec.get_feature_names_out()


In [34]:
for tid, topic in enumerate(lda.components_):
    print('topic: ', tid)
    print("word IDs: ", topic.argsort()[:-5:-1])
    print("words: ",[features[i] for i in topic.argsort()[:-5:-1]])
    print("prob: ", [topic[i] for i in topic.argsort()[:-5:-1]])

topic:  0
word IDs:  [ 8272 15248 11254  3343]
words:  ['golden', 'sacks', 'market', 'buy']
prob:  [2878.788304124221, 2159.240488002344, 856.5160846831951, 779.3569960619042]
topic:  1
word IDs:  [13258 11826 10739 12189]
words:  ['people', 'money', 'like', 'need']
prob:  [1806.950247127295, 1592.2861796881118, 1436.101897581055, 1234.7482090008891]


# Topic modelling with TDIDF Vectorizer

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

lda =LatentDirichletAllocation(n_components=10,topic_word_prior=.01)
vec = TfidfVectorizer()

matrix2 = vec.fit_transform(corpus)

lda.fit(matrix2)
features = vec.get_feature_names_out()

for tid, topic in enumerate(lda.components_):
    print('Topic: ',tid)
    print('Word IDs: ', topic.argsort()[:-5:-1])
    print('words: ', [features[i] for i in topic.argsort()[:-5:-1]])

Topic:  0
Word IDs:  [ 9785  3263  6487 16475]
words:  ['interesting', 'bullshit', 'empire', 'squad']
Topic:  1
Word IDs:  [ 5212 16252 10113 17965]
words:  ['david', 'solomon', 'job', 'trust']
Topic:  2
Word IDs:  [15826 10739  6263  8272]
words:  ['shit', 'like', 'eat', 'golden']
Topic:  3
Word IDs:  [11826 13170  1629 13258]
words:  ['money', 'pay', 'america', 'people']
Topic:  4
Word IDs:  [16881  8272  8542  3774]
words:  ['suck', 'golden', 'growth', 'chart']
Topic:  5
Word IDs:  [16658 11254  3774 17413]
words:  ['stock', 'market', 'chart', 'thank']
Topic:  6
Word IDs:  [ 8817  9967 16686 19266]
words:  ['heargaza', 'israel', 'stop', 'world']
Topic:  7
Word IDs:  [ 8337 19191  9946 11156]
words:  ['good', 'woman', 'is', 'man']
Topic:  8
Word IDs:  [ 2828 12286  6856  8872]
words:  ['block', 'nice', 'evil', 'hell']
Topic:  9
Word IDs:  [ 7829  8272  2728 15248]
words:  ['fuck', 'golden', 'bitcoin', 'sacks']


#clean up function
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from string import punctuation as punctuation
allowed_tags = ['VBP','VB','VBG','JJ','NN','RB']

def clean_data(text):
    output =[]
    words = nltk.tokenize.word_tokenize(text)
    words_tags = nltk.pos_tag(words)
    for w,t in words_tags:
        if w in punc:
            words.remove(w)
        elif w in ENGLISH_STOP_WORDS:
            words.remove(w)
        elif t not in allowed_tags:
            words.remove(w)
        output.append(w)
    return ' '.join(output)        

df_corpus['clean_Data'] = df_corpus['content'].apply(lambda x: clean_data(str(x)))


In [36]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_x=vec.fit_transform(df_corpus.content_clean.tolist())

features = vec.get_feature_names_out()
lda = LatentDirichletAllocation(n_components=5)
lda.fit(matrix_x)

#sentiment analysis
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

for tid, topics in enumerate(lda.components_):
    top_words = [features[i] for i in topics.argsort()[:-10:-1]]
    print(top_words)
    score = 0
    for w in top_words:
        senti_synset = swn.senti_synset(wn.synsets(w)[0].name())
        score+= senti_synset.pos_score()-senti_synset.neg_score()
    print('Sentiment Score: ',score)

['people', 'new', 'black', 'year', 'change', 'global', 'sarscov2', 'need', 'know']


IndexError: list index out of range