In [1]:
import pandas as pd
import numpy as np
import re
# pip install demoji==1.1.0
import demoji

In [2]:
data = pd.read_csv("./data/processed.csv", usecols=["message", "sentiment"])
data.head(3)

Unnamed: 0,sentiment,message
0,Bullish,$AAPL $SPY $TQQQ $SOXL $AMZN \n🤑 V shape recov...
1,Bullish,$AAPL $SPY $AMZN $TQQQ $SOXL \nApple Is Seeing...
2,Bullish,$AMZN $SPY $AAPL $GOOGL $TQQQ \n🎉 Wow~ Amazon&...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4102 entries, 0 to 4101
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  4102 non-null   object
 1   message    4102 non-null   object
dtypes: object(2)
memory usage: 64.2+ KB


In [4]:
# open the STOPWORDS file in read mode
with open(r'./data/stopwords.txt', 'r') as fp:
    STOPWORDS = [i.strip() for i in fp.readlines()]

In [5]:
STOPWORDS[:5]

['be', 'this', 'hereupon', 'indeed', 'became']

In [7]:
sample = data["message"].loc[0]
sample

'$AAPL $SPY $TQQQ $SOXL $AMZN \n🤑 V shape recovery Monday. 🚀🚀\n\nAnalyst Price Target on AAPL\nBased on 27 Wall Street analysts offering 12 month price targets for Apple in the last 3 months. The average price target is $187.22 with a high forecast of $210.00 and a low forecast of $157.00. The average price target represents a 36.53% change from the last price of $137.13\n\nhttps://www.tipranks.com/stocks/aapl/forecast'

In [8]:
# Demojisation: convert emoji to corresponding description
def demojise(text):
    return demoji.replace_with_desc(text, sep = " ")

print(demojise(sample))

$AAPL $SPY $TQQQ $SOXL $AMZN 
 money-mouth face  V shape recovery Monday.  rocket  rocket 

Analyst Price Target on AAPL
Based on 27 Wall Street analysts offering 12 month price targets for Apple in the last 3 months. The average price target is $187.22 with a high forecast of $210.00 and a low forecast of $157.00. The average price target represents a 36.53% change from the last price of $137.13

https://www.tipranks.com/stocks/aapl/forecast


In [9]:
# remove urls
def remove_urls(text):
    return re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

print(remove_urls(sample))

$AAPL $SPY $TQQQ $SOXL $AMZN 
🤑 V shape recovery Monday. 🚀🚀

Analyst Price Target on AAPL
Based on 27 Wall Street analysts offering 12 month price targets for Apple in the last 3 months. The average price target is $187.22 with a high forecast of $210.00 and a low forecast of $157.00. The average price target represents a 36.53% change from the last price of $137.13




In [11]:
# remove companies tags
def remove_tags(text):
    all_tags_with_dollar_sign = re.findall(r"\$\w+", text)
    filtered_tags = []
    for i in all_tags_with_dollar_sign:
        try:
            converttoint = int(i[1:])
        except:
            filtered_tags.append(i)
    return " ".join([i for i in text.split() if i not in filtered_tags])

print(remove_tags(sample))

🤑 V shape recovery Monday. 🚀🚀 Analyst Price Target on AAPL Based on 27 Wall Street analysts offering 12 month price targets for Apple in the last 3 months. The average price target is $187.22 with a high forecast of $210.00 and a low forecast of $157.00. The average price target represents a 36.53% change from the last price of $137.13 https://www.tipranks.com/stocks/aapl/forecast


In [12]:
# remove stopwords
def remove_stopwords(text):
    return " ".join([i for i in text.split() if i.lower() not in STOPWORDS])

print(remove_stopwords(sample))

$AAPL $SPY $TQQQ $SOXL $AMZN 🤑 V shape recovery Monday. 🚀🚀 Analyst Price Target AAPL Based 27 Wall Street analysts offering 12 month price targets Apple 3 months. average price target $187.22 high forecast $210.00 low forecast $157.00. average price target represents 36.53% change price $137.13 https://www.tipranks.com/stocks/aapl/forecast


In [13]:
# remove punctuations
def remove_puncts(text):
    return re.sub(r'[^\w\s]', " ", text)

print(remove_puncts(sample))

 AAPL  SPY  TQQQ  SOXL  AMZN 
  V shape recovery Monday    

Analyst Price Target on AAPL
Based on 27 Wall Street analysts offering 12 month price targets for Apple in the last 3 months  The average price target is  187 22 with a high forecast of  210 00 and a low forecast of  157 00  The average price target represents a 36 53  change from the last price of  137 13

https   www tipranks com stocks aapl forecast


In [14]:
# remove numerical signs
def remove_nums(text):
    return " ".join([i for i in text.split() if not i.isdigit()])

print(remove_nums(remove_puncts(sample)))

AAPL SPY TQQQ SOXL AMZN V shape recovery Monday Analyst Price Target on AAPL Based on Wall Street analysts offering month price targets for Apple in the last months The average price target is with a high forecast of and a low forecast of The average price target represents a change from the last price of https www tipranks com stocks aapl forecast


In [16]:
def text_clean(text: str):
    text = demojise(text)
    text = remove_urls(text)
    text = remove_tags(text)
    text = remove_stopwords(text)
    text = remove_puncts(text)
    text = remove_nums(text)
    return text.lower()

In [17]:
text_clean(sample)

'money mouth face v shape recovery monday rocket rocket analyst price target aapl based wall street analysts offering month price targets apple months average price target high forecast low forecast average price target represents change price'

In [18]:
# clean all messages
data["cleaned_messsage"] = data["message"].apply(text_clean)

In [20]:
data.head(2)

Unnamed: 0,sentiment,message,cleaned_messsage
0,Bullish,$AAPL $SPY $TQQQ $SOXL $AMZN \n🤑 V shape recov...,money mouth face v shape recovery monday rocke...
1,Bullish,$AAPL $SPY $AMZN $TQQQ $SOXL \nApple Is Seeing...,apple seeing strong china demand iphones analy...


In [21]:
data.loc[2]["message"], data.loc[2]["cleaned_messsage"], data.loc[2]["sentiment"]

('$AMZN $SPY $AAPL $GOOGL $TQQQ \n🎉 Wow~ Amazon&#39; new PT: $180\nMarket turn around Monday 🥳\n\nhttps://www.google.com/amp/s/pulse2.com/amazon-amzn-stock-price-180-target-and-buy-rating/amp/',
 'party popper wow amazon new pt market turn monday partying face',
 'Bullish')

In [22]:
def count_takens(text):
    return len(text.split())

In [23]:
data["word_count"] = data["cleaned_messsage"].apply(count_takens)

In [24]:
data.head(2)

Unnamed: 0,sentiment,message,cleaned_messsage,word_count
0,Bullish,$AAPL $SPY $TQQQ $SOXL $AMZN \n🤑 V shape recov...,money mouth face v shape recovery monday rocke...,36
1,Bullish,$AAPL $SPY $AMZN $TQQQ $SOXL \nApple Is Seeing...,apple seeing strong china demand iphones analy...,9


In [25]:
data["word_count"].describe()

count    4102.000000
mean       13.448805
std        15.155093
min         0.000000
25%         4.000000
50%         9.000000
75%        17.000000
max       155.000000
Name: word_count, dtype: float64

In [26]:
cleaned_data = data[data["word_count"]>3].copy(deep=True)
cleaned_data.reset_index(0, drop=True, inplace=True)

In [27]:
cleaned_data.describe(include="all")

Unnamed: 0,sentiment,message,cleaned_messsage,word_count
count,3240,3240,3240,3240.0
unique,2,2914,2900,
top,Bullish,$RS Twits Stats Today&#39;s Change 9% + 🚀 http...,tweet stats today s change rocket https t8sk c...,
freq,2386,7,7,
mean,,,,16.563272
std,,,,15.630705
min,,,,4.0
25%,,,,7.0
50%,,,,11.0
75%,,,,20.0


In [28]:
# duplicates
print(cleaned_data.duplicated().sum())

# remove duplicates
cleaned_data.drop_duplicates(inplace=True, ignore_index=True)

326


In [29]:
cleaned_data.groupby("sentiment").agg({"sentiment": "count"})

Unnamed: 0_level_0,sentiment
sentiment,Unnamed: 1_level_1
Bearish,771
Bullish,2143


In [31]:
import nltk
# nltk.download('wordnet')

# import these modules
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(i) for i in text.split()])

In [32]:
cleaned_data["lem_clean_messsage"] = cleaned_data["cleaned_messsage"].apply(lemmatize_text)

In [33]:
cleaned_data.head()

Unnamed: 0,sentiment,message,cleaned_messsage,word_count,lem_clean_messsage
0,Bullish,$AAPL $SPY $TQQQ $SOXL $AMZN \n🤑 V shape recov...,money mouth face v shape recovery monday rocke...,36,money mouth face v shape recovery monday rocke...
1,Bullish,$AAPL $SPY $AMZN $TQQQ $SOXL \nApple Is Seeing...,apple seeing strong china demand iphones analy...,9,apple seeing strong china demand iphones analy...
2,Bullish,$AMZN $SPY $AAPL $GOOGL $TQQQ \n🎉 Wow~ Amazon&...,party popper wow amazon new pt market turn mon...,11,party popper wow amazon new pt market turn mon...
3,Bullish,$SPY $GOOGL $AMZN $AAPL $TQQQ\nWow~~ Google PT...,wow google pt goldman sachs confetti ball part...,15,wow google pt goldman sachs confetti ball part...
4,Bearish,$BTC.X $SPY $DWAC $AAPL $TSLA \n\nNever undere...,btc x underestimate clown s clown face ability...,10,btc x underestimate clown s clown face ability...


In [34]:
cleaned_data.loc[6]["cleaned_messsage"], cleaned_data.loc[6]["lem_clean_messsage"]

('bulls monday tuesday fade fomc opex', 'bull monday tuesday fade fomc opex')

In [102]:
# cleaned_data.rename(columns={"lem_clean_messsage": "tweet"}, inplace=True)

# cleaned_data.to_csv("./data/cleaned_data.csv", columns=["tweet", "sentiment"], index=False)