# NLP Preprocessing

In [67]:
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_tweet(tweet, lemmatizer):
    """Tokenize and normalize a tweet while keeping tickers, URLs and mentions intact.

    Stock tickers (``$GS``), URLs and @-mentions are swapped for placeholders
    before the text is lower-cased and tokenized, and are restored verbatim
    (original casing included) in the returned token list. Every other token
    is lower-cased, dropped if it is an English stopword or contains any
    punctuation character, and lemmatized otherwise.

    Args:
        tweet (str): The tweet to be preprocessed.
        lemmatizer (WordNetLemmatizer): The lemmatizer to be used; only its
            ``lemmatize(word)`` method is called.

    Returns:
        list: The processed tokens, in the order they appear in the tweet.
    """
    # Patterns for the entities that must survive preprocessing unchanged.
    # Note that the ticker pattern also matches dollar amounts such as "$40K".
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    ticker_pattern = r'\$\w+'
    mention_pattern = r'@\w+'
    entity_pattern = re.compile(
        f'(?P<url>{url_pattern})|(?P<ticker>{ticker_pattern})|(?P<mention>{mention_pattern})'
    )

    # Mask every entity in a single left-to-right pass. Compared with running
    # str.replace once per entity, this cannot corrupt an entity that has
    # another one as a prefix ("$GOOG" vs "$GOOGL"), and a "$" or "@" inside a
    # URL stays part of that URL instead of being masked separately.
    placeholders = {}

    def _mask(match):
        # Placeholders are all lower-case so they survive the .lower() below.
        placeholder = f"__{match.lastgroup}{len(placeholders)}__"
        placeholders[placeholder] = match.group(0)
        return placeholder

    masked_tweet = entity_pattern.sub(_mask, tweet)

    # Build the lookup sets once per call rather than once per token.
    stop_words = set(stopwords.words('english'))
    extended_punctuation = set(string.punctuation + '“”‘’—')

    final_tokens = []
    for token in word_tokenize(masked_tweet.lower()):
        if token in placeholders:
            # Restore the original ticker / URL / mention text untouched.
            final_tokens.append(placeholders[token])
        elif token in stop_words:
            continue
        elif not extended_punctuation.isdisjoint(token):
            # A token containing any punctuation character is dropped whole,
            # not stripped of that character.
            continue
        else:
            final_tokens.append(lemmatizer.lemmatize(token))

    return final_tokens

In [69]:
import numpy as np
import pandas as pd
import nltk

# Read the tweet export, skipping malformed rows, and keep only the first 1000 rows
df = pd.read_csv('./stockerbot-export.csv', on_bad_lines='skip').head(1000)

# Fetch the NLTK corpora needed by the lemmatizer and the stopword filter.
# NOTE(review): preprocess_tweet also calls word_tokenize, which presumably needs
# tokenizer data that is not downloaded here; confirm it is already installed.
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()

# Replace each tweet in the 'text' column with its list of preprocessed tokens
df['text'] = df['text'].apply(preprocess_tweet, args=(lemmatizer,))

# Show cell contents without column-width truncation for the first 20 rows
pd.set_option('display.max_colwidth', None)
display(df.head(20))

[nltk_data] Downloading package wordnet to /Users/seby/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/seby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified
0,1019696670777503700,"[video, office, minding, business, –david, solomon, tell, $GS, intern, learned, wa…, https://t.co/QClAITywXV]",Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777503745,True
1,1019709091038548000,"[price, lumber, $LB_F, 22, since, hitting, ytd, high, macy, $M, turnaround, still, https://t.co/XnKsV4De39]",Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038547968,True
2,1019711413798035500,"[say, american, dream, dead, https://t.co/CRgx19x7sA]",Wed Jul 18 22:32:01 +0000 2018,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True
3,1019716662587740200,"[barry, silbert, extremely, optimistic, bitcoin, predicts, 99, new, crypto, entrant, going, zero…, https://t.co/mGMVo2cZgY]",Wed Jul 18 22:52:52 +0000 2018,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587740160,True
4,1019718460287389700,"[satellite, avoid, attack, space, junk, circling, earth, https://t.co/aHzIV3Lqp5, paid, @Oracle, https://t.co/kacpqZWiDJ]",Wed Jul 18 23:00:01 +0000 2018,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True
5,1019719465095790600,"[david, butler, favorite, fang, stock, realmoneysod, alphabet, facebook, https://t.co/MczAPSFjOi]",Wed Jul 18 23:04:00 +0000 2018,jimcramer,FB-GOOGL-GOOG,Facebook*Alphabet*Alphabet,http://bit.ly/2NrYxje,True
6,1019720209786114000,"[miss, convo, one, favorite, thinker, @SamHarrisOrg, https://t.co/uuPVxIobCh]",Wed Jul 18 23:06:58 +0000 2018,ianbremmer,HRS,Harris,https://twitter.com/samharrisorg/status/1019719376348434433,True
7,1019720659738480600,"[intelligence, document, nelson, mandela, made, public, https://t.co/XTnEfo1rO6, https://t.co/V8DXkWDQ6R]",Wed Jul 18 23:08:45 +0000 2018,Reuters,INTC-USB,Intel*U.S.,https://reut.rs/2O0ypNf,True
8,1019720723441635300,"[senate, want, emergency, alert, go, netflix, spotify, etc, https://t.co/23yy3whBlc, @grg]",Wed Jul 18 23:09:00 +0000 2018,TechCrunch,NFLX,Netflix,https://tcrn.ch/2L8DsgT,True
9,1019721145396887600,"[hedge, fund, manager, marc, larsy, say, bitcoin, $40K, possible, https://t.co/54uPe0OWqT]",Wed Jul 18 23:10:41 +0000 2018,MarketWatch,BTC,Bitcoin,https://on.mktw.net/2Ntr7k9,True
