# Experiment 4: Preprocessing Tweets

Importing Libraries

In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# To pre-process text
import re

import pandas as pd

Importing Data

In [2]:
# Read the raw tweet export into a DataFrame; the bare expression on the last
# line renders it as this cell's output.
df = pd.read_csv(filepath_or_buffer='elon_musk_tweets.csv')
df

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet
0,1544379368478212100,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240855,115,13503,True,2022-07-05 17:55:09+00:00,@BillyM2k I find the gold toe sock – inevitabl...,,Twitter for iPhone,335,6542,False
1,1544377493263720450,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240806,115,13503,True,2022-07-05 17:47:42+00:00,"Sock Con, the conference for socks",,Twitter for iPhone,1451,30753,False
2,1544377130590552064,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240806,115,13503,True,2022-07-05 17:46:15+00:00,Always something new for the magazine cover an...,,Twitter for iPhone,1284,28610,False
3,1544375575724400645,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240806,115,13503,True,2022-07-05 17:40:05+00:00,@ExplainThisBob This guy gets it,,Twitter for iPhone,131,3640,False
4,1544375148605853699,Elon Musk,,"Mars & Cars, Chips & Dips",2009-06-02 20:12:29+00:00,101240806,115,13503,True,2022-07-05 17:38:23+00:00,Sock tech is so advanced that you can get pret...,,Twitter for iPhone,1191,23790,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3536,1628998457645047808,Elon Musk,,,2009-06-02 20:12:29+00:00,130663340,182,19120,True,2023-02-24 06:01:12+00:00,Check out my spicy 🌶️ 🔥 OnlyGANs!! https://t.c...,,Twitter for iPhone,19900,243266,False
3537,1628964700997267457,Elon Musk,,,2009-06-02 20:12:29+00:00,130663340,182,19120,True,2023-02-24 03:47:04+00:00,@derek_j_morris @pmarca Major problem,,Twitter for iPhone,82,1219,False
3538,1628961847364681731,Elon Musk,,,2009-06-02 20:12:29+00:00,130663340,182,19120,True,2023-02-24 03:35:43+00:00,@EvaFoxU Nice,,Twitter for iPhone,396,15353,False
3539,1628955011416743936,Elon Musk,,,2009-06-02 20:12:29+00:00,130663340,182,19120,True,2023-02-24 03:08:33+00:00,@DavidSacks A Russia-China alliance is inevita...,,Twitter for iPhone,4472,41972,False


Extracting the Tweet Text to Preprocess

In [3]:
# Keep only the 'text' column (all rows); everything downstream works on this Series.
tweets = df.loc[:, 'text']
tweets

0       @BillyM2k I find the gold toe sock – inevitabl...
1                      Sock Con, the conference for socks
2       Always something new for the magazine cover an...
3                        @ExplainThisBob This guy gets it
4       Sock tech is so advanced that you can get pret...
                              ...                        
3536    Check out my spicy 🌶️ 🔥 OnlyGANs!! https://t.c...
3537                @derek_j_morris @pmarca Major problem
3538                                        @EvaFoxU Nice
3539    @DavidSacks A Russia-China alliance is inevita...
3540                                 @PeterDiamandis Yeah
Name: text, Length: 3541, dtype: object

In [4]:
# Sanity check: show the container type returned by selecting the single 'text' column.
type(tweets)

pandas.core.series.Series

Preprocessing with PorterStemmer and WordNetLemmatizer

In [5]:
# NOTE: the NLTK calls below presumably need the 'stopwords', 'punkt' and
# 'wordnet' resources to be installed already (nltk.download) -- confirm on a
# fresh environment.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# A set keeps the per-token stop-word membership test cheap.
stopWords = set(stopwords.words('english'))


def preprocessTweet(tweet):
    """Turn one raw tweet into a list of normalised tokens.

    Steps, in order: replace every character that is not an ASCII letter or
    digit with a space, lowercase, tokenize with NLTK, drop English stop
    words, Porter-stem, then WordNet-lemmatize each remaining token.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example the NaN pandas uses
        for a missing cell) is treated as an empty tweet.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    if not isinstance(tweet, str):
        # Return an empty row instead of raising, so the output stays aligned
        # one-to-one with the input tweets.
        return []

    # The character class is matched case-sensitively on purpose: combined
    # with re.IGNORECASE, [A-Za-z] also matches a few non-ASCII letters
    # (e.g. 'ſ', 'K'), which would then slip through this clean-up step.
    # Once only ASCII letters, digits and spaces remain, str.lower() gives the
    # same result as lowercasing the A-Z runs one by one.
    #
    # NOTE(review): URLs and HTML entities are not stripped beforehand, so
    # fragments such as 'http', 'co' and 'amp' end up as tokens.
    tweet = re.sub("[^A-Za-z0-9]", " ", tweet).lower()

    wordsInTweet = nltk.word_tokenize(tweet)

    # Stop words are filtered once, before stemming, because the stop-word
    # list contains unstemmed words.
    return [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in wordsInTweet
        if word not in stopWords
    ]


# One token list per tweet, in the same order as `tweets`.
processedTweets = [preprocessTweet(tweet) for tweet in tweets]

Putting the Processed Tweets into a DataFrame

In [6]:
# One row per tweet, one column per token position; tweets with fewer tokens
# are padded with missing values on the right.
processedTweetsDataFrame = pd.DataFrame(processedTweets)

# Persist the token table next to the notebook (the index is written too).
processedTweetsDataFrame.to_csv(path_or_buf='processed-tweets.csv')

processedTweetsDataFrame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,billym2k,find,gold,toe,sock,inevit,kilter,amp,wash,littl,...,esthet,amp,arguabl,bit,corpo,,,,,
1,sock,con,confer,sock,,,,,,,...,,,,,,,,,,
2,alway,someth,new,magazin,cover,articl,practic,write,,,...,,,,,,,,,,
3,explainthisbob,guy,get,,,,,,,,...,,,,,,,,,,
4,sock,tech,advanc,get,pretti,much,anyth,sock,form,day,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3536,check,spici,onlygan,http,co,b9lnjsenvo,,,,,...,,,,,,,,,,
3537,derek,j,morri,pmarca,major,problem,,,,,...,,,,,,,,,,
3538,evafoxu,nice,,,,,,,,,...,,,,,,,,,,
3539,davidsack,russia,china,allianc,inevit,grow,much,stronger,time,,...,,,,,,,,,,
