In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np
import os
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import string
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# I obtained a labelled Twitter sample dataset from Stanford and used it to train the model:
# http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
# A copy is stored in my Google Drive at:
# https://drive.google.com/drive/folders/1fyP_tBYC_ypFFasTKOFRgMhamD6QYsZc

In [8]:
# Fetch the NLTK resources this notebook relies on, then load the English
# stop-word list that is used to filter tweet tokens.
for nltk_resource in ('twitter_samples', 'stopwords'):
    nltk.download(nltk_resource)
stopwords_engligh = stopwords.words('english')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/ravibyakod/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ravibyakod/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Load the raw tweet export from the project's data directory.
raw_tweets_path = "../data/raw/tweet_dataset.csv"
data = pd.read_csv(raw_tweets_path)

In [10]:
# Preview the first rows of the raw tweet data.
data.head()

Unnamed: 0.1,Unnamed: 0,Tweet Id,Tweet Date,Follower Count,Account Verified,Favorite Count,Retweets,Tweet Text,username,description,location,following,followers,totaltweets,retweetcount,text,hashtags,entities
0,0,1515782070743748616,2022-04-17 19:59:42+00:00,362.0,False,0.0,0.0,"b'#GLCVY (GELECEK VARLIK), the company is the ...",123mavi123,Paylaştığım tüm yazılar bilgilendirme amaçlı o...,,137.0,362.0,669.0,0.0,"#GLCVY (GELECEK VARLIK), the company is the le...","['GLCVY', 'bist', 'tsla', 'aapl']","{'hashtags': [{'text': 'GLCVY', 'indices': [0,..."
1,1,1515781883140870144,2022-04-17 19:58:57+00:00,131.0,False,0.0,0.0,"b'Tesla FSD 10.11.2 - MILPITAS, CA - Amazing d...",fsd_tesla,Tesla FSD tester - Accelerate the transition t...,San Francisco,41.0,131.0,309.0,0.0,"Tesla FSD 10.11.2 - MILPITAS, CA - Amazing dri...","['FSD', 'Tesla', 'FSDBeta', 'TSLA', 'BayArea']","{'hashtags': [{'text': 'FSD', 'indices': [73, ..."
2,2,1515779704019595269,2022-04-17 19:50:17+00:00,75.0,False,1.0,0.0,b'@ryancohen @elonmusk @ChrisJBakke Power to t...,tino_phiathep,💎💎💎💎🚀🚀🚀🦍🦍🌚🌚🌚\nDiamond Balling my AMC and GME t...,"Menifee, CA",133.0,75.0,1207.0,0.0,@ryancohen @elonmusk @ChrisJBakke Power to the...,"['tsla', 'AMC', 'GME']","{'hashtags': [{'text': 'tsla', 'indices': [59,..."
3,3,1515778067150819328,2022-04-17 19:43:47+00:00,6.0,False,0.0,0.0,"b'Tesla, SpaceX, Boring, Pay Pal is a big deal...",iCare48150160,Nature Photograph Innovation Crypto Musk Feder...,,100.0,6.0,117.0,0.0,"Tesla, SpaceX, Boring, Pay Pal is a big deal b...",[],"{'hashtags': [], 'symbols': [], 'user_mentions..."
4,4,1515777929082556424,2022-04-17 19:43:14+00:00,7722.0,False,3.0,1.0,b'#HappyEaster to Mr. Tenderonie Man himself @...,2HOT4AMC,The cult classic. The diamond in the rough. TH...,,1083.0,7722.0,9251.0,1.0,#HappyEaster to Mr. Tenderonie Man himself @el...,['HappyEaster'],"{'hashtags': [{'text': 'HappyEaster', 'indices..."


In [11]:
# Keep the text of the first tweet (row label 0) as a sample.
tweet = data.loc[0, "text"]

In [12]:
# List the column names of the raw tweet data.
data.columns

Index(['Unnamed: 0', 'Tweet Id', 'Tweet Date', 'Follower Count',
       'Account Verified', 'Favorite Count', 'Retweets', 'Tweet Text',
       'username', 'description', 'location', 'following', 'followers',
       'totaltweets', 'retweetcount', 'text', 'hashtags', 'entities'],
      dtype='object')

In [13]:
# Keep only the columns needed for the text-processing steps below.
selected_columns = [
    'Tweet Id',
    'text',
    'Tweet Date',
    'followers',
    'Account Verified',
    'Favorite Count',
]
df = data[selected_columns]
df.head()

Unnamed: 0,Tweet Id,text,Tweet Date,followers,Account Verified,Favorite Count
0,1515782070743748616,"#GLCVY (GELECEK VARLIK), the company is the le...",2022-04-17 19:59:42+00:00,362.0,False,0.0
1,1515781883140870144,"Tesla FSD 10.11.2 - MILPITAS, CA - Amazing dri...",2022-04-17 19:58:57+00:00,131.0,False,0.0
2,1515779704019595269,@ryancohen @elonmusk @ChrisJBakke Power to the...,2022-04-17 19:50:17+00:00,75.0,False,1.0
3,1515778067150819328,"Tesla, SpaceX, Boring, Pay Pal is a big deal b...",2022-04-17 19:43:47+00:00,6.0,False,0.0
4,1515777929082556424,#HappyEaster to Mr. Tenderonie Man himself @el...,2022-04-17 19:43:14+00:00,7722.0,False,3.0


In [14]:
# Convert the tweet text column to the pandas string dtype, then summarise
# the frame (non-null counts and dtypes per column).
column_dtypes = {'text': 'string'}
df = df.astype(column_dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1992 entries, 0 to 1991
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Tweet Id          1958 non-null   object 
 1   text              1920 non-null   string 
 2   Tweet Date        1958 non-null   object 
 3   followers         1920 non-null   float64
 4   Account Verified  1920 non-null   object 
 5   Favorite Count    1920 non-null   float64
dtypes: float64(2), object(3), string(1)
memory usage: 93.5+ KB


## Preprocessing Steps

 - Drop rows with missing values
 - Lowercase the text
 - Remove punctuation, URLs, and user names
 - Remove stop words
 - Stemming / lemmatization
 - Tokenize sentences



In [15]:
# Count the missing values in each column, then drop every row that contains
# at least one missing value.
missing_per_column = df.isnull().sum()

# Rebind instead of using inplace=True so the cell reads as a plain
# input -> output transformation.
df = df.dropna()

# Must be the last expression of the cell: the original computed the counts
# mid-cell, so the notebook never displayed them.
missing_per_column

In [16]:
# Shared Porter stemmer instance; process_tweet uses it to stem each kept token.
stemmer = PorterStemmer()

#print(stopwords_engligh)

In [17]:
def process_tweet(tweet):
    """Clean a raw tweet and return its stemmed tokens.

    Steps, in order:
      1. strip a leading retweet marker ("RT" followed by whitespace),
      2. remove http/https hyperlinks,
      3. remove the '#' sign (the hashtag word itself is kept),
      4. remove stock market tickers such as ``$GE``,
      5. tokenize with NLTK's TweetTokenizer (lower-casing, stripping
         @handles, shortening long runs of repeated characters),
      6. drop English stop words and punctuation tokens,
      7. stem every remaining token with the Porter stemmer.

    Relies on the module-level ``stemmer`` and ``stopwords_engligh`` objects.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        The cleaned, stemmed tokens in their original order.
    """
    # Strip the old-style retweet marker at the very start of the text.
    cleaned = re.sub(r'^RT[\s]+', '', tweet)

    # Remove hyperlinks.
    cleaned = re.sub(r'https?://[^\s\n\r]+', '', cleaned)

    # Remove only the hash sign so the hashtag word survives as a token.
    cleaned = re.sub(r'#', '', cleaned)

    # Remove stock market tickers like $GE.
    # Bug fix: the original assigned this result to `tweet`, which was never
    # used afterwards, so tickers were not removed from the processed text.
    cleaned = re.sub(r'\$\w*', '', cleaned)

    # Tokenize the sentence and make it lower case.
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(cleaned)

    # Keep tokens that are neither stop words nor punctuation, then stem them.
    # NOTE: `word not in string.punctuation` is a substring test against the
    # punctuation string, kept exactly as in the original.
    return [
        stemmer.stem(word)
        for word in tokens
        if word not in stopwords_engligh and word not in string.punctuation
    ]

In [18]:
# Store the processed tokens of every tweet in a new clean_tweet column.
df['clean_tweet'] = df['text'].map(process_tweet)

In [19]:
# Preview the first rows, including the new clean_tweet column.
df.head()

Unnamed: 0,Tweet Id,text,Tweet Date,followers,Account Verified,Favorite Count,clean_tweet
0,1515782070743748616,"#GLCVY (GELECEK VARLIK), the company is the le...",2022-04-17 19:59:42+00:00,362.0,False,0.0,"[glcvi, gelecek, varlik, compani, leader, sect..."
1,1515781883140870144,"Tesla FSD 10.11.2 - MILPITAS, CA - Amazing dri...",2022-04-17 19:58:57+00:00,131.0,False,0.0,"[tesla, fsd, 10.11, 2, milpita, ca, amaz, driv..."
2,1515779704019595269,@ryancohen @elonmusk @ChrisJBakke Power to the...,2022-04-17 19:50:17+00:00,75.0,False,1.0,"[power, player, lfg, tsla, amc, gme]"
3,1515778067150819328,"Tesla, SpaceX, Boring, Pay Pal is a big deal b...",2022-04-17 19:43:47+00:00,6.0,False,0.0,"[tesla, spacex, bore, pay, pal, big, deal, tec..."
4,1515777929082556424,#HappyEaster to Mr. Tenderonie Man himself @el...,2022-04-17 19:43:14+00:00,7722.0,False,3.0,"[happyeast, mr, tenderoni, man, realli, go, se..."


In [20]:
# Write the processed tweet data to a CSV file
# (the DataFrame index is written as the first column).
processed_tweets_path = '../data/processed/processed_tweets.csv'
df.to_csv(processed_tweets_path)