In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np
import os
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import string
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Download the NLTK corpora, then load the English stop-word list.
# (The variable name is kept as-is because process_tweet refers to it.)
for corpus_name in ('twitter_samples', 'stopwords'):
    nltk.download(corpus_name)
stopwords_engligh = stopwords.words('english')

In [None]:
# Load the raw tweet dataset into a DataFrame.
raw_csv_path = "../data/raw/tweet_dataset.csv"
data = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first five rows of the raw data.
data.head(n=5)

In [None]:
# Take the text of the row labelled 0 as a sample tweet.
text_column = data["text"]
tweet = text_column[0]

In [None]:
# List the column labels of the raw data to choose which ones to keep.
data.columns

In [None]:
# Keep only the columns used in the rest of the notebook.
selected_columns = [
    'Tweet Id',
    'text',
    'Tweet Date',
    'followers',
    'Account Verified',
    'Favorite Count',
]
df = data[selected_columns]
df.head()

In [None]:
# Convert the 'text' column to the pandas string dtype, then print the frame summary.
text_dtype = {'text': 'string'}
df = df.astype(text_dtype)
df.info()

## Preprocessing Steps

 - Delete rows with missing values
 - Lowercase
 - Remove punctuation, URLs, and user names (handles)
 - Remove stop words
 - Stemming / Lemmatization
 - Tokenize sentences



In [None]:
# Count missing values per column *before* dropping, so the cell reports what
# is being removed. (Previously this result was computed mid-cell and discarded.)
missing_per_column = df.isnull().sum()

# Drop every row that has a missing value in any of the selected columns.
# Reassigning instead of inplace=True keeps the step explicit.
df = df.dropna()

# Last expression of the cell: display the per-column missing counts.
missing_per_column

In [None]:
# Porter stemmer instance used by process_tweet to reduce each token to its stem.
stemmer = PorterStemmer()

In [None]:
def process_tweet(tweet):
    """Clean a raw tweet and return its stemmed tokens.

    Steps, in order:
      1. strip a leading "RT" retweet marker,
      2. remove hyperlinks,
      3. drop the '#' sign (the hashtag word itself is kept),
      4. remove stock market tickers such as $GE,
      5. tokenize in lower case with TweetTokenizer (handles are stripped
         and repeated characters are shortened by the tokenizer options),
      6. discard stop words and punctuation, and stem the remaining tokens.

    Relies on the notebook-level ``stopwords_engligh`` list and ``stemmer``
    instance.

    Args:
        tweet: Raw tweet text.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    # remove the "RT" retweet marker at the start of the tweet
    tweet2 = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet2 = re.sub(r'https?://[^\s\n\r]+', '', tweet2)

    # remove hashtags by removing only the # sign from the word
    tweet2 = re.sub(r'#', '', tweet2)

    # remove stock market tickers like $GE
    # Fix: the result used to be assigned to `tweet`, which is never read
    # again, so tickers were left in the text that gets tokenized.
    tweet2 = re.sub(r'\$\w*', '', tweet2)

    # tokenize the cleaned text and make it lower case
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet2)

    # drop stop words and punctuation, then stem what is left
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_engligh and word not in string.punctuation
    ]

In [None]:
# Add a 'clean_tweet' column holding the processed token list for each tweet.
df['clean_tweet'] = df['text'].map(process_tweet)

In [None]:
# Preview the first five rows, including the new clean_tweet column.
df.head(n=5)

In [None]:
# Write the processed tweet data to a CSV file.
# NOTE: clean_tweet holds Python lists, so the CSV stores their text form.
processed_csv_path = '../data/processed/processed_tweets.csv'
df.to_csv(processed_csv_path)