<a href="https://colab.research.google.com/github/rukshan99/twitter-sentiment-analysis/blob/main/twitter_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Connecting to Google Drive to get the dataset

In [1]:
# Import PyDrive and associated libraries
# This only needs to be done once per notebook
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
# This only needs to be done once per notebook
# Sign the Colab runtime in to a Google account
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials
# NOTE(review): presumably these are the ones set up by the sign-in above - confirm
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the download cell uses to fetch the dataset
drive = GoogleDrive(gauth)

In [2]:
# Download a file based on its file ID.

# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '1x4XtAigTb-fJ2TQJOCDXzAkVOsToBrCT' # Check your own ID in GDrive
# Create a PyDrive file handle for that ID using the authenticated `drive` client
downloaded = drive.CreateFile({'id': file_id})

# Save the file's content locally as 'tweet_data.csv'
# (a relative path, so it lands in the runtime's current working directory)
downloaded.GetContentFile('tweet_data.csv')  

## Normalization

In [3]:
# Regex package
import re

### Handle tweet features

In [4]:
# Example tweet
tweet = "RT @rukshaann I love this! 👍 https://rukshanjayasekara.me #Portfolio #Fun"

#### Retweet tag

In [5]:
# Handling the retweet(RT) tag
# It is not required for analysing sentiment
def replace_retweet(tweet, default_replace=""):
  """Replace the retweet marker ("RT" followed by whitespace) in a tweet.

  Args:
    tweet: Raw tweet text.
    default_replace: Text substituted for each "RT " marker (default: removed).

  Returns:
    The tweet with every standalone "RT" marker and its trailing whitespace
    replaced.
  """
  # Raw string avoids the invalid "\s" string escape; the leading \b keeps
  # words that merely end in "RT" (e.g. "SMART move") from being truncated.
  return re.sub(r'\bRT\s+', default_replace, tweet)

In [6]:
print("Processed tweet: {}".format(replace_retweet(tweet)))

Processed tweet: @rukshaann I love this! 👍 https://rukshanjayasekara.me #Portfolio #Fun


#### User tags

In [7]:
# Handling user tags(@)
# It is not required for analysing sentiment
def replace_user(tweet, default_replace="user"):
  """Replace user mentions (@name) in a tweet with a placeholder.

  Args:
    tweet: Raw tweet text.
    default_replace: Text substituted for each mention (default: "user").

  Returns:
    The tweet with every mention replaced.
  """
  # Raw string avoids the invalid "\B"/"\w" string escapes. \B requires that
  # the "@" does not directly follow a word character, so e-mail-like text
  # such as "me@example.com" is left untouched.
  return re.sub(r'\B@\w+', default_replace, tweet)

In [8]:
print("Processed tweet: {}".format(replace_user(tweet)))

Processed tweet: RT user I love this! 👍 https://rukshanjayasekara.me #Portfolio #Fun


#### Emojis

Install and import `emoji` package

In [12]:
pip install emoji --upgrade

Collecting emoji
  Downloading emoji-1.6.1.tar.gz (170 kB)
[?25l[K     |██                              | 10 kB 18.7 MB/s eta 0:00:01[K     |███▉                            | 20 kB 23.0 MB/s eta 0:00:01[K     |█████▉                          | 30 kB 15.0 MB/s eta 0:00:01[K     |███████▊                        | 40 kB 10.2 MB/s eta 0:00:01[K     |█████████▋                      | 51 kB 5.2 MB/s eta 0:00:01[K     |███████████▋                    | 61 kB 5.4 MB/s eta 0:00:01[K     |█████████████▌                  | 71 kB 6.0 MB/s eta 0:00:01[K     |███████████████▍                | 81 kB 6.7 MB/s eta 0:00:01[K     |█████████████████▍              | 92 kB 6.2 MB/s eta 0:00:01[K     |███████████████████▎            | 102 kB 5.3 MB/s eta 0:00:01[K     |█████████████████████▏          | 112 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████▏        | 122 kB 5.3 MB/s eta 0:00:01[K     |█████████████████████████       | 133 kB 5.3 MB/s eta 0:00:01[K     |███████

In [13]:
import emoji

In [14]:
# Replace emojis with meaningful text
def demojize(tweet):
  """Return `tweet` with emojis converted to text by `emoji.demojize`.

  For example, the thumbs-up emoji becomes ":thumbs_up:".
  """
  return emoji.demojize(tweet)

In [15]:
print("Processed tweet: {}".format(demojize(tweet)))

Processed tweet: RT @rukshaann I love this! :thumbs_up: https://rukshanjayasekara.me #Portfolio #Fun


#### URLs

In [16]:

def replace_url(tweet, default_replace=""):
  """Replace http:// and https:// URLs in a tweet.

  Args:
    tweet: Raw tweet text.
    default_replace: Text substituted for each URL (default: removed).

  Returns:
    The tweet with every URL (scheme through the next whitespace) replaced.
  """
  # Raw string avoids the invalid "\/" and "\S" string escapes; "/" needs no
  # escaping in a Python regex. The scheme stays a capture group so a
  # replacement that references group 1 keeps working.
  return re.sub(r'(http|https)://\S+', default_replace, tweet)

In [17]:
print("Processed tweet: {}".format(replace_url(tweet)))

Processed tweet: RT @rukshaann I love this! 👍  #Portfolio #Fun


#### Hashtags

In [18]:
# Remove hashtag symbol(#)
def replace_hashtag(tweet, default_replace=""):
  """Replace each run of '#' characters in `tweet` with `default_replace`.

  The word after the symbol is kept, so "#Fun" becomes "Fun" by default.
  """
  hash_run = re.compile('#+')
  return hash_run.sub(default_replace, tweet)

In [19]:
print("Processed tweet: {}".format(replace_hashtag(tweet)))

Processed tweet: RT @rukshaann I love this! 👍 https://rukshanjayasekara.me Portfolio Fun


### Handle word features

In [20]:
# Example tweet
tweet = "LOOOOOOOOK at this ... I'd like it so much!"

#### Capitals

In [21]:
def to_lowercase(tweet):
  """Return a lower-cased copy of `tweet`."""
  return tweet.lower()

In [22]:
print("Processed tweet: {}".format(to_lowercase(tweet)))

Processed tweet: looooooook at this ... i'd like it so much!


#### Word repetitions

In [23]:
def word_repetition(tweet):
  """Shorten every run of a repeated character to exactly two occurrences.

  For example "LOOOOOOOOK" becomes "LOOK" and "..." becomes "..".
  """
  repeated_char = re.compile(r'(.)\1+')
  return repeated_char.sub(r'\1\1', tweet)

In [24]:
print("Processed tweet: {}".format(word_repetition(tweet)))

Processed tweet: LOOK at this .. I'd like it so much!


#### Punctuation repetition

In [25]:
def punct_repetition(tweet, default_replace=""):
  """Collapse runs of '?', '.' and '!' down to their final character.

  Every character of the run except the last is matched (the lookahead
  keeps the last one) and replaced by `default_replace`.
  """
  leading_run = re.compile(r'[\?\.\!]+(?=[\?\.\!])')
  return leading_run.sub(default_replace, tweet)

In [26]:
print("Processed tweet: {}".format(punct_repetition(tweet)))

Processed tweet: LOOOOOOOOK at this . I'd like it so much!


#### Word contraction

Install and import the `contractions` package

In [27]:
pip install contractions

Collecting contractions
  Downloading contractions-0.0.58-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 5.7 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 46.3 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=85450 sha256=f3eeada9cddea4e29aeab25826c3bf37e2323536296a98f0a8dceffe3d2208eb
  Stored in directory: /root/.cache/pip/wheels/25/19/a6/8f363d9939162782bb8439d886469756271abc01f76fbd790f
Successfully built pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully install

In [28]:
import contractions

In [29]:
# Replace contractions with their extended forms
def fix_contractions(tweet):
  """Return `tweet` with contractions expanded by `contractions.fix`.

  For example "I'd like it" becomes "I would like it".
  """
  return contractions.fix(tweet)

In [30]:
print("Processed tweet: {}".format(fix_contractions(tweet)))

Processed tweet: LOOOOOOOOK at this ... I would like it so much!


### Tokenization

Install and import the `NLTK` package

In [31]:
pip install nltk



In [32]:
import nltk

In [33]:
# Import the word_tokenize function from NLTK
from nltk.tokenize import word_tokenize
# Download the Punkt tokenizer model from NLTK
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [34]:
# Example tweet
tweet = "These are 5 different words!"

In [35]:
# Returns a tweet as a list of tokens
def tokenize(tweet):
  """Split `tweet` into tokens with NLTK's `word_tokenize`.

  The demo cell that follows shows the result is a list.
  """
  return word_tokenize(tweet)

In [36]:
print(type(tokenize(tweet)))
print("Tweet tokens: {}".format(tokenize(tweet)))

<class 'list'>
Tweet tokens: ['These', 'are', '5', 'different', 'words', '!']


#### Stopwords

In [37]:
# Import the string package
import string
# Import the stopwords module from NLTK
from nltk.corpus import stopwords
# Download stopwords data from NLTK
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [38]:
# Build the set of English stopwords provided by NLTK
stop_words = set(stopwords.words('english'))
# Take 'not' out of the stopword set so that it is never filtered out:
# it is useful for sentiment analysis
stop_words.discard('not')

In [39]:
print(stop_words)

{'a', 'her', 't', 'is', 'under', 'me', 'his', 'too', 'all', 'our', "you've", 'why', 'than', 'over', 'can', 'or', 'm', 'here', 'some', 'such', 'hers', 'the', 'more', 'mightn', 'but', 'wasn', "should've", 'on', 'y', 'by', 'wouldn', 'having', 'any', 'didn', "you'll", 'my', 'ourselves', 'other', 'who', 'which', 'off', 'be', 'myself', 'had', 'them', 'yourself', 'am', 'needn', 'and', 'for', 'to', 'there', 'through', 'shouldn', 'where', 'isn', 'just', "wasn't", "mustn't", 'down', 'are', 'whom', 'been', 'both', 'will', 'mustn', 'how', 'these', 'll', 'once', 'out', 'herself', 'nor', 's', 'does', 'have', 've', "isn't", 'this', "it's", "doesn't", 'itself', 'do', 'each', 'doesn', 'an', 'ours', "needn't", 'after', "won't", 'only', 'its', 'as', 'being', 'if', 'no', 'few', 'in', 'you', 'your', "mightn't", 'during', 'before', 'below', 'was', "she's", "hadn't", "that'll", "wouldn't", "you'd", 'about', 'ain', 'what', 'yours', 'from', "shouldn't", 'did', 'between', 'o', "don't", 'above', 'themselves', 'h

Update the `tokenize()` function so that it can optionally filter out punctuation, non-alphabetic tokens (such as numbers) and stopwords.

In [40]:
def tokenize(tweet,
             keep_punct = False,
             keep_alnum = False,
             keep_stop = False):
  """Tokenize a tweet and optionally filter the resulting tokens.

  Args:
    tweet: Text to tokenize.
    keep_punct: If False, drop tokens made up only of punctuation characters.
    keep_alnum: If False, drop every token that is not purely alphabetic
      (numbers and mixed tokens such as "5" or "b2b").
    keep_stop: If False, drop English stopwords, except 'not', which is kept.

  Returns:
    The list of remaining tokens, in their original order.

  Note:
    Stopword matching is case-sensitive and compares against NLTK's
    stopword list, so lowercase the text first if capitalised stopwords
    such as "These" should be removed too.
  """
  tokens = word_tokenize(tweet)

  if not keep_punct:
    # Check character by character: `token in string.punctuation` is a
    # substring test, which lets multi-character tokens like "..." or "!!"
    # slip through.
    punct_chars = set(string.punctuation)
    tokens = [token for token in tokens
                  if not all(char in punct_chars for char in token)]

  if not keep_alnum:
    tokens = [token for token in tokens if token.isalpha()]

  if not keep_stop:
    stop_words = set(stopwords.words('english'))
    # Keep 'not' so it is never filtered out as a stopword
    stop_words.discard('not')
    tokens = [token for token in tokens if token not in stop_words]
  return tokens

In [42]:
print("Tweet tokens: {}".format(tokenize(tweet, 
                                                keep_punct=True, 
                                                keep_alnum=True, 
                                                keep_stop=True)))
print("Tweet tokens: {}".format(tokenize(tweet, keep_stop=True)))
print("Tweet tokens: {}".format(tokenize(tweet)))

Tweet tokens: ['These', 'are', '5', 'different', 'words', '!']
Tweet tokens: ['These', 'are', 'different', 'words']
Tweet tokens: ['These', 'different', 'words']


### Stemming

In [43]:
# Import different libraries and modules used for stemming
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

In [44]:
# Example token list
tokens = ["manager", "management", "managing"]

In [45]:
# Defining different stemmers from NLTK package
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snoball_stemmer = SnowballStemmer('english')

In [46]:
# Returns a stemmed token list
# The stemmer to use must be passed in as a parameter
def stem_tokens(tokens, stemmer):
  """Stem every token with the given stemmer.

  Args:
    tokens: Iterable of token strings.
    stemmer: Any object exposing a `stem(token)` method.

  Returns:
    A new list holding the stem of each token, in input order.
  """
  return [stemmer.stem(word) for word in tokens]

In [47]:
# PorterStemmer
print("Porter stems: {}".format(stem_tokens(tokens, porter_stemmer)))
# LancasterStemmer
print("Lancaster stems: {}".format(stem_tokens(tokens, lancaster_stemmer)))
# SnowballStemmer
print("Snowball stems: {}".format(stem_tokens(tokens, snoball_stemmer)))

Porter stems: ['manag', 'manag', 'manag']
Lancaster stems: ['man', 'man', 'man']
Snowball stems: ['manag', 'manag', 'manag']


Check over-stemming and under-stemming

In [48]:
tokens = ["international", "companies", "had", "interns"]

print("Porter stems: {}".format(stem_tokens(tokens, porter_stemmer)))
print("Lancaster stems: {}".format(stem_tokens(tokens, lancaster_stemmer)))
print("Snowball stems: {}".format(stem_tokens(tokens, snoball_stemmer)))

Porter stems: ['intern', 'compani', 'had', 'intern']
Lancaster stems: ['intern', 'company', 'had', 'intern']
Snowball stems: ['intern', 'compani', 'had', 'intern']


### Lemmatization

In [49]:
# Import different libraries and modules used for lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [50]:
# Example token list
tokens = ["international", "companies", "had", "interns"]

In [51]:
# Part of Speech (POS) tagging
word_type = {"international": wordnet.ADJ, 
             "companies": wordnet.NOUN, 
             "had": wordnet.VERB, 
             "interns": wordnet.NOUN
             }

In [52]:
# Create the lemmatizer by using the WordNet module
lemmatizer = WordNetLemmatizer()

In [53]:
# Takes the list of tokens as input and returns a list of lemmatized tokens
def lemmatize_tokens(tokens, word_type, lemmatizer):
  """Lemmatize tokens using a per-token part-of-speech map.

  Args:
    tokens: Iterable of token strings.
    word_type: Dict mapping a token to its WordNet POS tag.
    lemmatizer: Object exposing `lemmatize(token, pos)`.

  Returns:
    A new list holding the lemma of each token, in input order.
  """
  # Tokens missing from `word_type` used to raise KeyError. Fall back to the
  # noun tag "n", which is also the default POS of NLTK's WordNetLemmatizer.
  return [lemmatizer.lemmatize(token, word_type.get(token, "n"))
          for token in tokens]

In [54]:
print("Tweet lemma: {}".format(
    lemmatize_tokens(tokens, word_type, lemmatizer)))

Tweet lemma: ['international', 'company', 'have', 'intern']


### Final normalizing function

In [58]:
complex_tweet = r"""RT @rukshaann : hey looooook, 
THis is a big and complex TWeet!!! 👍 ... 
I'd be glad if you couldn't normalize it! 
Check https://rukshanjayasekara.me and LET ME KNOW!!! #NLP #Fun"""

In [56]:
def process_tweet(tweet, verbose=False):
  """Normalize a raw tweet and return its list of stemmed tokens.

  Steps run in a fixed order: Twitter-specific clean-up, word-level
  normalization, then tokenization and Snowball stemming.

  Args:
    tweet: Raw tweet text.
    verbose: If True, print the text at the start and after each of the two
      clean-up stages.

  Returns:
    A list of stemmed tokens.
  """
  def _trace(template, text):
    # Emit a progress line only when verbose output was requested
    if verbose: print(template.format(text))

  _trace("Initial tweet: {}", tweet)

  ## Twitter Features: retweet tag, user tags (removed outright), URLs, '#'
  twitter_steps = (
      replace_retweet,
      lambda text: replace_user(text, ""),
      replace_url,
      replace_hashtag,
  )
  for step in twitter_steps:
    tweet = step(tweet)
  _trace("Post Twitter processing tweet: {}", tweet)

  ## Word Features: case, contractions, repeated punctuation/characters, emojis
  word_steps = (
      to_lowercase,
      fix_contractions,
      punct_repetition,
      word_repetition,
      demojize,
  )
  for step in word_steps:
    tweet = step(tweet)
  _trace("Post Word processing tweet: {}", tweet)

  ## Tokenization & Stemming
  # keep_alnum=False keeps only purely alphabetic tokens
  # NOTE(review): demojized names such as ":thumbs_up:" do not appear in the
  # sample output - confirm that dropping them here is intended
  kept_tokens = tokenize(tweet, keep_alnum=False, keep_stop=False)
  return stem_tokens(kept_tokens, SnowballStemmer("english"))

In [59]:
print(process_tweet(complex_tweet, verbose=False))

['hey', 'look', 'big', 'complex', 'tweet', 'i', 'would', 'glad', 'could', 'not', 'normal', 'check', 'let', 'know', 'nlp', 'fun']


## Vectorization