# Emotion Detection of Twitter Data
## Dataset used:
### Sentiment140 dataset with 1.6 million tweets (Kaggle)

### Libraries

In [1]:
import pandas as pd
import preprocessor as p
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
cols = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv("../../Data/training_1600000.csv", names=cols, encoding="ISO-8859-1")
# df.columns = ["target", "ids", "date", "flag", "user", "text"]
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
df.tail()

Unnamed: 0,target,ids,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [8]:
# Dividing the input text data and the associated output sentiment vector
twts, y = df["text"], df["target"]

### 1. Using the tweet-preprocessor library to clean the data (like removing #tags, @mentions, emojis, smileys, reserved words) <br>Then lowercasing the words and finally resolving the contractions.
### 2. Preprocessing the data by tokenizing, removing StopWords and Lemmatizing the words in each sentences

In [9]:
def clean_twts(twts):
    # applying tweet cleaning, then lowercasing and finally resolving contractions
    cleaned_twts = twts.map(lambda txt: contractions.fix(p.clean(txt).lower()))
    # removing special characters
    cleaned_twts = cleaned_twts.map(lambda txt: ''.join(word if word.isalnum() or word.isspace() else ' ' for word in txt))

    return cleaned_twts

def process_twts(twts):
    # Tokenizing each sentences
    tokenized_twts = twts.map(word_tokenize)
    # Removing Stop Words from sentences
    # Lemmatizing words in text
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    lemmatiized_twts = tokenized_twts.map(lambda txt: [lemmatizer.lemmatize(word) for word in txt if word not in stop_words])

    return lemmatiized_twts

In [10]:
# Cleaning all tweets
cleaned_twts = clean_twts(twts)

In [11]:
# Processing all tweets by tokenizing, removing Stopwords and Lemmatizing words
processed_twts = process_twts(cleaned_twts)

In [12]:
i = 10000
print("Raw: ", twts[i])
print("Cleaned: ", cleaned_twts[i])
print("Processed: ", processed_twts[i])

Raw:  I think there's a problem with the ISP in this area or something...my connection go too slow to do anything online yesterday &amp; today 
Cleaned:  i think there is a problem with the isp in this area or something   my connection go too slow to do anything online yesterday  amp  today
Processed:  ['think', 'problem', 'isp', 'area', 'something', 'connection', 'go', 'slow', 'anything', 'online', 'yesterday', 'amp', 'today']


In [31]:
from nltk.stem import PorterStemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
print(lemmatizer.lemmatize("beautiful"))
print(stemmer.stem("beautiful"))

beautiful
beauti


In [80]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\subho\AppData\Roaming\nltk_data...


True

In [13]:
minLen = 140
maxLen = 0
sum = 0
for i, txt in enumerate(processed_twts):
    length = len(txt)
    sum += length
    if length < minLen:
        minLen = length
        print(i, " min: ", txt)
    if length > maxLen:
        maxLen = length
        print(i, " max: ", txt)

print(minLen, maxLen)
print("average: ", sum//1600000)

0  min:  ['awww', 'bummer', 'shoulda', 'got', 'david', 'carr', 'third', 'day']
0  max:  ['awww', 'bummer', 'shoulda', 'got', 'david', 'carr', 'third', 'day']
1  max:  ['upset', 'update', 'facebook', 'texting', 'might', 'cry', 'result', 'school', 'today', 'also', 'blah']
3  min:  ['whole', 'body', 'feel', 'itchy', 'like', 'fire']
4  min:  ['behaving', 'mad', 'see']
5  min:  ['whole', 'crew']
8  min:  ['nope']
31  max:  ['want', 'go', 'promote', 'gear', 'groove', 'unfornately', 'ride', 'may', 'b', 'going', 'one', 'anaheim', 'may', 'though']
39  max:  ['bed', 'class', '12', 'work', '3', 'gym', '5', 'class', '10', 'another', 'day', 'going', 'fly', 'miss', 'girlfriend']
57  max:  ['sad', 'feeling', 'dallas', 'going', 'show', 'got', 'say', 'though', 'would', 'think', 'show', 'would', 'use', 'music', 'game', 'mmm']
83  min:  []
126  max:  ['wah', 'see', 'clip', 'must', 'el', 'stupido', 'work', 'filter', 'wait', 'till', 'get', 'puter', 'something', 'else', 'blame', 'ex', 'broke', 'mine']
679  

In [3]:
print(p.clean("Hello there @subhodip :) that is great 241 www.ddg.com https://www.ddg.com abc@ddg.com"))

Hello there that is great www.ddg.com abc.com
