# Emotion Detection of Twitter Data
## Dataset used:
### Sentiment140 dataset with 1.6 million tweets (Kaggle)

### Libraries

In [1]:
import pandas as pd
import preprocessor as p
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
cols = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv("../../Data/training_1600000.csv", names=cols, encoding="ISO-8859-1")
# df.columns = ["target", "ids", "date", "flag", "user", "text"]
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
df.tail()

Unnamed: 0,target,ids,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [4]:
print(df["target"].value_counts())
df.replace({"target": {4:1}}, inplace=True)

target
0    800000
4    800000
Name: count, dtype: int64


In [5]:
# Dividing the input text data and the associated output sentiment vector
twts, y = df["text"][600000: 1000000], df["target"][600000: 1000000]

### 1. Using the tweet-preprocessor library to clean the data (like removing #tags, @mentions, emojis, smileys, reserved words) <br>Then lowercasing the words and finally resolving the contractions.
### 2. Preprocessing the data by tokenizing, removing StopWords and Lemmatizing the words in each sentences

In [6]:
# def clean_twts(twts):
#     # applying tweet cleaning, then lowercasing and finally resolving contractions
#     cleaned_twts = twts.map(lambda txt: contractions.fix(p.clean(txt).lower()))
#     # removing special characters
#     cleaned_twts = cleaned_twts.map(lambda txt: ''.join(word if word.isalpha() or word.isspace() else ' ' for word in txt))

#     return cleaned_twts

def process_twts(twts):
    # applying tweet cleaning, then lowercasing and finally resolving contractions
    # removing special characters
    cleaned_twts = twts.map(lambda txt: ''.join(word if word.isalpha() or word.isspace() else ' ' for word in contractions.fix(p.clean(txt).lower())))
    # Tokenizing each sentences
    tokenized_twts = cleaned_twts.map(word_tokenize)
    # Removing Stop Words from sentences
    # Lemmatizing words in text
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    lemmatiized_twts = tokenized_twts.map(lambda txt: [lemmatizer.lemmatize(word) for word in txt if word not in stop_words])

    return lemmatiized_twts

In [7]:
# Processing all tweets by tokenizing, removing Stopwords and Lemmatizing words
processed_twts = process_twts(twts)

### Checking the processed data

In [8]:
i = 800000
print("Raw: ", twts[i])
print("Processed: ", processed_twts[i])

Raw:  I LOVE @Health4UandPets u guys r the best!! 
Processed:  ['love', 'guy', 'r', 'best']


### Splitting the dataset

In [9]:
x_train, x_test, y_train, y_test = train_test_split(processed_twts, y, test_size=0.2, stratify=y, random_state=5)
print(x_train.shape, x_test.shape)

(320000,) (80000,)


## Vectorizing the dataset using Tf-Idf value according to train data

In [10]:
# This is just to pass by the tokenizing in TfidfVectorizer class
def tokeniz(token):
    return token

vectorizer_uni = TfidfVectorizer(tokenizer=tokeniz, lowercase=False)
vectorizer_uni_bi = TfidfVectorizer(tokenizer=tokeniz, ngram_range=(1, 2), lowercase=False)

x_train_vectorized_uni = vectorizer_uni.fit_transform(x_train)
x_train_vectorized_uni_bi = vectorizer_uni_bi.fit_transform(x_train)
x_test_vectorized_uni = vectorizer_uni.transform(x_test)
x_test_vectorized_uni_bi = vectorizer_uni_bi.transform(x_test)



In [11]:
print(x_train_vectorized_uni[:5].toarray())
print(x_train_vectorized_uni.shape)
print(vectorizer_uni.get_feature_names_out())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(320000, 94075)
['aa' 'aaa' 'aaaa' ... 'zzzzzzzzzzzzzzzzzz'
 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz']


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
logRegModel_uni = LogisticRegression(max_iter=1000)
logRegModel_uni_bi = LogisticRegression(max_iter=1000)
NBModel_uni = GaussianNB()
NBModel_uni_bi = GaussianNB()

In [13]:
logRegModel_uni.fit(x_train_vectorized_uni, y_train)
logRegModel_uni_bi.fit(x_train_vectorized_uni_bi, y_train)

In [77]:
NBModel_uni.fit(x_test_vectorized_uni.toarray(), y_train)
NBModel_uni_bi.fit(x_test_vectorized_uni_bi.toarray(), y_train)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [14]:
y_pred_logReg_uni = logRegModel_uni.predict(x_test_vectorized_uni)
y_pred_logReg_uni_bi = logRegModel_uni_bi.predict(x_test_vectorized_uni_bi)

In [15]:
from sklearn.metrics import classification_report
print("Logistic regression using unigram report: \n", classification_report(y_test, y_pred_logReg_uni))
print("Logistic regression using bigram report: \n", classification_report(y_test, y_pred_logReg_uni_bi))

Logistic regression using unigram report: 
               precision    recall  f1-score   support

           0       0.79      0.76      0.78     40000
           1       0.77      0.80      0.78     40000

    accuracy                           0.78     80000
   macro avg       0.78      0.78      0.78     80000
weighted avg       0.78      0.78      0.78     80000

Logistic regression using bigram report: 
               precision    recall  f1-score   support

           0       0.79      0.78      0.79     40000
           1       0.79      0.80      0.79     40000

    accuracy                           0.79     80000
   macro avg       0.79      0.79      0.79     80000
weighted avg       0.79      0.79      0.79     80000



In [82]:
print(logRegModel_uni.predict(vectorizer_uni.transform(process_twts(pd.Series(["I do not want to do this job"])))))

[0]


### test purpose

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "hello there whats up i am xyz",
    "hello this is my name",
    "this is my github",
    "i am a bad boy"
]

vec = TfidfVectorizer(ngram_range=(1, 1))
x = vec.fit_transform(corpus)

In [40]:
print(x.toarray())
print(vec.get_feature_names_out())
print(vec.vocabulary_)


[[0.34431452 0.         0.         0.         0.34431452 0.
  0.         0.         0.43671931 0.         0.43671931 0.43671931
  0.43671931]
 [0.         0.         0.         0.         0.4222466  0.4222466
  0.4222466  0.53556627 0.         0.4222466  0.         0.
  0.        ]
 [0.         0.         0.         0.59081908 0.         0.46580855
  0.46580855 0.         0.         0.46580855 0.         0.
  0.        ]
 [0.48693426 0.61761437 0.61761437 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]]
['am' 'bad' 'boy' 'github' 'hello' 'is' 'my' 'name' 'there' 'this' 'up'
 'whats' 'xyz']
{'hello': 4, 'there': 8, 'whats': 11, 'up': 10, 'am': 0, 'xyz': 12, 'this': 9, 'is': 5, 'my': 6, 'name': 7, 'github': 3, 'bad': 1, 'boy': 2}


In [13]:
minLen = 140
maxLen = 0
sum = 0
for i, txt in enumerate(processed_twts):
    length = len(txt)
    sum += length
    if length < minLen:
        minLen = length
        print(i, " min: ", txt)
    if length > maxLen:
        maxLen = length
        print(i, " max: ", txt)

print(minLen, maxLen)
print("average: ", sum//1600000)

0  min:  ['awww', 'bummer', 'shoulda', 'got', 'david', 'carr', 'third', 'day']
0  max:  ['awww', 'bummer', 'shoulda', 'got', 'david', 'carr', 'third', 'day']
1  max:  ['upset', 'update', 'facebook', 'texting', 'might', 'cry', 'result', 'school', 'today', 'also', 'blah']
3  min:  ['whole', 'body', 'feel', 'itchy', 'like', 'fire']
4  min:  ['behaving', 'mad', 'see']
5  min:  ['whole', 'crew']
8  min:  ['nope']
31  max:  ['want', 'go', 'promote', 'gear', 'groove', 'unfornately', 'ride', 'may', 'b', 'going', 'one', 'anaheim', 'may', 'though']
39  max:  ['bed', 'class', '12', 'work', '3', 'gym', '5', 'class', '10', 'another', 'day', 'going', 'fly', 'miss', 'girlfriend']
57  max:  ['sad', 'feeling', 'dallas', 'going', 'show', 'got', 'say', 'though', 'would', 'think', 'show', 'would', 'use', 'music', 'game', 'mmm']
83  min:  []
126  max:  ['wah', 'see', 'clip', 'must', 'el', 'stupido', 'work', 'filter', 'wait', 'till', 'get', 'puter', 'something', 'else', 'blame', 'ex', 'broke', 'mine']
679  

In [3]:
print(p.clean("Hello there @subhodip :) that is great 241 www.ddg.com https://www.ddg.com abc@ddg.com"))

Hello there that is great www.ddg.com abc.com
