In [None]:
# Pin NLTK to 3.3; the recorded output shows this replacing the preinstalled 3.2.5.
!pip install nltk==3.3

Collecting nltk==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 7.2MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-cp36-none-any.whl size=1394472 sha256=85e990a8e710d7d1fa69cdfee006db99ab9363da5067356b1e12b98ee383250d
  Stored in directory: /root/.cache/pip/wheels/d1/ab/40/3bceea46922767e42986aef7606a600538ca80de6062dc266c
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.3


In [None]:
import nltk
# Fetch the NLTK corpora used later in the notebook: the English stop-word list
# and the WordNet data (presumably required by textblob's lemmatizer - confirm).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier



In [None]:
# Mount Google Drive so the CSV and audio files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the tweet-emotion CSV from the mounted Drive and discard the 'author'
# column, which is not used by the rest of the notebook.
data = (
    pd.read_csv('/content/drive/MyDrive/text_emotion.csv')
    .drop(columns='author')
)

In [None]:
# Preview the first rows to check which columns remain after loading.
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [None]:
# (rows, columns) of the loaded frame, before any label filtering.
data.shape

(40000, 3)

In [None]:
# Drop every tweet whose label is one of the emotions we are not modelling.
# A single isin() mask replaces eleven separate drop() calls; .copy() gives an
# independent frame so the later `data['content'] = ...` assignments do not
# raise SettingWithCopyWarning.
OTHER_EMOTIONS = [
    'anger', 'boredom', 'enthusiasm', 'empty', 'fun', 'relief',
    'surprise', 'love', 'hate', 'neutral', 'worry',
]
data = data[~data.sentiment.isin(OTHER_EMOTIONS)].copy()


In [None]:
# (rows, columns) remaining after the other emotion labels were dropped.
data.shape

(10374, 3)

In [None]:
# Tweets are noisy: countless spelling variants, special characters, and SMS
# lingo / slang that no dictionary covers.
#
# First bring some uniformity to the text: lowercase everything, strip
# punctuation, and remove stop words (prepositions and the like).

# Lowercase every word; split/join also collapses runs of whitespace.
data['content'] = data['content'].apply(
    lambda text: " ".join(word.lower() for word in text.split())
)

# Replace punctuation and symbols with a space. The pattern is a regular
# expression, so regex=True is passed explicitly: pandas 2.0 changed the default
# to regex=False, which would look for the literal text "[^\w\s]" and silently
# leave every tweet unchanged. The raw string avoids the invalid "\w" escape.
data['content'] = data['content'].str.replace(r'[^\w\s]', ' ', regex=True)

# Remove English stop words using NLTK's list. `stop` keeps its original list
# form; the set is only there to make the per-token membership test O(1).
stop = stopwords.words('english')
stop_lookup = set(stop)
data['content'] = data['content'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)


# Lemmatisation: map each word variant (plural forms, past tense, ...) back to
# the base word it represents, so that all variants count as the same token.
data['content'] = data['content'].apply(
    lambda tweet: " ".join(Word(token).lemmatize() for token in tweet.split())
)

#After that, we added code to revert repetition of letters in a word with the assumption that hardly any word 
#has letters repeated more than twice, consecutively.
#Though not very accurate, it can help in some corrections.
#Correcting Letter Repetitions - which are repeated more than twice continuously
# Any single character (except newline) repeated three or more times in a row.
_REPEATED_RUN = re.compile(r"(.)\1{2,}")


def de_repeat(text):
    """Shorten every run of 3+ identical consecutive characters to exactly two.

    Runs of one or two characters are left as they are, so "good" is unchanged
    while "goood" becomes "good" and "ughhhh" becomes "ughh".
    """
    return _REPEATED_RUN.sub(lambda run: run.group(1) * 2, text)

# Squeeze elongated spellings one token at a time, then rebuild each tweet.
data['content'] = data['content'].apply(
    lambda tweet: " ".join(map(de_repeat, tweet.split()))
)

# Words that occur very rarely across the whole dataset are unlikely to help
# determine the sentiment; they tend to be proper nouns and other tokens that
# are insignificant in this context, so they are removed.

# Collect the 10,000 rarest words. value_counts() orders by descending count,
# so the tail of the result holds the least frequent tokens; .iloc makes the
# positional slice explicit.
N_RARE_WORDS = 10000
freq = pd.Series(' '.join(data['content']).split()).value_counts().iloc[-N_RARE_WORDS:]

# Remove those rarely appearing words from every tweet. Membership is tested
# against a set: checking each token against a 10,000-item list is needlessly
# slow.
freq = list(freq.index)
rare_words = set(freq)
data['content'] = data['content'].apply(
    lambda text: " ".join(word for word in text.split() if word not in rare_words)
)

In [None]:
# With the text cleaned, each tweet is reduced to a group of keywords. The next
# stage is feature extraction: turning that text into numeric parameters. Two
# feature sets are considered below, TF-IDF and count vectors.

# Encode the class names as integers: 'happiness' -> 0 and 'sadness' -> 1.
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.sentiment.values)

# Stratified, shuffled 90:10 split into training and validation data.
X_train, X_val, y_train, y_val = train_test_split(
    data.content.values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# TF-IDF gives the relative importance of a term in the data: it weighs how
# frequently the term appears in a tweet against how rarely it appears overall.

# Extract TF-IDF features (up to 1000 word uni-/bi-/tri-grams).
tfidf = TfidfVectorizer(max_features=1000, analyzer='word', ngram_range=(1, 3))
X_train_tfidf = tfidf.fit_transform(X_train)
# Only transform the validation tweets. Calling fit_transform here would re-fit
# the vocabulary and IDF weights on the validation set, so its columns would no
# longer correspond to the features the models were trained on.
X_val_tfidf = tfidf.transform(X_val)

# Count vectors are the second feature set: each tweet becomes an array holding
# the number of times each vocabulary word appears in it. The intuition is that
# texts conveying similar emotions tend to repeat the same words.

# Extract count-vector features. The vocabulary is learned from the training
# split only, so nothing about the validation tweets leaks into the features.
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(X_train)
X_train_count = count_vect.transform(X_train)
X_val_count = count_vect.transform(X_val)

In [None]:
# Train four classifiers on the TF-IDF features and print validation accuracy.
# Each estimator keeps its own module-level name so it can be inspected after
# the cell has run.
nb = MultinomialNB()                                                        # Model 1: Multinomial Naive Bayes
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)    # Model 2: Linear SVM
logreg = LogisticRegression(C=1)                                            # Model 3: Logistic regression
# Model 4: Random forest, seeded so that a re-run reproduces the same score.
rf = RandomForestClassifier(n_estimators=500, random_state=42)

tfidf_models = [
    ('naive bayes tfidf', nb),
    ('svm using tfidf', lsvm),
    ('log reg tfidf', logreg),
    ('random forest tfidf', rf),
]

for label, model in tfidf_models:
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_val_tfidf)
    # accuracy_score expects (y_true, y_pred).
    print('%s accuracy %s' % (label, accuracy_score(y_val, y_pred)))



naive bayes tfidf accuracy 0.5597302504816956
svm using tfidf accuracy 0.5789980732177264
log reg tfidf accuracy 0.5789980732177264
random forest tfidf accuracy 0.5549132947976878


In [None]:

# Now train the same four classifiers on the count-vector features, so their
# validation accuracy can be compared with the TF-IDF accuracies printed above.
# Each estimator keeps its own module-level name; `lsvm` in particular is
# reused later to classify new sentences.
nb = MultinomialNB()                                                        # Model 1: Multinomial Naive Bayes
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)    # Model 2: Linear SVM
logreg = LogisticRegression(C=1)                                            # Model 3: Logistic regression
# Model 4: Random forest, seeded so that a re-run reproduces the same score.
rf = RandomForestClassifier(n_estimators=500, random_state=42)

count_models = [
    ('naive bayes count vectors', nb),
    ('lsvm using count vectors', lsvm),
    ('log reg count vectors', logreg),
    ('random forest with count vectors', rf),
]

for label, model in count_models:
    model.fit(X_train_count, y_train)
    y_pred = model.predict(X_val_count)
    # accuracy_score expects (y_true, y_pred).
    print('%s accuracy %s' % (label, accuracy_score(y_val, y_pred)))

# In the recorded run the best count-vector accuracy was about 78.6% (linear SVM).
# This might be because of the nature of this specific dataset, where the emotion
# of the text depends heavily on the presence of some significant adjectives.


naive bayes count vectors accuracy 0.7736030828516378
lsvm using count vectors accuracy 0.7861271676300579
log reg count vectors accuracy 0.7832369942196532
random forest with count vectors accuracy 0.7620423892100193


In [None]:
!pip install speechrecognition

Collecting speechrecognition
[?25l  Downloading https://files.pythonhosted.org/packages/26/e1/7f5678cd94ec1234269d23756dbdaa4c8cfaed973412f88ae8adf7893a50/SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8MB)
[K     |████████████████████████████████| 32.8MB 119kB/s 
[?25hInstalling collected packages: speechrecognition
Successfully installed speechrecognition-3.8.1


In [None]:
# NOTE(review): nothing in the visible notebook imports or invokes pipwin -
# confirm whether this install is still needed.
!pip install pipwin

Collecting pipwin
  Downloading https://files.pythonhosted.org/packages/08/0f/a7df1770d2dcf99898aee562d6ce866e5dc78a5ccbf4ff25231ece4c99e8/pipwin-0.5.0-py2.py3-none-any.whl
Collecting pyprind
  Downloading https://files.pythonhosted.org/packages/1e/30/e76fb0c45da8aef49ea8d2a90d4e7a6877b45894c25f12fb961f009a891e/PyPrind-2.11.2-py3-none-any.whl
Collecting js2py
[?25l  Downloading https://files.pythonhosted.org/packages/f4/6a/0385641ad1b52aae5c63820277a10e500c19e40fc4df5287f161aa287020/Js2Py-0.70-py3-none-any.whl (605kB)
[K     |████████████████████████████████| 614kB 5.2MB/s 
[?25hCollecting pySmartDL>=1.3.1; python_version >= "3.4"
  Downloading https://files.pythonhosted.org/packages/ac/6a/582286ea74c54363cba30413214767904f0a239e12253c3817feaf78453f/pySmartDL-1.3.4-py3-none-any.whl
Collecting beautifulsoup4>=4.9.0
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB

In [None]:
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libasound2-dev is already the newest version (1.1.3-5ubuntu0.5).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
Suggested packages:
  portaudio19-doc
The following NEW packages will be installed:
  libportaudio2 libportaudiocpp0 portaudio19-dev
0 upgraded, 3 newly installed, 0 to remove and 14 not upgraded.
Need to get 184 kB of archives.
After this operation, 891 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.0-1 [64.6 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudiocpp0 amd64 19.6.0-1 [15.1 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 portaudio19-dev amd64 19.6.0-1 [104 kB]
Fetched 184 kB in 1s (236 kB/s)
Selecting previously unselected package libportaudio2:amd64.
(Reading database ... 144865 files and directories currently installed.)
Preparing to 

In [None]:
!pip install pyaudio

Collecting pyaudio
  Downloading https://files.pythonhosted.org/packages/ab/42/b4f04721c5c5bfc196ce156b3c768998ef8c0ae3654ed29ea5020c749a6b/PyAudio-0.2.11.tar.gz
Building wheels for collected packages: pyaudio
  Building wheel for pyaudio (setup.py) ... [?25l[?25hdone
  Created wheel for pyaudio: filename=PyAudio-0.2.11-cp36-cp36m-linux_x86_64.whl size=51612 sha256=5631798ffb71fb8455d527ed5b5fe42ca894fa83948c20768ccba48dcc67b1bf
  Stored in directory: /root/.cache/pip/wheels/f4/a8/a4/292214166c2917890f85b2f72a8e5f13e1ffa527c4200dcede
Successfully built pyaudio
Installing collected packages: pyaudio
Successfully installed pyaudio-0.2.11


In [None]:
!pip install pyttsx3

Collecting pyttsx3
  Downloading https://files.pythonhosted.org/packages/33/9a/de4781245f5ad966646fd276259ef7cfd400ba3cf7d5db7c0d5aab310c20/pyttsx3-2.90-py3-none-any.whl
Installing collected packages: pyttsx3
Successfully installed pyttsx3-2.90


In [None]:
import os,glob

In [None]:
# Disabled code kept as a bare string literal: it is never executed, and the
# cell merely echoes the string. The text inside would transcribe every .wav
# file under the happy-sad Drive folder.
"""import speech_recognition as sr
print(sr.__version__)
r = sr.Recognizer()
for file in glob.glob("/content/drive/MyDrive/happy-sad/*.wav"):
        #file_name=os.path.basename(file)
        print(file)
        file_audio = sr.AudioFile(file)
        with file_audio as source:
          audio_text = r.record(source)
        print(type(audio_text))
        print(r.recognize_google(audio_text))"""

'import speech_recognition as sr\nprint(sr.__version__)\nr = sr.Recognizer()\nfor file in glob.glob("/content/drive/MyDrive/happy-sad/*.wav"):\n        #file_name=os.path.basename(file)\n        print(file)\n        file_audio = sr.AudioFile(file)\n        with file_audio as source:\n          audio_text = r.record(source)\n        print(type(audio_text))\n        print(r.recognize_google(audio_text))'

In [None]:
import speech_recognition as sr

print(sr.__version__)

# Read the entire WAV file from Drive into an in-memory audio object.
r = sr.Recognizer()
file_audio = sr.AudioFile("/content/drive/MyDrive/nlp22.wav")
with file_audio as source:
    audio_text = r.record(source)
print(type(audio_text))

# Transcribe the recording with the recognizer's Google backend and show the text.
transcript = r.recognize_google(audio_text)
print(transcript)

3.8.1
<class 'speech_recognition.AudioData'>
it is Christmas and latest all enjoy I am very excited about the kids there will be sweet and dance. Cakes will be wonderful but I am so sad that I can't see my friends I will miss them definitely I want to enjoy with my friend it will be a lot more fun in college


In [None]:
# Below are 8 statements. The first 4 depict happiness. The last 4 depict sadness.
# Label encoding: happiness = 0, sadness = 1.
tweets = pd.DataFrame(['It is Christmas and let us all enjoy', 
                       'I am very excited about the gifts', 
                       'There will be sweets and dance', 
                       'Oh, the cakes will be wonderful', 
                       'But I am so sad that I cant see my friends', 
                       'I will miss then definitely',
                       'I want to enjoy with my friends',
                       'It will be a lot more fun in college'])

# Apply the same clean-up that was applied to the training tweets, in the same
# order. Lowercasing must come first: NLTK's stop-word list is lowercase, so
# capitalised words such as "It" or "But" would otherwise survive the filter.
tweets[0] = tweets[0].str.lower()
# regex=True is required: pandas 2.0 treats the pattern as literal text by default.
tweets[0] = tweets[0].str.replace(r'[^\w\s]', ' ', regex=True)
stop = stopwords.words('english')
stop_lookup = set(stop)  # O(1) membership test per token
tweets[0] = tweets[0].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
tweets[0] = tweets[0].apply(
    lambda text: " ".join(Word(word).lemmatize() for word in text.split())
)
tweets[0] = tweets[0].apply(
    lambda text: " ".join(de_repeat(word) for word in text.split())
)

# Extract count-vector features with the vectorizer fitted earlier.
tweet_count = count_vect.transform(tweets[0])

# Predict the emotion of each statement using the already trained linear SVM.
tweet_pred = lsvm.predict(tweet_count)
print(tweet_pred)


[0 0 0 0 1 1 1 0]
