<a href="https://colab.research.google.com/github/shila121/NLP_ml/blob/main/speech_sentiments_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem Statement: love or hate (sentiment) analysis

In [2]:
# load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#1. load the dataset
dataset = pd.read_csv('final_dataset_basicmlmodel.csv')

print(dataset.head())


   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation


In [3]:
# label column indicates hate or love speech
# tweet column represents text sentences containing sentiments of the users

for index,tweet in enumerate(dataset['tweet'][10:20]):
  print(index+1,'.',tweet)

1 .  â #ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   #blog #silver #gold #forex
2 . we are so selfish. #orlando #standwithorlando #pulseshooting #orlandoshooting #biggerproblems #selfish #heabreaking   #values #love #
3 . i get to see my daddy today!!   #80days #gettingfed
4 . ouch...junior is angryð#got7 #junior #yugyoem   #omg 
5 . i am thankful for having a paner. #thankful #positive     
6 . its #friday! ð smiles all around via ig user: @user #cookies make people   
7 . as we all know, essential oils are not made of chemicals. 
8 . #euro2016 people blaming ha for conceded goal was it fat rooney who gave away free kick knowing bale can hit them from there.  
9 . sad little dude..   #badday #coneofshame #cats #pissed #funny #laughs 
10 . product of the day: happy man #wine tool  who's   it's the #weekend? time to open up &amp; drink up!


In [7]:
# 2.Data cleaning/Text cleaning
#  '#' needs to be cleaned
# unicode symbols are there that needs to be removed.
# numeric percentage symbols are there that needs to be removed

import re

def clean_text(text):

  # filter to allow only alphabets
  text = re.sub(r'[^a-zA-Z\']',' ',text)

  # remove unicode characters
  text = re.sub(r'[^x00-\x7F]+',' ',text)

  # convert to lower case to maintain consistency
  text = text.lower()

  return text


In [8]:
dataset['clean_text'] = dataset.tweet.apply(lambda x:clean_text(x))
dataset

Unnamed: 0,id,label,tweet,clean_text
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can t use ...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation
...,...,...,...,...
5237,31935,1,lady banned from kentucky mall. @user #jcpenn...,lady banned from kentucky mall user jcpenny ke...
5238,31947,1,@user omfg i'm offended! i'm a mailbox and i'...,user omfg i m offended i m a mailbox and i m ...
5239,31948,1,@user @user you don't have the balls to hashta...,user user you don t have the balls to hashtag...
5240,31949,1,"makes you ask yourself, who am i? then am i a...",makes you ask yourself who am i then am i any...


In [9]:
# 3.Feature engineering
#Exhaustive list of stopwords in the english language. We want to focus less on these so at some point will have to filter
STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an', 'and',
              'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
              'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'com', 'could', "couldn't", 'did',
              "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'else', 'ever',
              'few', 'for', 'from', 'further', 'get', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having',
              'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how',
              "how's", 'however', 'http', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
              "it's", 'its', 'itself', 'just', 'k', "let's", 'like', 'me', 'more', 'most', "mustn't", 'my', 'myself',
              'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'otherwise', 'ought', 'our', 'ours',
              'ourselves', 'out', 'over', 'own', 'r', 'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's",
              'should', "shouldn't", 'since', 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs',
              'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
              'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
              'www', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']



In [11]:
# generate word frequency

def gen_freq(text):

  word_list = []

  for i in text.split():
    word_list.extend(i)

  word_freq = pd.Series(word_list).value_counts()

  word_freq = word_freq.drop(STOP_WORDS,errors = 'ignore')

  return word_freq

# check if any negation term is present
def any_neg(words):
  for word in words:
    if word in ['n', 'no', 'non', 'not'] or re.search(r"\wn't", word):
      return 1
    else:
      return 0

# check whethr one of the 100 rare words are present

def any_rare(words,rare_100):
  for word in words:
    if word in rare_100:
      return 1
    else:
      return 0

# check whether prompt words are present 
def is_question(words):
  for word in words:
    if word in ['when', 'what', 'how', 'why', 'who']:
      return 1
    else:
      return 0

      



In [12]:
word_freq = gen_freq(dataset.clean_text.str)
#100 most rare words in the dataset
rare_100 = word_freq[-100:]
#Number of words in a tweet
dataset['word_count'] = dataset.clean_text.str.split().apply(lambda x: len(x))
#Negation present or not
dataset['any_neg'] = dataset.clean_text.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
dataset['is_question'] = dataset.clean_text.str.split().apply(lambda x: is_question(x))
#Any of the most 100 rare words present or not
dataset['any_rare'] = dataset.clean_text.str.split().apply(lambda x: any_rare(x, rare_100))
#Character count of the tweet
dataset['char_count'] = dataset.clean_text.apply(lambda x: len(x))

In [13]:
#Top 10 common words are
gen_freq(dataset.clean_text.str)[:10]

user      3351
s          608
amp        439
t          418
love       320
day        259
trump      225
happy      208
will       191
people     189
dtype: int64

In [14]:
dataset.head()

Unnamed: 0,id,label,tweet,clean_text,word_count,any_neg,is_question,any_rare,char_count
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so...,18,0,0,0,97
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can t use ...,21,0,0,0,114
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0,20
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur,12,0,0,0,47
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0,34


In [15]:
# 4.split the dataset to train_test
X = dataset[['word_count', 'any_neg', 'any_rare', 'char_count', 'is_question']]
y = dataset.label

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.1,random_state = 27)



In [16]:
# import the machine learning model
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model = model.fit(X_train,y_train)

# make predictions
y_pred = model.predict(X_test)


In [18]:
# 5. Evaluate the model
from sklearn.metrics import accuracy_score

print('accuracy:',accuracy_score(y_test,y_pred)*100,'%')


accuracy: 57.52380952380952 %


In [19]:
print('accuracy:',accuracy_score(y_pred,y_test)*100,'%')

accuracy: 57.52380952380952 %
