<a href="https://colab.research.google.com/github/romasha-khurshid/walnut-brownies/blob/HateSpeechFilteration/HateSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis Using TextBlob

In [5]:
!pip install textblob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import pandas as pd
from textblob import TextBlob
from nltk.tokenize.toktok import ToktokTokenizer
import re
# Shared tokenizer instance; remove_stopwords() below calls tokenizer.tokenize().
tokenizer=ToktokTokenizer()
import spacy
# Shared spaCy pipeline used by lemmatize_text(); loaded with the 'ner'
# component disabled.
nlp=spacy.load('en_core_web_sm',disable=['ner'])


**Data Loading**

**Data Source**: https://www.kaggle.com/datasets/vkrahul/twitter-hate-speech?select=train_E6oV3lV.csv

In [7]:
# Read the training CSV and keep everything except the `id` column.
# NOTE(review): the path is Colab-specific ("/content/..."); adjust when
# running elsewhere.
data = pd.read_csv("/content/train_E6oV3lV.csv")
filtered_data = data.drop(columns=['id'])

In [None]:
# Show which distinct values occur in the 'label' column.
print(filtered_data['label'].unique())

# Draw 5000 rows with label 1 and 3000 rows with label 0, each sampled with
# replacement and a fixed seed, then stack them into one training frame.
train_label_1 = (
    filtered_data.loc[filtered_data['label'] == 1]
    .sample(n=5000, replace=True, random_state=1)
)
train_label_0 = (
    filtered_data.loc[filtered_data['label'] == 0]
    .sample(n=3000, replace=True, random_state=2)
)
train = pd.concat([train_label_1, train_label_0])
train



In [None]:
from sklearn.utils import shuffle

# Randomise the row order so the two label groups are interleaved.
# A fixed random_state keeps the ordering (and therefore every displayed
# frame below) identical across "Restart & Run All" executions; without it
# each run produced a different permutation.
train = shuffle(train, random_state=42)
train

In [10]:
# Count missing values per column.
train.isna().sum()

label    0
tweet    0
dtype: int64

In [11]:
import numpy as np
# Turn cells that are empty or whitespace-only into NaN, then discard every
# row that contains at least one NaN.
train = (
    train
    .replace(to_replace=r'^\s*$', value=np.nan, regex=True)
    .dropna(axis='index', how='any')
)

In [12]:
# Replace every run of tab / newline / carriage-return characters in the
# string cells of `train` with a single space.
#
# The earlier patterns were mistyped: as regexes they only matched a carriage
# return, a tab followed by a space, a newline preceded by a space, and two
# accidental literal sequences ('r' + tab, newline + '}' + carriage return),
# so most tabs and newlines were left in the text.
# A space (rather than an empty string) is substituted so that words separated
# only by a line break are not glued together.
train = train.replace(to_replace=r'[\t\n\r]+', value=' ', regex=True)
print("escape sequence removed")

escape sequence removed


In [13]:
# Drop every non-ASCII character: encode to ASCII ignoring characters that
# cannot be encoded, then decode the bytes back to text.
train['tweet'] = (
    train['tweet']
    .str.encode(encoding='ascii', errors='ignore')
    .str.decode(encoding='ascii')
)


In [None]:
def remove_punctuation(tweet):
  """Return ``tweet`` with every character of ``string.punctuation`` deleted.

  Nothing is inserted in place of a removed character, so text on either
  side of it is joined directly.
  """
  import string
  # A translation table that maps each punctuation character to "delete".
  strip_table = str.maketrans('', '', string.punctuation)
  return tweet.translate(strip_table)
# Strip punctuation from every tweet, then display the updated frame.
train['tweet'] = train['tweet'].map(remove_punctuation)
train



In [15]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus, then print the English stop-word list
# for inspection.
nltk.download('stopwords')
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
# Stop words that remove_stopwords() filters out. The negations 'no' and
# 'not' are taken off the list so they stay in the tweets.
# list.remove raises ValueError if the word is absent from the list.
stopwords_list=nltk.corpus.stopwords.words('english')
stopwords_list.remove('no')
stopwords_list.remove('not')


In [17]:
from spacy import tokens
def remove_stopwords(tweet, is_lower_case=False):
  """Return ``tweet`` with the words found in ``stopwords_list`` removed.

  The text is split with the module-level ``tokenizer``; each token is
  stripped of surrounding whitespace and the survivors are joined with a
  single space.

  Args:
    tweet: text to filter.
    is_lower_case: when True the tokens are compared to the stop-word list
      as-is; when False (default) each token is lower-cased for the
      comparison only, and kept tokens retain their original casing.
  """
  kept_words = []
  for raw_word in tokenizer.tokenize(tweet):
    word = raw_word.strip()
    probe = word if is_lower_case else word.lower()
    if probe not in stopwords_list:
      kept_words.append(word)
  return ' '.join(kept_words)

In [18]:
# Filter stop words out of every tweet.
# NOTE(review): punctuation is stripped in an earlier cell, so contractions
# such as "you're" arrive here as "youre" and no longer match the
# apostrophised entries of the stop-word list - consider running this step
# before punctuation removal.
train['tweet'] = train['tweet'].map(remove_stopwords)


In [None]:
def remove_special_character(tweet):
  """Return ``tweet`` with every character that is not an ASCII letter,
  a digit 0-9, or whitespace deleted.

  The pattern is a raw string: the previous non-raw ``'\\s'`` relied on an
  invalid string escape, which newer Python versions warn about and plan to
  reject.
  """
  return re.sub(r'[^a-zA-Z0-9\s]', '', tweet)
# Remove the remaining non-alphanumeric, non-whitespace characters from each
# tweet and display the result.
train['tweet'] = train['tweet'].map(remove_special_character)
train

In [20]:
def remove_html(tweet):
  """Replace each ``<...>`` span in ``tweet`` with a single space.

  The match is non-greedy, so every tag-like span is replaced separately.
  """
  import re
  return re.sub('<.*?>', ' ', tweet)

In [None]:
# Apply HTML-tag removal to every tweet and display the result.
# NOTE(review): when the cells run top to bottom, '<' and '>' have already
# been deleted by remove_punctuation, so this step can no longer match
# anything - consider moving it before punctuation removal.
train['tweet'] = train['tweet'].map(remove_html)
train

In [22]:
import re

def remove_URL(tweet):
    """Replace every URL in ``tweet`` with a single space.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace. Non-string input is converted with
    ``str()`` first.
    """
    text = tweet if isinstance(tweet, str) else str(tweet)
    return re.sub(r'https?://\S+|www\.\S+', ' ', text)


In [None]:
# Apply URL removal to every tweet and display the result.
# NOTE(review): when the cells run top to bottom, ':', '/' and '.' have
# already been deleted by remove_punctuation, so the URL pattern can no
# longer match - consider moving this step before punctuation removal.
train['tweet'] = train['tweet'].map(remove_URL)
train

In [None]:
train

In [25]:
def remove_digits(tweet):
    """Return ``tweet`` without any character for which ``str.isdigit()`` is true."""
    kept_chars = []
    for ch in tweet:
        if ch.isdigit():
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

# Delete digit characters from every tweet.
train['tweet'] = train['tweet'].map(remove_digits)

In [None]:
train

In [27]:
def cleanse(word):
    """Return '' when ``word`` contains a digit, otherwise return ``word`` unchanged."""
    # `\D*\d` anchored at the start matches as soon as any digit exists in the word.
    if re.match(r'\D*\d', word):
        return ''
    return word


def remove_alphanumeric(strings):
    """Drop every whitespace-separated word of ``strings`` that contains a digit.

    The surviving words are joined with single spaces. Words that are dropped
    leave no gap behind: the earlier implementation filtered empty results at
    the wrong nesting level and so produced doubled / leading / trailing
    spaces wherever a word had been removed.
    """
    cleaned_words = (cleanse(word) for word in strings.split())
    # filter(None, ...) discards the '' placeholders returned for dropped words.
    return ' '.join(filter(None, cleaned_words))

In [28]:
# Drop digit-bearing words from every tweet.
# NOTE(review): remove_digits is applied in an earlier cell, so no word can
# still contain a digit here; swap the order of the two steps if whole
# alphanumeric words are meant to be removed.
train['tweet'] = train['tweet'].map(remove_alphanumeric)

In [None]:
train

In [30]:
def lemmatize_text(tweet):
    """Return ``tweet`` with every token replaced by its lemma.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and the
    lemmas are joined with single spaces. Tokens whose lemma is the
    ``'-PRON-'`` placeholder (emitted for pronouns by spaCy 2.x) keep their
    original text instead.
    """
    doc = nlp(tweet)
    return ' '.join(
        # Token exposes the surface form as `.text`; the former `.tweet`
        # attribute does not exist and raised AttributeError on this branch.
        token.lemma_ if token.lemma_ != '-PRON-' else token.text
        for token in doc
    )

In [None]:
# Lemmatise every tweet with the spaCy pipeline and display the result.
train['tweet'] = train['tweet'].map(lemmatize_text)
train

In [37]:
# Store TextBlob's `.sentiment` result for each cleaned tweet in a new
# 'sentiments' column.
train['sentiments'] = train['tweet'].apply(lambda text: TextBlob(text).sentiment)

In [38]:
# Plain Python list of the per-tweet sentiment results, in row order.
sentiments_series = train['sentiments'].to_list()

In [39]:
# Column names given to the sentiment DataFrame built in the next cell.
columns=['Polarity','Subjectivity']

In [45]:
# Expand the sentiment results into a two-column frame that shares
# `train`'s index so the two can be aligned row by row.
df = pd.DataFrame(data=sentiments_series, index=train.index, columns=columns)

In [46]:
# Put the sentiment columns side by side with the training frame.
result = pd.concat([df, train], axis='columns')

In [55]:
# Remove the raw 'sentiments' column now that it has been expanded into
# 'Polarity' and 'Subjectivity'. errors='ignore' keeps the cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
result = result.drop(columns=['sentiments'], errors='ignore')

In [63]:
# Tag each row by polarity: below 0.3 is 'Negative', 0.3 and above is
# 'Positive'.
result.loc[result['Polarity'].lt(0.3), 'sentiment'] = 'Negative'
result.loc[result['Polarity'].ge(0.3), 'sentiment'] = 'Positive'

In [None]:
result

In [None]:
# Keep only the rows whose polarity is below 0.3 and display them.
# NOTE(review): the selection relies on TextBlob polarity alone and does not
# consult the dataset's `label` column - confirm this is the intended
# definition of "hate speech".
df_hate_speech = result[result['Polarity'].lt(0.3)]
df_hate_speech