<a href="https://colab.research.google.com/github/sayarghoshroy/Hate-Speech-Detection/blob/master/HASOC_tweet_engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv
import re

In [2]:
# Uncomment if you're running it for the first time
# !pip install ekphrasis
from ekphrasis.classes.segmenter import Segmenter
# Build a word segmenter backed by ekphrasis' Twitter corpus word statistics.
# It is reused further down to split hashtags into their component words.
# Constructing it loads the Twitter 1-gram and 2-gram tables (see the cell output).
seg_tw = Segmenter(corpus = "twitter")

Reading twitter - 1grams ...
Reading twitter - 2grams ...


In [3]:
# Uncomment if you're running it for the first time
# !pip install tweet-preprocessor
import preprocessor as tweet_proc

In [4]:
# Uncomment if you're running it for the first time
# !pip install emot
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

#### *Raw Datasets are hosted [here](https://drive.google.com/drive/folders/1TuHRQQ41lK9oXJhlhsRiMjMnczuKn_kF?usp=sharing).*

In [5]:
from google.colab import drive
# Mount Google Drive so the dataset paths under "/content/drive/My Drive/..." resolve.
# NOTE(review): force_remount presumably re-mounts even when Drive is already attached — confirm against Colab docs.
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [6]:
def make_list(proc_obj):
  """Collect the ``match`` attribute of every unit in ``proc_obj``.

  Args:
    proc_obj: An iterable of objects exposing a ``match`` attribute, or
      ``None`` when there is nothing to collect.

  Returns:
    A new list with one ``match`` value per unit, in iteration order.
    An empty list is returned when ``proc_obj`` is ``None`` or empty.
  """
  # Identity check on purpose: `== None` would dispatch to the argument's
  # own __eq__, which may answer True for objects that are not None.
  if proc_obj is None:
    return []

  return [unit.match for unit in proc_obj]

In [7]:
def emotext(text):
    """Replace every emoji found in ``text`` with a textual description.

    Each key of ``UNICODE_EMO`` that occurs in ``text`` is substituted by its
    mapped description with commas and colons stripped and the remaining
    whitespace-separated words joined by underscores.

    Args:
        text: The string to convert.

    Returns:
        ``text`` with every known emoji replaced by its description.
    """
    # Longest keys first: if the table holds both a short emoji and a longer
    # sequence that contains it (e.g. a modifier variant), replacing the short
    # one first would split the longer sequence and garble its description.
    # sorted() is stable, so keys of equal length keep their table order.
    for emot in sorted(UNICODE_EMO, key=len, reverse=True):
        if emot not in text:
            continue
        description = UNICODE_EMO[emot].replace(",", "").replace(":", "")
        text = text.replace(emot, "_".join(description.split()))
    return text

In [8]:
# Dataset selection: exactly one `file_name` assignment should be active.

# For English
file_name = "/content/drive/My Drive/HASOC_raw_data/2019_data/english_dataset/english_dataset/english_dataset.tsv"

# For Hindi
# file_name = "/content/drive/My Drive/HASOC_raw_data/2019_data/hindi_dataset/hindi_dataset/hindi_dataset.tsv"

# For German
# file_name = "/content/drive/My Drive/HASOC_raw_data/2019_data/german_dataset/german_dataset/german_dataset.tsv"

# Set `german` as True while using the German dataset
# It tells the loader below that there is no task 3 column to read
german = False

# All containers are (re)initialised here so re-running the cell never
# accumulates rows from a previous run.
datapoints_count = 0

# Original tweet text and the text left after tweet-preprocessor cleaning
tweets = []
raw_tweet_texts = []

# Per-tweet lists of the entities found by tweet-preprocessor's parser
hashtags = []
smileys = []
emojis = []
urls = []
mentions = []
numbers = []
reserveds = []

# Per-tweet labels; `task_3_labels` stays empty when `german` is True
task_1_labels = []
task_2_labels = []
task_3_labels = []

# The encoding is stated explicitly so the result does not depend on the
# platform default, and newline="" is what the csv module asks for so that
# it can handle line endings inside fields itself.
with open(file_name, encoding = "utf-8", newline = "") as file:
  file_reader = csv.reader(file, delimiter = "\t")

  # The first row is the column header, not a datapoint
  next(file_reader, None)

  for line in file_reader:
    # csv.reader yields an empty list for a blank line (e.g. a trailing one)
    if not line:
      continue

    # Read every needed column before appending anything, so a short row
    # raises without leaving the parallel lists with different lengths.
    tweet_text = line[1]
    task_1_label = line[2]
    task_2_label = line[3]
    task_3_label = None if german else line[4]

    datapoints_count += 1

    task_1_labels.append(task_1_label)
    task_2_labels.append(task_2_label)
    if not german:
      task_3_labels.append(task_3_label)

    tweets.append(tweet_text)
    raw_tweet_texts.append(tweet_proc.clean(tweet_text))

    parse_obj = tweet_proc.parse(tweet_text)

    hashtags.append(make_list(parse_obj.hashtags))
    smileys.append(make_list(parse_obj.smileys))
    emojis.append(make_list(parse_obj.emojis))
    urls.append(make_list(parse_obj.urls))
    mentions.append(make_list(parse_obj.mentions))
    numbers.append(make_list(parse_obj.numbers))
    reserveds.append(make_list(parse_obj.reserved))

print("Number of Datapoints: " + str(datapoints_count))

Number of Datapoints: 5852


In [9]:
# Viewing Created Dataset
# Prints the first five entries of every per-tweet field under its own heading.

preview_fields = (
  ("Tweets:", tweets),
  ("Raw Texts:", raw_tweet_texts),
  ("Hashtags:", hashtags),
  ("Smileys:", smileys),
  ("Emojis:", emojis),
  ("Urls:", urls),
  ("Mentions:", mentions),
  ("Numbers:", numbers),
  ("Reserved Words:", reserveds),
)

for field_title, field_values in preview_fields:
  print(field_title)
  print(field_values[0: 5])

# The label columns share a single heading; task 3 is left out when the
# German flag is set.
print("Task Labels:")
label_columns = [task_1_labels, task_2_labels]
if german != True:
  label_columns.append(task_3_labels)

for label_column in label_columns:
  print(label_column[0: 5])

Tweets:
["#DhoniKeepsTheGlove | WATCH: Sports Minister Kiren Rijiju issues statement backing MS Dhoni over 'Balidaan Badge', tells BCCI to take up the matter with ICC and keep government in the know as nation's pride is involved    https://t.co/zuo5335Rjr", '@politico No. We should remember very clearly that #Individual1 just admitted to treason . #TrumpIsATraitor  #McCainsAHero #JohnMcCainDay', '@cricketworldcup Guess who would be the winner of this #CWC19?     Team who gets maximum points from the abandoned matches 😄 #ShameOnICC #WIvsENG @ICC', "Corbyn is too politically intellectual for #BorisJohnsonShouldNotBePM   Can't wait   #GeneralElectionNow https://t.co/pt8KmjfxJj", 'All the best to #TeamIndia for another swimming competition on Sunday against #Pakistan.     #INDvPAK #ShameOnICC  #CWC19 #CWC19Rains ☔☔ https://t.co/MG2cIE0zib']
Raw Texts:
["| WATCH: Sports Minister Kiren Rijiju issues statement backing MS Dhoni over 'Balidaan Badge', tells BCCI to take up the matter with ICC a

In [10]:
# Generating Emoji Texts
# One list of descriptions per tweet, parallel to `emojis`: every emoji
# collected for a tweet is passed through `emotext`.
emoji_texts = [
  [emotext(symbol) for symbol in emo_list]
  for emo_list in emojis
]

print("Emoji Descriptions:")
print(emoji_texts[0: 5])

Emoji Descriptions:
[[], [], ['smiling_face_with_open_mouth_&_smiling_eyes'], [], ['umbrella_with_rain_drops', 'umbrella_with_rain_drops']]


In [11]:
# Segmenting Hashtags
# One list per tweet, parallel to `hashtags`: each tag has its first
# character (the hash symbol) dropped and is then handed to the segmenter.
segmented_hashtags = [
  [seg_tw.segment(tag[1:]) for tag in hashset]
  for hashset in hashtags
]

print("Segmented Hashtags:")
print(segmented_hashtags[0: 5])

Segmented Hashtags:
[['dhoni keeps the glove'], ['individual 1', 'trump is a traitor', 'mc cains a hero', 'john mc cain day'], ['cwc 19', 'shame on icc', 'w ivs eng'], ['boris johnson should not be pm', 'general election now'], ['team india', 'pakistan', 'in dv pak', 'shame on icc', 'cwc 19', 'cwc 19 rains']]


#### **Comments**

#### *English*

- Everything Works Fine
- In hashtag segmentation, abbreviation-style tags like "WIvsIND" are split incorrectly (e.g. "WIvsENG" becomes "w ivs eng")

#### *Hindi*

- Raw text field does not capture anything
- Hashtags, Mentions, and URLs are captured nicely

#### *German*

- No data for task 3

In [12]:
# ^_^ Thank You