In [1]:
from collections import Counter
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
import fuzzywuzzy
from fuzzywuzzy import fuzz
from  nltk import word_tokenize
import torch.optim as optim



# Preprocessing

In [3]:
def isEnglish(s):
    """Return True when ``s`` contains only ASCII characters, otherwise False.

    Despite the name, no language detection happens here: the text is encoded
    as UTF-8 and the resulting bytes are decoded as ASCII, so any non-ASCII
    character (Devanagari letters, emoji, accented Latin letters, ...) makes
    the result False.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        # At least one byte lies outside the 7-bit ASCII range.
        return False
    return True
    

def remove_emoji(string):
    """Return ``string`` with every run of characters from the ranges below removed.

    NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    contains many non-emoji code points (for example the CJK ideographs);
    confirm that stripping those is acceptable for the data being cleaned.
    """
    emoji_ranges = "".join([
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    ])
    # One character class matching one or more consecutive listed characters.
    emoji_pattern = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [4]:
# train dataset preprocess

f = "english/agr_en_train.csv"

# Read the English CSV with explicit column names, so the first row of the
# file is treated as data rather than as a header.
df = pd.read_csv(f, names=['source', 'comment', 'annotation'], encoding='UTF-8')
# Trim surrounding whitespace.  A missing comment becomes "" so that the
# .lower() call in the loop below never receives a float NaN.
df['comment'] = df.comment.str.strip().fillna('')

# Split the frame into parallel arrays of comment texts and their labels.
comments = np.asarray(df['comment'])
tags = np.asarray(df['annotation'])
print(len(comments))
print(len(tags))

stop_words = set(stopwords.words('english'))  # English stop-word list
# Built once, outside the loop: the tokenizer does not depend on the comment.
# It keeps runs of word characters, which drops punctuation while tokenizing.
tokenizer = nltk.RegexpTokenizer(r"\w+")

# One list of tokens per comment, in the same order as `comments` / `tags`.
processed_tokens = []
for comment in comments:
    comment = comment.lower()                             # lower-case the comment
    Digit_REMOVAL = re.sub(r'[0-9]+', '', comment)        # strip digits
    URL_REMOVAL = re.sub(r"http\S+", "", Digit_REMOVAL)   # strip URLs starting with "http"
    new_words = tokenizer.tokenize(URL_REMOVAL)
    # Keep only the tokens that are not English stop words.
    processed_tokens.append([word for word in new_words if word not in stop_words])

11999
11999


In [5]:
# Display the whitespace-stripped comment column of the English training frame.
df['comment']

0        Well said sonu..you have courage to stand agai...
1        Most of Private Banks ATM's Like HDFC, ICICI e...
2           Now question is, Pakistan will adhere to this?
3        Pakistan is comprised of fake muslims who does...
4        ??we r against cow slaughter,so of course it w...
                               ...                        
11994    They belong to you flight dirty terrorist coun...
11995    Really motivating programme, congratulations t...
11996                                      fabricated news
11997                 What's wrong with you secular idiots
11998    Looks like inevitable after all political hard...
Name: comment, Length: 11999, dtype: object

In [6]:
# Display the per-comment token lists built by the preprocessing loop above
# (lower-cased, digits and URLs removed, English stop words dropped).
# NOTE(review): this renders the whole list; a slice would keep the output short.
processed_tokens

[['well', 'said', 'sonu', 'courage', 'stand', 'dadagiri', 'muslims'],
 ['private',
  'banks',
  'atm',
  'like',
  'hdfc',
  'icici',
  'etc',
  'cash',
  'public',
  'sector',
  'bank',
  'atm',
  'working'],
 ['question', 'pakistan', 'adhere'],
 ['pakistan',
  'comprised',
  'fake',
  'muslims',
  'know',
  'meaning',
  'unity',
  'imposes',
  'thoughts',
  'others',
  'rascals',
  'gathered'],
 ['r',
  'cow',
  'slaughter',
  'course',
  'stop',
  'leather',
  'manufacturing',
  'happens'],
 ['wondering',
  'educated',
  'ambassador',
  'struggling',
  'pay',
  'credit',
  'debit',
  'decent',
  'restaurant',
  'cant',
  'imagine',
  'diplomat',
  'developed',
  'nation',
  'card',
  'needs',
  'cash',
  'dinner'],
 ['inflation', 'react', 'shocks', 'demon'],
 ['good', 'job', 'guis', 'creating', 'problem', 'n', 'socacity'],
 ['false',
  'news',
  'indian',
  'media',
  'simply',
  'misguiding',
  'nation',
  'creating',
  'hatred',
  'media',
  'v',
  'careful',
  'spreading',
  'new

In [7]:
#-----------------For hinglish dataset

Hindi_text = "hindi/agr_hi_dev.csv"
# Read the CSV with explicit column names, so the first row is treated as data.
df1 = pd.read_csv(Hindi_text, names=['source', 'comment', 'annotation'], encoding='UTF-8')
# Trim surrounding whitespace.  A missing comment becomes "" so that the
# .lower() call in the loop below never receives a float NaN.
df1['comment'] = df1.comment.str.strip().fillna('')

# Split the frame into parallel arrays of comment texts and their labels.
hindi_comments = np.asarray(df1['comment'])
hindi_tags = np.asarray(df1['annotation'])
print(hindi_comments[1])

# One list of tokens per comment, in the same order as `hindi_comments`.
processed_Hindi_tokens = []
for comment in hindi_comments:
    comment = comment.lower()                             # lower-case the comment
    Digit_REMOVAL = re.sub(r'[0-9]+', '', comment)        # strip ASCII digits
    URL_REMOVAL = re.sub(r"http\S+", "", Digit_REMOVAL)   # strip URLs starting with "http"
    Emoji_removal = remove_emoji(URL_REMOVAL)
    # Punctuation is stripped only from pure-ASCII comments; comments that
    # still contain non-ASCII characters reach the tokenizer unchanged.
    # NOTE(review): punctuation is deleted, not replaced by a space, so
    # "word.word" becomes "wordword" -- confirm this is intended.
    if isEnglish(Emoji_removal):
        Emoji_removal = re.sub(r'[^\w\s]', '', Emoji_removal)
    processed_Hindi_tokens.append(word_tokenize(Emoji_removal))

First stage par dus jootey khaye Grover  se


In [8]:
# Display the token lists produced for the Hinglish comments.
# NOTE(review): this renders the whole list; a slice would keep the output short.
processed_Hindi_tokens

[['randtv',
  'tumhare',
  'najayaz',
  'baap',
  'is',
  'area',
  'hai',
  'ki',
  'waha',
  'koi',
  'nahi',
  'has',
  'sakta',
  'haraami',
  'azad',
  'mulk',
  'hai',
  'sab',
  'jagah',
  'jayenge'],
 ['first', 'stage', 'par', 'dus', 'jootey', 'khaye', 'grover', 'se'],
 ['salman',
  'aur',
  'aamir',
  'ki',
  'kounsi',
  'movie',
  'release',
  'huyee',
  'jo',
  'aandhi',
  'me',
  'dub',
  'gaye',
  'bikau',
  'chatukar',
  'media'],
 ['wk',
  'aur',
  'bhakt',
  'aa',
  'gya',
  'abe',
  'americans',
  'ka',
  'bolna',
  'h',
  'ki',
  'indians',
  'k',
  'karan',
  'wo',
  'unemployed',
  'hain',
  'muslim',
  'ho',
  'hindu',
  'ho',
  'ya',
  'christian',
  'ho',
  'andhbhakto',
  'ko',
  'dharm',
  'k',
  'alaw',
  'aur',
  'kuch',
  'dikhta',
  'hi',
  'nahi'],
 ['beta',
  'to',
  'tu',
  'apne',
  'baap',
  'ka',
  'hai',
  'permission',
  'to',
  'tu',
  'maang',
  'he',
  'liya',
  'ab',
  'mukr',
  'mat',
  'smjaaaa',
  'yehi',
  'adat',
  'tumlogo',
  'ki',
  'dag

In [9]:
#-----------Transliteration and translation
transliteration_dict = "transliterations.hi-en.csv"

# Tab-separated file read as two columns: romanised spelling, Devanagari spelling.
translit_frame = pd.read_csv(
    transliteration_dict,
    sep='\t',
    encoding='UTF-8',
    names=['Hinglish', 'Hindi'],
)
# Trim surrounding whitespace from both columns.
for translit_column in ('Hinglish', 'Hindi'):
    translit_frame[translit_column] = translit_frame[translit_column].str.strip()

# Final form: a 2-column object array of [Hinglish, Hindi] rows.
t_dict = np.asarray(translit_frame)

In [10]:
# Display the transliteration table (array of [Hinglish, Hindi] rows).
t_dict

array([['hajagiree', 'हजगिरी'],
       ['chekaanv', 'चेकॉव'],
       ['spinagaarn', 'स्पिनगार्न'],
       ...,
       ['bar', 'वार'],
       ['leonard', 'लियोनार्ड'],
       ['gurudwar', 'गुरूद्वारा']], dtype=object)

In [11]:
#--------------profanity dictionary
profanity_dict = "ProfanityText.txt"

# Tab-separated file read as two columns: Hinglish profanity, English gloss.
profanity_frame = pd.read_csv(
    profanity_dict,
    sep='\t',
    encoding='UTF-8',
    names=['Hinglish', 'English'],
)
# Trim surrounding whitespace from both columns.
for profanity_column in ('Hinglish', 'English'):
    profanity_frame[profanity_column] = profanity_frame[profanity_column].str.strip()

# Final form: a 2-column object array of [Hinglish, English] rows.
P_dict = np.asarray(profanity_frame)

In [12]:
# Display the profanity table (array of [Hinglish, English] rows).
P_dict

array([['badir', 'idiot'],
       ['badirchand', 'idiot'],
       ['bakland', 'idiot'],
       ['bhadva', 'pimp'],
       ['bhootnika', 'son of a witch'],
       ['chinaal', 'whore'],
       ['chup', 'shut up'],
       ['chutia', 'fucker'],
       ['ghasti', 'hooker'],
       ['chutiya', 'fucker'],
       ['haraami', 'bastard'],
       ['haraam', 'bastard'],
       ['hijra', 'transsexual'],
       ['hinjda', 'transsexual'],
       ['jaanvar', 'animal'],
       ['kutta', 'dog'],
       ['kutiya', 'bitch'],
       ['khota', 'donkey'],
       ['auladheen', 'sonless'],
       ['jaat', 'breed'],
       ['najayaz', 'illegitimate'],
       ['gandpaidaish', 'badborn'],
       ['saala', 'sister’s husband'],
       ['kutti', 'bitch'],
       ['soover', 'swine'],
       ['tatti', 'shit'],
       ['potty', 'shit'],
       ['bahenchod', 'sister fucker'],
       ['bahanchod', 'sister fucker'],
       ['bahencho', 'sister fucker'],
       ['bancho', 'sister fucker'],
       ['bahenke', 'sister’s'],
 

In [13]:
#------------------Translation of hindi text back to english-------

# General Hindi -> English dictionary.
Hindi_dict = "Hindi_English_Dict.csv"
H_dict = pd.read_csv(Hindi_dict, names=['Hindi', 'English'], encoding='UTF-8')
H_dict['Hindi'] = H_dict['Hindi'].str.strip()
H_dict['English'] = H_dict['English'].str.strip()
H_hindi = np.asarray(H_dict['Hindi'])
H_english = np.asarray(H_dict['English'])

# Second Hindi -> English dictionary, loaded from the "functions" file.
HE_dict_F = "HE_dictionary_functions.csv"
H_dict_F = pd.read_csv(HE_dict_F, names=['Hindi', 'English'], encoding='UTF-8')
H_dict_F['Hindi'] = H_dict_F['Hindi'].str.strip()
H_dict_F['English'] = H_dict_F['English'].str.strip()
H_hindi_F = np.asarray(H_dict_F['Hindi'])
H_english_F = np.asarray(H_dict_F['English'])

# Hindi -> English lookups.  From here on H_dict_F is a plain dict, not the
# DataFrame read above.  When a Hindi key repeats, the last row wins.
HE_dict = dict(zip(H_hindi, H_english))
H_dict_F = dict(zip(H_hindi_F, H_english_F))

# Reverse (English -> Hindi) lookups.
EH_dict = {v: k for k, v in HE_dict.items()}
EH_dict_F = {v: k for k, v in H_dict_F.items()}

# Replace, in place, every token that has a usable dictionary translation.
# The general dictionary is consulted first, then the function-word one.
# Rows with an empty English cell are read as NaN; those are skipped so that
# a token is never overwritten with a float NaN.
for tokens in processed_Hindi_tokens:
    for j, Str in enumerate(tokens):
        for lookup in (HE_dict, H_dict_F):
            translation = lookup.get(Str)
            if isinstance(translation, str):
                tokens[j] = translation
                break

In [25]:
# Display the Hindi -> English mapping built from HE_dictionary_functions.csv
# (a dict at this point; the recorded output shows some values are NaN).
H_dict_F

{'मैं': 'I',
 'मुझे': 'I',
 'मुझको': 'I',
 'हम': 'we',
 'हमें': 'we',
 'तुम': 'you',
 'तू': 'you',
 'तुम्हें': 'you',
 'तुझको': 'you',
 'तुझे': 'you',
 'तुमको': 'you',
 'आप': 'you',
 'आपको': 'you',
 'आपसे': 'you',
 'वह': 'that',
 'वे': 'those',
 'वो': 'those',
 'उन्हें': 'they',
 'उनको': 'they',
 'यह': 'this',
 'इसे': 'it',
 'इन्हें': 'she',
 'उसे': 'she',
 'इसको': 'it',
 'उसको': 'she',
 'हमसे': 'we',
 'तुमसे': 'you',
 'उन लोगों को': 'they',
 'इनको': 'they',
 'हमको': 'we',
 'उनसे': 'they',
 'इनसे': 'they',
 'मेर्': 'I',
 'अपन्': 'we',
 'हमार्': 'we',
 'तुम्हार्': nan,
 'तुम लोगों का': 'you',
 'तेर्': 'you',
 'अपने': 'she',
 'आपक्': 'you',
 'उसक्': 'she',
 'उनकी': 'she',
 'उनके': 'they',
 'इनकी': 'she',
 'इनक्': 'they',
 'इसक्': 'it',
 'उनक्': 'they',
 'इनके': 'they',
 'इन लोगों का': 'they',
 'मेरा': 'I',
 'हमारा': 'we',
 'स्वयं': 'it',
 'खुद': 'I',
 'अपने आप': 'one',
 'अपने से': 'she',
 'स्वयं से': 'one',
 'अपनेसे': 'it',
 'स्वतः': nan,
 'ये': 'these',
 'कोई': 'somebody',
 'कोई भी': 'a

In [28]:
# Display the general Hindi -> English mapping built from Hindi_English_Dict.csv.
# NOTE(review): this renders the whole dict; showing a few items would be shorter.
HE_dict

{'११': 'xi',
 '१२': '12',
 '१३०': '130',
 'समर्थक-वर्ग': 'lobby',
 '१५००': '1500',
 '१५५': '155',
 '१५': '15',
 '१८६५': '1865',
 '१८६': '186',
 '१८९': '189',
 '१८ वाँ': '18th',
 '१९८१': '1981',
 '१९९८': '1998',
 '१९९९': '1999',
 '१': '1',
 '२०००': '2000',
 '२००१': '2001',
 '२००२': '2002',
 '२००३': '2003',
 '२००४': '2004',
 '२०': '20',
 'बीस पेन्स का सिक्का': 'twenty p',
 '२२': '22',
 '२३': '23',
 '२४': '24',
 '२५': '25',
 '२६': '26',
 '२९': '29',
 '२९ वाँ': '29th',
 '२': '2',
 'दो पेन्स का सिक्का': '2p',
 '३५': '35',
 '३९': '39',
 '३': '3',
 'तीसरा': 'tertiary',
 '४००': '400',
 '४५': '45',
 '४': '4',
 '५००': '500',
 '५८': '58',
 '५': '5',
 'पांच पेन्स का सिक्का': 'five p',
 '६८': '68',
 '७१': '71',
 '७': '7th',
 '८०': '80',
 '८३': '83',
 '९८': '98',
 'शानदार': 'stellar',
 'बहुत ही अच्छा': 'ace',
 'ए ए': 'AA',
 'नवजात शिशु': 'newborn',
 'चकित': 'shock',
 'गौण चालक': 'backseat driver',
 'गिनतारा': 'abacus',
 'बहुत ही दुखद बात': 'a bad dream',
 'दुष्ट व्यक्ति': 'fiend',
 'समुद्री यात्रा न

In [15]:
# Display the tokens of the first Hinglish comment at this point in the notebook.
processed_Hindi_tokens[0]

['randtv',
 'tumhare',
 'najayaz',
 'baap',
 'is',
 'area',
 'hai',
 'ki',
 'waha',
 'koi',
 'nahi',
 'has',
 'sakta',
 'haraami',
 'azad',
 'mulk',
 'hai',
 'sab',
 'jagah',
 'jayenge']

In [16]:
# Fuzzy normalisation of the Hinglish tokens, applied in place.
# For each token of the first MAX_COMMENTS comments:
#   1. tokens that are already known English words are left untouched;
#   2. tokens close to a profanity entry are replaced by its English gloss;
#   3. tokens that match an English function word are left untouched;
#   4. otherwise the token is replaced by the Devanagari spelling of the
#      best-scoring transliteration entry, if any scores above the threshold.
MAX_COMMENTS = 51              # only comments 0..50 are processed
TRANSLIT_MIN_RATIO = 60        # a transliteration must score strictly above this
PROFANITY_MIN_RATIO = 75       # needs to be adjusted
FUNCTION_WORD_MIN_RATIO = 98   # near-exact match against an English function word

for i in range(0, len(processed_Hindi_tokens)):
    if i == MAX_COMMENTS:
        break
    print(i)
    for j in range(0, len(processed_Hindi_tokens[i])):
        Str1 = processed_Hindi_tokens[i][j]
        # Skip anything that is not text (e.g. a NaN left by a dictionary
        # lookup); fuzz.ratio only makes sense on strings.
        if not isinstance(Str1, str):
            continue
        # Already an English word known to the dictionary: keep it.
        if Str1 in EH_dict:
            continue

        flag = 0
        max_ratio = TRANSLIT_MIN_RATIO

        # Profanity lookup: the FIRST entry scoring at least the threshold wins.
        # NOTE(review): this is first-match, not best-match -- confirm intended.
        for l in range(0, len(P_dict)):
            Ratiostr1 = fuzz.ratio(Str1, P_dict[l][0])
            if Ratiostr1 >= PROFANITY_MIN_RATIO:
                print(Ratiostr1)
                processed_Hindi_tokens[i][j] = P_dict[l][1]
                flag = 1
                print(flag)
                break

        # Function-word lookup.  The original stored this score in a misspelt
        # variable and then tested the stale profanity score, so this check
        # could never fire; the freshly computed score is tested now.
        if flag == 0:
            for p in EH_dict_F:
                # str() because the reversed dict can contain a NaN key.
                if fuzz.ratio(Str1, str(p)) >= FUNCTION_WORD_MIN_RATIO:
                    flag = 1
                    break

        if flag == 1:
            continue

        # Transliteration: keep the Devanagari spelling of the best-scoring
        # romanised entry; a later entry replaces an earlier one only when it
        # scores strictly higher.
        for k in range(0, len(t_dict)):
            Ratiostr1 = fuzz.ratio(Str1, t_dict[k][0])
            if Ratiostr1 > max_ratio:
                max_ratio = Ratiostr1
                processed_Hindi_tokens[i][j] = t_dict[k][1]

0
100
1
100
1
1
2
3
80
1
4
75
1
75
1
5
80
1
77
1
6
7
8
75
1
9
10
100
1
100
1
100
1
100
1
11
91
1
75
1
75
1
80
1
75
1
100
1
75
1
100
1
75
1
75
1
80
1
75
1
12
75
1
80
1
13
77
1
14
80
1
15
16
80
1
75
1
80
1
80
1
75
1
17
18
19
20
21
80
1
80
1
80
1
80
1
75
1
80
1
22
23
89
1
80
1
75
1
80
1
89
1
24
25
83
1
80
1
80
1
26
27
80
1
80
1
28
89
1
75
1
80
1
29
75
1
30
31
32
92
1
33
75
1
34
35
36
80
1
80
1
91
1
100
1
37
75
1
75
1
75
1
75
1
38
39
75
1
40
77
1
41
89
1
42
75
1
75
1
43
75
1
89
1
44
45
80
1
46
47
80
1
100
1
48
49
80
1
50


In [17]:
# Second dictionary pass: translate tokens to English where a dictionary entry
# exists, replacing them in place.  The general dictionary is consulted first,
# then the function-word one.  Entries whose English value is missing (NaN)
# are skipped so that a token is never overwritten with a float NaN.
for tokens in processed_Hindi_tokens:
    for j, Str in enumerate(tokens):
        for lookup in (HE_dict, H_dict_F):
            translation = lookup.get(Str)
            if isinstance(translation, str):
                tokens[j] = translation
                break

In [18]:
# Display the tokens of the first Hinglish comment after the replacement passes.
processed_Hindi_tokens[0]

['रॉड',
 'तुम्हारे',
 'illegitimate',
 'father',
 'have',
 'area',
 'have',
 'की',
 'स्वाहा',
 'की',
 'other than',
 'हैट्स',
 'सांता',
 'bastard',
 'आज़ाद',
 'माल्क',
 'have',
 'sub',
 'जयगढ़',
 'जाएगा']

# Model

In [19]:
#------------------------------MODEL           

import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F

In [20]:
# Use the first CUDA device when one is available, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else 'cpu'
cuda = torch.device(device_name)

# Seed PyTorch's default random number generator (the call returns the generator).
torch.manual_seed(42)

<torch._C.Generator at 0x2a26b639cf0>

In [21]:
class MIMCT(nn.Module):
    """Three-class classifier made of a Conv1d stack, plus an LSTM branch
    that is built but not yet used by ``forward``.

    Args:
        input_channel: number of channels of the tensor fed to the first
            convolution.
        output_channel: number of filters of every convolution.  It is also
            passed to ``nn.LSTM`` as the number of stacked layers, as in the
            original positional call.  NOTE(review): confirm that reuse is
            intended.
        embedding_dim: feature size of the LSTM input.
        hidden_dim: hidden size of the LSTM.
        kernel_size: indexable holding three kernel widths, one per convolution.
        feature_linear: number of features after flattening the output of the
            last convolution (``output_channel`` times the remaining length).
        dropout: dropout probability between stacked LSTM layers.  Defaults
            to 0.0, which is ``nn.LSTM``'s own default.
    """

    def __init__(self, input_channel, output_channel, embedding_dim, hidden_dim,
                 kernel_size, feature_linear, dropout=0.0):
        super(MIMCT, self).__init__()
        self.CNN_Layers = nn.Sequential(
            nn.Conv1d(input_channel, output_channel, kernel_size[0], stride=1),
            # The following convolutions consume the previous layer's output,
            # so their input width is output_channel (identical to the old
            # code whenever input_channel == output_channel).
            nn.Conv1d(output_channel, output_channel, kernel_size[1], stride=1),
            nn.Conv1d(output_channel, output_channel, kernel_size[2], stride=1),
            nn.Flatten(),
            nn.Dropout(p=0.25),
            nn.Linear(feature_linear, 3),
            # Explicit class dimension; equals the implicit choice for the
            # 2-D (batch, 3) input without the deprecation warning.
            nn.Softmax(dim=1),
        )
        # LSTM branch.  Keyword arguments are used because the fourth
        # positional parameter of nn.LSTM is `bias`, not `dropout`.
        # NOTE(review): nn.LSTM returns a tuple (output, (h_n, c_n)), so this
        # Sequential cannot be called end to end as written; forward() does
        # not use it yet.
        self.LSTM_Layers = nn.Sequential(
            nn.LSTM(embedding_dim, hidden_dim, num_layers=output_channel, dropout=dropout),
            nn.Linear(hidden_dim, 3),
            nn.Dropout(p=0.20),
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return the class probabilities produced by the CNN branch.

        Only ``CNN_Layers`` is applied; the LSTM branch is not wired in.
        The original TODO was to concatenate both branch outputs and train
        with a categorical cross-entropy loss.
        """
        x = self.CNN_Layers(x)
        return x