In [1]:
#I'm going to import pandas so that I can use the pd.read_csv() function and organize my data as a dataframe.
#I prefer the dataframe approach for its readablility.
import pandas as pd
import numpy as np
import nltk
#I'm importing the Naive Bayes Classifier from nltk for clarity in my code.
from nltk.classify import NaiveBayesClassifier
from nltk.metrics.scores import accuracy, recall, f_measure
#Matplot is for future use and won't be used here.
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

In [2]:
#I imported the .csv file. I also gave headers to each of the 3 columns, and used the first column, 'Unique ID', as the index.
training_data = pd.read_csv("agr_en_train.csv", names = ['Index', 'Text', 'Agression'], index_col = 0)
#I broke up the dataframe according to aggression level.
CAG_data = training_data[training_data['Agression'] == 'CAG']
# print(CAG_data)
OAG_data = training_data[training_data['Agression'] == 'OAG']
# print(OAG_data)
NAG_data = training_data[training_data['Agression'] == 'NAG']
# print(NAG_data)
print(training_data.info())
training_data

<class 'pandas.core.frame.DataFrame'>
Index: 11999 entries, facebook_corpus_msr_1723796 to facebook_corpus_msr_327960
Data columns (total 2 columns):
Text         11999 non-null object
Agression    11999 non-null object
dtypes: object(2)
memory usage: 281.2+ KB
None


Unnamed: 0_level_0,Text,Agression
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
facebook_corpus_msr_1723796,Well said sonu..you have courage to stand agai...,OAG
facebook_corpus_msr_466073,"Most of Private Banks ATM's Like HDFC, ICICI e...",NAG
facebook_corpus_msr_1493901,"Now question is, Pakistan will adhere to this?",OAG
facebook_corpus_msr_405512,Pakistan is comprised of fake muslims who does...,OAG
facebook_corpus_msr_1521685,"??we r against cow slaughter,so of course it w...",NAG
facebook_corpus_msr_462570,Wondering why Educated Ambassador is strugglin...,CAG
facebook_corpus_msr_465051,How does inflation react to all the after shoc...,NAG
facebook_corpus_msr_450994,Not good job.....this guis creating a problem ...,CAG
facebook_corpus_msr_326287,This is a false news Indian media is simply mi...,NAG
facebook_corpus_msr_430450,"no permanent foes, no permanent friends. inter...",NAG


In [3]:
# I'm using drop to delete the row in Hindi
# I'm importing regex, so I can remove punctuation and letters/numbers.
import re
OAGTextNoPunc = [re.sub(r'[^a-zA-Z0-9\s]', ' ', i) for i in OAG_data.loc[:, 'Text']]
NAGTextNoPunc = [re.sub(r'[^a-zA-Z0-9\s]', ' ', i) for i in NAG_data.loc[:, 'Text']]
CAGTextNoPunc = [re.sub(r'[^a-zA-Z0-9\s]', ' ', i) for i in CAG_data.loc[:, 'Text']]

In [4]:
#I'm switching everything to lowercase so that when I check for word frequencies and unique words, I won't get erroneous hits.
OAGLowercase_Text = [x.lower() for x in OAGTextNoPunc]
NAGLowercase_Text = [x.lower() for x in NAGTextNoPunc]
CAGLowercase_Text = [x.lower() for x in CAGTextNoPunc]
print(OAGLowercase_Text)
# print(NAGLowercase_Text)
# print(CAGLowercase_Text)



In [5]:
#Here, the text is tokenized for each user.
OAG_Tokenized = [nltk.word_tokenize(i) for i in OAGLowercase_Text]
NAG_Tokenized = [nltk.word_tokenize(i) for i in NAGLowercase_Text]
CAG_Tokenized = [nltk.word_tokenize(i) for i in CAGLowercase_Text]
print(OAG_Tokenized)



In [6]:
#Now, I'm checking the most common words for each aggression level.
#To do this, I've joined all the strings in the list together and tokenized them.
OAGFull = ' '.join(OAGLowercase_Text)
OAGFullTokens = nltk.word_tokenize(OAGFull)
OAGFull_FreqDist = nltk.FreqDist(OAGFullTokens).most_common()[:10]
print(OAGFull_FreqDist)

[('the', 2176), ('to', 1810), ('and', 1793), ('is', 1670), ('of', 1592), ('in', 1410), ('a', 1241), ('you', 786), ('are', 733), ('for', 686)]


In [7]:
NAGFull = ' '.join(NAGLowercase_Text)
NAGFullTokens = nltk.word_tokenize(NAGFull)
NAGFull_FreqDist = nltk.FreqDist(NAGFullTokens).most_common()[:10]
print(NAGFull_FreqDist)

[('the', 3346), ('to', 2646), ('of', 2224), ('and', 2074), ('is', 1997), ('in', 1877), ('a', 1393), ('for', 1293), ('it', 1003), ('i', 920)]


In [8]:
CAGFull = ' '.join(CAGLowercase_Text)
CAGFullTokens = nltk.word_tokenize(CAGFull)
CAGFull_FreqDist = nltk.FreqDist(CAGFullTokens).most_common()[:10]
print(CAGFull_FreqDist)

[('the', 2961), ('to', 2744), ('and', 2241), ('is', 2153), ('of', 2004), ('in', 1664), ('a', 1638), ('for', 1037), ('are', 1023), ('not', 1011)]


In [9]:
#Here, I'm checking all the bigrams for each aggression level.
OAG_bgrmlist = list(nltk.bigrams(OAGFull.split()))
NAG_bgrmlist = list(nltk.bigrams(NAGFull.split()))
CAG_bgrmlist = list(nltk.bigrams(CAGFull.split()))
print(CAG_bgrmlist)



In [10]:
#Here, I'm checking all the trigrams for each aggression level.
OAG_trgrmlist = list(nltk.trigrams(OAGFull.split()))
NAG_trgrmlist = list(nltk.trigrams(NAGFull.split()))
CAG_trgrmlist = list(nltk.trigrams(CAGFull.split()))
print(CAG_trgrmlist)



In [11]:
#Lastly, I've checked for unique words in each aggression level rather than by individual users.
OAG_Unique_Words = set(OAGFullTokens)
CAG_Unique_Words = set(CAGFullTokens)
NAG_Unique_Words = set(NAGFullTokens)
print(NAG_Unique_Words)

{'britishers', 'follower', 'fools', 'satisfy', 'buyers', 'retained', 'understands', 'gona', 'yedyurappa', 'booker', 'chamcash', 'intention', 'expected', 'vacation', 'years', 'boxer', 'eventually', 'hahahahahahah', 'takes', 'otherside', 'lossesn', 'grow', 'wholle', 'isliye', 'unemployement', 'surgeon', 'photos', 'tex', 'tidings', 'interesting', 'shakshi', 'childrens', 'pork', 'constructed', 'porkistani', 'among', 'traffic', 'cyril', 'ur', 'allegedly', 'ninda', 'marries', 'aspataal', 'nearly', 'petting', 'sahyad', 'jpmorgan', 'manisha', 'earning', 'rise', 'capped', 'lilove', 'yatra', 'shivagami', '141', 'poly', 'mlo3cshdzds', 'yeah', 'mal', 'dent', 'houses', 'corruption', 'responsibilities', 'flush', 'daily', 'slurrping', 'navab', 'richi', 'virtual', 'pakistanan', 'massively', 'elect', 'infested', 'todays', 'bangalore', 'naxalites', 'without', '120', 'considered', '5rs', 'trople', 'verdict', 'purpose', 'biscuits', 'luk', '915', 'unethically', 'physically', 'sleepless', 'financiers', 'sic

In [12]:
OAG_Tokenized

[['well',
  'said',
  'sonu',
  'you',
  'have',
  'courage',
  'to',
  'stand',
  'against',
  'dadagiri',
  'of',
  'muslims'],
 ['now', 'question', 'is', 'pakistan', 'will', 'adhere', 'to', 'this'],
 ['pakistan',
  'is',
  'comprised',
  'of',
  'fake',
  'muslims',
  'who',
  'does',
  'not',
  'know',
  'the',
  'meaning',
  'of',
  'unity',
  'and',
  'imposes',
  'their',
  'thoughts',
  'on',
  'others',
  'all',
  'the',
  'rascals',
  'have',
  'gathered',
  'there'],
 ['communist',
  'parties',
  'killed',
  'lacks',
  'of',
  'opponents',
  'in',
  'wb',
  'in',
  '35',
  'years',
  'ruling'],
 ['rss', 'is', '3', 'time', 'ban', 'terrorist', 'organization'],
 ['so', 'funny', 'stupid'],
 ['aap', 'dont', 'need', 'the', 'monsters', 'like', 'u'],
 ['oh',
  'pak',
  'army',
  'or',
  'should',
  'say',
  'porki',
  'hijada',
  'army',
  'whose',
  'country',
  'didn',
  't',
  'excepted',
  'the',
  'bodies',
  'of',
  'their',
  'dickless',
  'coward',
  'army',
  'during',
  'k

In [13]:
def extract_word_feats(words):
    dict = {}
    for word in words:
        dict[word] = True
    return dict
# OAGdict = extract_word_feats(OAG_Tokenized[i])
print(len(OAG_Tokenized))
print(len(NAG_Tokenized))
print(len(CAG_Tokenized))

2708
5051
4240


In [14]:
OAGdictList = [(extract_word_feats(OAG_Tokenized[i]), 'OAG') for i in range(2708)]
NAGdictList = [(extract_word_feats(NAG_Tokenized[i]), 'NAG') for i in range(5051)]
CAGdictList = [(extract_word_feats(CAG_Tokenized[i]), 'CAG') for i in range(4240)]
OAGdictList

[({'well': True,
   'said': True,
   'sonu': True,
   'you': True,
   'have': True,
   'courage': True,
   'to': True,
   'stand': True,
   'against': True,
   'dadagiri': True,
   'of': True,
   'muslims': True},
  'OAG'),
 ({'now': True,
   'question': True,
   'is': True,
   'pakistan': True,
   'will': True,
   'adhere': True,
   'to': True,
   'this': True},
  'OAG'),
 ({'pakistan': True,
   'is': True,
   'comprised': True,
   'of': True,
   'fake': True,
   'muslims': True,
   'who': True,
   'does': True,
   'not': True,
   'know': True,
   'the': True,
   'meaning': True,
   'unity': True,
   'and': True,
   'imposes': True,
   'their': True,
   'thoughts': True,
   'on': True,
   'others': True,
   'all': True,
   'rascals': True,
   'have': True,
   'gathered': True,
   'there': True},
  'OAG'),
 ({'communist': True,
   'parties': True,
   'killed': True,
   'lacks': True,
   'of': True,
   'opponents': True,
   'in': True,
   'wb': True,
   '35': True,
   'years': True,
   

In [15]:
OAGsplit = int(len(OAGdictList)*0.75)
NAGsplit = int(len(NAGdictList)*0.75)
CAGsplit = int(len(CAGdictList)*0.75)

In [16]:
trainingset = OAGdictList[:OAGsplit] + NAGdictList[:NAGsplit] + CAGdictList[:CAGsplit]
testset = OAGdictList[OAGsplit:] + NAGdictList[NAGsplit:] + CAGdictList[CAGsplit:]

In [17]:
print('train on %d instances, test on %d instances' % (len(trainingset), len(testset)))
FirstNBclassifier = NaiveBayesClassifier.train(trainingset)

train on 8999 instances, test on 3000 instances


In [18]:
print('accuracy:', nltk.classify.util.accuracy(FirstNBclassifier, testset))

accuracy: 0.45666666666666667


In [19]:
FirstNBclassifier.show_most_informative_features()

Most Informative Features
                   nifty = True              NAG : CAG    =     33.3 : 1.0
                   azaan = True              OAG : NAG    =     30.5 : 1.0
                   noise = True              OAG : NAG    =     30.5 : 1.0
                  singer = True              OAG : NAG    =     30.5 : 1.0
                 useless = True              OAG : NAG    =     28.0 : 1.0
                 singing = True              OAG : NAG    =     21.8 : 1.0
             journalists = True              OAG : NAG    =     21.8 : 1.0
              guidelines = True              OAG : NAG    =     20.5 : 1.0
                   islam = True              OAG : NAG    =     20.5 : 1.0
                    tata = True              NAG : OAG    =     19.5 : 1.0


In [20]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [21]:
OAGTokensWOStop = [(w for w in OAG_Tokenized[i] if not w in stop_words) for i in range(2708)]

In [22]:
# I received help in removing stopwords from jayprakashstar on https://stackoverflow.com/questions/55773928/how-to-remove-stop-words-from-list-of-lists-in-python
OAGTokensWOStop = []
for tweet in OAG_Tokenized:
    temp = []
    for word in tweet:
        if word not in stop_words:
            temp.append(word)
    OAGTokensWOStop.append(temp)

In [23]:
OAGTokensWOStop

[['well', 'said', 'sonu', 'courage', 'stand', 'dadagiri', 'muslims'],
 ['question', 'pakistan', 'adhere'],
 ['pakistan',
  'comprised',
  'fake',
  'muslims',
  'know',
  'meaning',
  'unity',
  'imposes',
  'thoughts',
  'others',
  'rascals',
  'gathered'],
 ['communist',
  'parties',
  'killed',
  'lacks',
  'opponents',
  'wb',
  '35',
  'years',
  'ruling'],
 ['rss', '3', 'time', 'ban', 'terrorist', 'organization'],
 ['funny', 'stupid'],
 ['aap', 'dont', 'need', 'monsters', 'like', 'u'],
 ['oh',
  'pak',
  'army',
  'say',
  'porki',
  'hijada',
  'army',
  'whose',
  'country',
  'excepted',
  'bodies',
  'dickless',
  'coward',
  'army',
  'kargil',
  'war',
  'army',
  'submitted',
  'infront',
  'indian',
  'army',
  '1971',
  'east',
  'pakistan',
  'became',
  'bangladesh',
  'hijada',
  'army',
  'even',
  'save',
  'school',
  'childrens',
  'merciless',
  'killings',
  'indian',
  'forces',
  'really',
  'reached',
  'lahore',
  '1965'],
 ['want', 'get', 'rid', 'u', 'indi

In [24]:
NAGTokensWOStop = []
for tweet in NAG_Tokenized:
    temp = []
    for word in tweet:
        if word not in stop_words:
            temp.append(word)
    NAGTokensWOStop.append(temp)
NAGTokensWOStop

[['private',
  'banks',
  'atm',
  'like',
  'hdfc',
  'icici',
  'etc',
  'cash',
  'public',
  'sector',
  'bank',
  'atm',
  'working'],
 ['r',
  'cow',
  'slaughter',
  'course',
  'stop',
  'leather',
  'manufacturing',
  'happens'],
 ['inflation', 'react', 'shocks', 'demon'],
 ['false',
  'news',
  'indian',
  'media',
  'simply',
  'misguiding',
  'nation',
  'creating',
  'hatred',
  'media',
  'v',
  'careful',
  'spreading',
  'news',
  'shame'],
 ['permanent', 'foes', 'permanent', 'friends', 'interest', 'permanent'],
 ['deepak',
  'kumar',
  'sharma',
  'saab',
  'chalo',
  'aap',
  'ki',
  'ye',
  'baat',
  'ek',
  'baar',
  'mann',
  'li',
  'whatever',
  'pm',
  'saab',
  'talked',
  '2014',
  'kya',
  'kya',
  'kia',
  'us',
  'main',
  'campaigner',
  'bjp',
  'nothing',
  'else',
  'ek',
  'cheez',
  'bta',
  'implementing',
  'bills',
  'opposed',
  '2014',
  'example',
  'fdi',
  'gst',
  'list',
  'long'],
 ['guys',
  'counter',
  'modi',
  'govt',
  'decisions',
  

In [25]:
CAGTokensWOStop = []
for tweet in CAG_Tokenized:
    temp = []
    for word in tweet:
        if word not in stop_words:
            temp.append(word)
    CAGTokensWOStop.append(temp)
CAGTokensWOStop

[['wondering',
  'educated',
  'ambassador',
  'struggling',
  'pay',
  'credit',
  'debit',
  'decent',
  'restaurant',
  'cant',
  'imagine',
  'diplomat',
  'developed',
  'nation',
  'card',
  'needs',
  'cash',
  'dinner'],
 ['good', 'job', 'guis', 'creating', 'problem', 'n', 'socacity'],
 ['absolutely', 'deeper', 'dive', 'shallower', 'cushion'],
 ['brown',
  'sahib',
  'anti',
  'national',
  'leftist',
  'commies',
  'media',
  'muslim',
  'pandering',
  'hindu',
  'sick',
  'lars',
  'milked',
  'secularism',
  'worth',
  'render',
  'mere',
  'failed',
  'slogan',
  'sick',
  'larism',
  'nothing',
  'continuation',
  'british',
  'policy',
  'divide',
  'rule',
  'rss',
  'opposed',
  'idea',
  'secularism',
  'absolutely',
  'used',
  'dagger',
  'heart',
  'hindus',
  'hinduism',
  'hindustan'],
 ['tht',
  'hv',
  'thn',
  '2',
  'flats',
  '2',
  'land',
  'legal',
  'rest',
  'surrender',
  'properties',
  'govt'],
 ['problem', 'occur', 'india', 'countries'],
 ['governmen

In [26]:
OAGdictListWOStop = [(extract_word_feats(OAGTokensWOStop[i]), 'OAG') for i in range(2708)]
NAGdictListWOStop = [(extract_word_feats(NAGTokensWOStop[i]), 'NAG') for i in range(5051)]
CAGdictListWOStop = [(extract_word_feats(CAGTokensWOStop[i]), 'CAG') for i in range(4240)]

In [27]:
OAGdictListWOStop

[({'well': True,
   'said': True,
   'sonu': True,
   'courage': True,
   'stand': True,
   'dadagiri': True,
   'muslims': True},
  'OAG'),
 ({'question': True, 'pakistan': True, 'adhere': True}, 'OAG'),
 ({'pakistan': True,
   'comprised': True,
   'fake': True,
   'muslims': True,
   'know': True,
   'meaning': True,
   'unity': True,
   'imposes': True,
   'thoughts': True,
   'others': True,
   'rascals': True,
   'gathered': True},
  'OAG'),
 ({'communist': True,
   'parties': True,
   'killed': True,
   'lacks': True,
   'opponents': True,
   'wb': True,
   '35': True,
   'years': True,
   'ruling': True},
  'OAG'),
 ({'rss': True,
   '3': True,
   'time': True,
   'ban': True,
   'terrorist': True,
   'organization': True},
  'OAG'),
 ({'funny': True, 'stupid': True}, 'OAG'),
 ({'aap': True,
   'dont': True,
   'need': True,
   'monsters': True,
   'like': True,
   'u': True},
  'OAG'),
 ({'oh': True,
   'pak': True,
   'army': True,
   'say': True,
   'porki': True,
   'hijada

In [28]:
OAGsplit2 = int(len(OAGdictListWOStop)*0.75)
NAGsplit2 = int(len(NAGdictListWOStop)*0.75)
CAGsplit2 = int(len(CAGdictListWOStop)*0.75)

In [29]:
trainingset2 = OAGdictListWOStop[:OAGsplit2] + NAGdictListWOStop[:NAGsplit2] + CAGdictListWOStop[:CAGsplit2]
testset2 = OAGdictListWOStop[OAGsplit2:] + NAGdictListWOStop[NAGsplit2:] + CAGdictListWOStop[CAGsplit2:]

In [30]:
print('train on %d instances, test on %d instances' % (len(trainingset2), len(testset2)))
SecondNBclassifier = NaiveBayesClassifier.train(trainingset2)

train on 8999 instances, test on 3000 instances


In [31]:
print('accuracy:', nltk.classify.util.accuracy(SecondNBclassifier, testset2))

accuracy: 0.5073333333333333


In [32]:
SecondNBclassifier.show_most_informative_features()

Most Informative Features
                   nifty = True              NAG : CAG    =     33.3 : 1.0
                   azaan = True              OAG : NAG    =     30.5 : 1.0
                   noise = True              OAG : NAG    =     30.5 : 1.0
                  singer = True              OAG : NAG    =     30.5 : 1.0
                 useless = True              OAG : NAG    =     28.0 : 1.0
                 singing = True              OAG : NAG    =     21.8 : 1.0
             journalists = True              OAG : NAG    =     21.8 : 1.0
              guidelines = True              OAG : NAG    =     20.5 : 1.0
                   islam = True              OAG : NAG    =     20.5 : 1.0
                    tata = True              NAG : OAG    =     19.5 : 1.0


In [33]:
from sklearn import metrics


# import collections

# refsets = collections.defaultdict(set)
# testsets = collections.defaultdict(set)
 
# for i, (feats, label) in enumerate(testset2):
#     refsets[label].add(i)
#     observed = SecondNBclassifier.classify(feats)
#     testsets[observed].add(i)
    
#     print('pos precision:', accuracy(refsets, testsets))
#     print('pos recall:', recall(refsets, testsets))
# #     print('pos F-measure:', f_measure(refsets['pos'], testsets['pos']))
# #     print('neg precision:', precision(refsets['neg'], testsets['neg']))
# #     print('neg recall:', recall(refsets['neg'], testsets['neg']))
# #     print('neg F-measure:', f_measure(refsets['neg'], testsets['neg']))


The same sample will be analyzed again using Word2Vec and a Convolutional Neural Network as inspired by "Using Convolutional Neural Networks to Classify Hate-Speech" by Björn Gambäck and Utpal Kumar Sikdar

In [34]:
import keras
# First, I have to clean and organize the data, so that I can split it into a training set and a test set.
# The code is largely like it was above.
training_data_shuffle = training_data.sample(frac = 1)
training_data_clean = [re.sub(r'[^a-zA-Z0-9\s]', ' ', i) for i in training_data_shuffle['Text']]
training_data_clean = [x.lower() for x in training_data_clean]
training_data_clean_tokens = [nltk.word_tokenize(i) for i in training_data_clean]


training_data_clean

Using TensorFlow backend.


['anna hazare failed in his drama on kejriwal case  hazare aim is to remove congress from power and he joined with kejriwal  kejri is clever and he overcome hazare and defeated bjp and opposing till now  anna hazare wont speak if bjp involve in money scandal  communal violence  hate speeches etc etc  anna hazare is a hypocritic poison human being  hazare never thought kejri will come into politics  hazare may won in defeating congress but lost on kejri matter  great actor hazare',
 'cyrus mistry is a great leader looks more like rata tata s doing he wanted to return becuase of his previous bad deals  ',
 'yes we  remember u r biggest terrorist country in the world    u will do anything against humanity   ',
 'now that s clear cut key points for budget    thanks for bring this to us',
 'our indian f hrer is also obsessed with his religious ideology  ours is miles ahead in religious bigotry and kills humans because of his hate ',
 'such stupid replies are expected from swami people die h

In [35]:
label = {'OAG': 1,'NAG': 2, 'CAG' : 3}
training_data_shuffle.Agression = [label[i] for i in training_data_shuffle.Agression] 
print(training_data_shuffle) 

                                                                          Text  \
Index                                                                            
facebook_corpus_msr_1804785  Anna hazare failed in his drama on kejriwal ca...   
facebook_corpus_msr_483448   Cyrus Mistry is a great leader looks more like...   
facebook_corpus_msr_1493818  Yes we  remember u r biggest terrorist country...   
facebook_corpus_msr_433721   Now that's clear cut key points for budget.......   
facebook_corpus_msr_331447   Our Indian Führer is also Obsessed with his Re...   
facebook_corpus_msr_470340   such stupid replies are expected from swami pe...   
facebook_corpus_msr_451793   Look at his face, looks like he saw the bike b...   
facebook_corpus_msr_1804990  Anna Hazare  is a failure. He could neither in...   
facebook_corpus_msr_439624   Indians are blessed to have you Sir, may your ...   
facebook_corpus_msr_2082474             Mr. Prateek Nishant.....secure method?   
facebook_corpus_

In [49]:
x = training_data_shuffle.Text
y = training_data_shuffle.Agression
# print(y[0])
# print(x[1])

In [37]:
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder()
y_enc = enc.fit_transform(training_data_shuffle.Agression.values.reshape(-1,1)).toarray()
# y.fit(enc)
# onehotlabels = enc.transform(y).toarray()
# onehotlabels.shape
y

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([1, 2, 1, ..., 2, 2, 2])

In [50]:
# I'm using sklearn's train_test_split function this time because it's much easier and faster than the manual method from before.
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=.20, random_state = 0)

In [51]:
xTrain

Index
facebook_corpus_msr_2241590    Then you should also watch Pappu he cracks muc...
facebook_corpus_msr_447354     Modi knows to talk n thts wht he is doing n gi...
facebook_corpus_msr_1723395    And at the time of your god there was loudspea...
facebook_corpus_msr_1521686    Hehe that was just banter dude😉 u r as much in...
facebook_corpus_msr_1723412    Once or twice or thrice a month or even a  yea...
facebook_corpus_msr_2082444    I have talked of cyber crime not of theft by p...
facebook_corpus_msr_470758           Y not let all elderly ppl to go directly ??
facebook_corpus_msr_483847     i am not from m.p\nAftr that i am happy , that...
facebook_corpus_msr_1735788    you already got a reserve country in 1947, you...
facebook_corpus_msr_382247     anybody from pak now in India, arrest them eve...
facebook_corpus_msr_382138     Ek bar odr do modi ki jarurat nai 10 sar lanay...
facebook_corpus_msr_327824     If you introspect, then one question comes in ...
facebook_corpus_msr_19

In [40]:
# Below I'm using keras's Tokenizer function, so that I can generate a numerical representation of each comment.
# This was taken from Lab 9.
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = 2000)

tokenizer.fit_on_texts(xTrain)

xCV = tokenizer.texts_to_sequences(x)

X_train = tokenizer.texts_to_sequences(xTrain)
X_test = tokenizer.texts_to_sequences(xTest)

vocab_size = len(tokenizer.word_index) + 1

print(xTrain[0])
print(X_train[0])
vocab_size

['then', 'you', 'should', 'also', 'watch', 'pappu', 'he', 'cracks', 'much', 'bigger', 'jokes', 'even', 'he', 'can', 'bring', 'earthquake', 'listen', 'to', 'him', 'you', 'll', 'enjoy']
[68, 10, 45, 72, 350, 19, 141, 1537, 76, 19, 32, 449, 1168, 2, 81, 10, 577, 1396]


19256

In [52]:
# Now I'm importing the gensim package, so that I can use Word2Vec to find word embeddings.
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument
from gensim.models.keyedvectors import KeyedVectors

In [53]:
# It was recommended that I check how many cpu's are in my system, so that I don't upset my computer.
import multiprocessing
cores = multiprocessing.cpu_count()
print(cores)

4


In [54]:
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [55]:
all_x = xTrain + xTest
all_x_w2v = labelize_tweets_ug(all_x, 'all')

AttributeError: 'float' object has no attribute 'split'

In [281]:
#First I'm creating a Word2Vec Continuous Bag of Words model.
w2vSG0_model = Word2Vec(size = 100, window = 2, min_count = 10, workers = cores - 1, sg = 0, alpha=0.065, min_alpha=0.065)


In [282]:
w2vSG0_model.build_vocab(all_x_w2v, progress_per=10000)

In [283]:
# I'm training this model for 100 epochs because I'm hoping a large number iterations might be able to generate relatively strong vectors.
w2vSG0_model.train(xTrain, total_examples = w2vSG0_model.corpus_count, epochs = 100, report_delay = 1)

(15533997, 23825400)

In [284]:
# Here I'm only keeping the normalized vectors in order to try and save memory
w2vSG0_model.wv.init_sims(replace = True)

In [285]:
# Over the next few cells, I'm just testing out if the model seems to make sense.
w2vSG0_model.wv.most_similar(positive=["peace"])

[('nation', 0.37961965799331665),
 ('https', 0.36920952796936035),
 ('live', 0.35409122705459595),
 ('region', 0.34651491045951843),
 ('youtu', 0.34500783681869507),
 ('peaceful', 0.34375640749931335),
 ('end', 0.3416184186935425),
 ('producing', 0.3347014784812927),
 ('music', 0.33114105463027954),
 ('america', 0.32708632946014404)]

In [286]:
w2vSG0_model.wv.most_similar(positive=["hate"])

[('think', 0.4322175085544586),
 ('support', 0.412882924079895),
 ('are', 0.38888293504714966),
 ('love', 0.37320172786712646),
 ('hatred', 0.3555138111114502),
 ('am', 0.3463572859764099),
 ('r', 0.33274197578430176),
 ('pakistanis', 0.32726243138313293),
 ('kill', 0.3192037343978882),
 ('want', 0.3169896900653839)]

In [287]:
w2vSG0_model.wv.similarity("love", "terrorist")

0.17908365200099705

In [288]:
w2vSG0_model.wv.doesnt_match(['peace', 'islam', 'terrorist'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'peace'

In [289]:
w2vSG0_model.wv.doesnt_match(['cow', 'islam', 'hindu'])

'hindu'

In [None]:
# Here I'm doing basically the same process as before except for skip-gram embeddings instead of CBOW.
w2vSG1_model = Word2Vec(size=100, window = 2, min_count = 5, workers = cores - 1, sg = 1,  alpha=0.065, min_alpha=0.065)

In [None]:
w2vSG1_model.build_vocab(all_x_w2v, progress_per=10000)

In [None]:
w2vSG1_model.train(xTrain, total_examples = w2vSG1_model.corpus_count, epochs = 100, report_delay = 1)

In [None]:
w2vSG1_model.init_sims(replace = True)

In [None]:
w2vSG1_model.wv.most_similar(positive=["peace"])

In [None]:
w2vSG1_model.wv.most_similar(positive=["islam"])

In [None]:
w2vSG1_model.wv.doesnt_match(['peace', 'islam', 'terrorist'])

In [None]:
w2vSG1_model.wv.doesnt_match(['cow', 'islam', 'hindu'])

In [None]:
# I'm using .wv as this will allow me to use just the keyed vectors.
word_vectorsCBOW = w2vSG0_model.wv
word_vectorsSG = w2vSG1_model.wv

In [None]:
print(len(word_vectorsCBOW.wv.vocab.keys()))

In [None]:
embed_index = {}
for word in word_vectorsCBOW.wv.vocab.keys():
    embed_index[word] = np.append(word_vectorsCBOW.wv[word], word_vectorsSG.wv[word])

print('Found %s word vectors.' % len(embed_index))

In [None]:
length = []
for list in xTrain:
    for w in list:
        length.append(len(w))
max(length)

In [None]:
length = []
for list in xTest:
    for w in list:
        length.append(len(w))
max(length)

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

X_train_seq = pad_sequences(X_train, maxlen = 90)
X_test_seq = pad_sequences(X_test, maxlen = 90)
# y_Train_cat = to_categorical(yTrain)
# y_Test_cat = to_categorical(yTest)
# type(X_train_seq)
# X_train_seq = X_train_seq.tolist()
# X_test_seq = X_test_seq.tolist()

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential, Model
# from keras import layers
from keras.layers import Conv1D, MaxPooling1D, Embedding, Flatten, Dense, Input, AveragePooling1D, GlobalMaxPooling1D

In [None]:
Embedding_dim = 200
Max_sequence_length = 90

In [None]:

embed_matrix = np.zeros((vocab_size, Embedding_dim))
for word, i in tokenizer.word_index.items():
    embed_vector = embed_index.get(word)
    if embed_vector is not None:
        embed_matrix[i] = embed_vector


In [None]:
embedding_layer = Embedding(vocab_size,
                            Embedding_dim,
                            weights = [embed_matrix],
                            input_length = Max_sequence_length,
                            trainable = False)

In [None]:
np.array_equal(embed_matrix[13], embed_index.get('this'))
X_train

In [None]:
X_train_seq.shape

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# # compile the model
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

# # evaluate the model
# loss, accuracy, f1_score, precision, recall = model.evaluate(X_test_seq, yTest, verbose=0)

In [None]:
# def create_model():

seq_input = Input(shape = (Max_sequence_length,))
embed_seq = embedding_layer(seq_input)
# embed_seq = embedding_layer(vocab_size, input_length=Max_sequence_length)
# m = Sequential()
# m = Embedding(vocab_size, Embedding_dim, weights = [embed_matrix], input_length = Max_sequence_length, trainable = False)(m)
m = Conv1D(128, 5, activation = 'relu')(embed_seq)
m = GlobalMaxPooling1D()(m)
# m = AveragePooling1D(5)(m)
# m = MaxPooling1D(5)(m)
# m = Flatten()(m)
m = Dense(10, activation = 'relu')(m)
preds = Dense(1, activation = 'softmax')(m)

model = Model(seq_input, preds)
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])
#     return model
model.summary()

# model.fit(X_train_seq, yTrain, validation_data = (X_test_seq, yTest),
#           epochs = 10, batch_size = 128)

In [None]:
# CNN = KerasClassifier(build_fn = create_model, epochs = 10, batch_size = 128)
model.fit(X_train_seq, yTrain, validation_data = (X_test_seq, yTest), epochs = 10, batch_size = 128)
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test_seq, yTest, verbose=False)

In [None]:
loss, accuracy = model.evaluate(X_train_seq, yTrain, verbose=False)
# print("Training Accuracy: {:.4f}".format(accuracy))
# print(X_test_seq.shape)
# print(yTest.shape)

In [None]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_seq, batch_size=128, verbose=1)
# y_pred_bool = np.argmax(y_pred, axis=1)

target_names = ['1', '2', '3']
# print(classification_report(yTest, y_pred))
classification_report(yTest, y_pred, target_names = target_names)

In [None]:
y_pred.shape

In [None]:
yTest.shape