In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
#load data
dataFrameTraining = pd.read_csv("data/fetr.csv")
dataFrameTesting = pd.read_csv("data/fete.csv")

dftr = dataFrameTraining
dfte = dataFrameTesting

In [3]:
dftr.head()

Unnamed: 0,Comments,obscene,threat,insult,hate,Intolerant,comment_length,sentiment
0,I will break the neck of this muslim boy who i...,0,1,0,0,0,62,-0.4
1,Can any one help me to break the neck of my ne...,0,1,0,0,0,49,0.0
2,We should break the neck of all the muslims.,0,1,0,0,1,36,0.0
3,Everyone should unite and break the neck of th...,0,1,0,0,1,50,0.0
4,Let's group up and kill Ahmed.,0,1,0,0,0,25,0.0


In [4]:
stopWords = nltk.corpus.stopwords.words('english')
new_stop_words = ['?','!',',','.',';','&','>','<',')','(','/','\'s','\'\'','``']
stopWords.extend(new_stop_words)
new_stop_words_1 = ['I','thi','He','We','hi','everi','like','boy','march']
stopWords.extend(new_stop_words_1)

ps = nltk.PorterStemmer()

In [5]:
#for countvectorizer
def comment_clean_cv(comment):
    comment = "".join([word.lower() for word in comment if word not in string.punctuation])
    tokens = re.split('\W+', comment)
    comment = [ps.stem(word) for word in tokens if word not in stopWords]
    return comment

### CountVectorizer/Unigram

In [6]:
# countvectorizer
countVec = CountVectorizer(analyzer=comment_clean_cv)
X_countsCV = countVec.fit_transform(dftr['Comments'])
X_countsCV.shape

(140, 319)

In [7]:
countVec.get_feature_names()[5:15]

['aadil',
 'aaeedah',
 'aafia',
 'aafreeda',
 'aahila',
 'aalia',
 'aamir',
 'aasif',
 'aatif',
 'aazad']

In [8]:
X_countsCVDF = pd.DataFrame(X_countsCV.toarray())
X_countsCVDF.columns = countVec.get_feature_names()
X_countsCVDF.head()

Unnamed: 0,Unnamed: 1,5,72,9,aabirah,aadil,aaeedah,aafia,aafreeda,aahila,...,wit,wonder,world,worth,wrong,xxxcom,year,zeya,zinna,zubair
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# save to csv

### Bigram

In [10]:
dftr = dataFrameTraining
dfte = dataFrameTesting

In [11]:
def comment_clean_ngram(comment):
    comment = "".join([word.lower() for word in comment if word not in string.punctuation])
    tokens = re.split('\W+', comment)
    comment = " ".join([ps.stem(word) for word in tokens if word not in stopWords])
    return comment

dftr['Comments'] = dftr['Comments'].apply(lambda x: comment_clean_ngram(x))
dftr.head()

Unnamed: 0,Comments,obscene,threat,insult,hate,Intolerant,comment_length,sentiment
0,break neck muslim classmat fuck ratiq,0,1,0,0,0,62,-0.4
1,one help break neck neighbour rehman,0,1,0,0,0,49,0.0
2,break neck muslim,0,1,0,0,1,36,0.0
3,everyon unit break neck islamist,0,1,0,0,1,50,0.0
4,let group kill ahm,0,1,0,0,0,25,0.0


In [12]:
bigramVec = CountVectorizer(ngram_range=(2,2))
X_countsBigram = bigramVec.fit_transform(dftr['Comments'])
X_countsBigram.shape

(140, 507)

In [13]:
X_countsBigramDF = pd.DataFrame(X_countsBigram.toarray())
X_countsBigramDF.columns = bigramVec.get_feature_names()
X_countsBigramDF.head()

Unnamed: 0,72 time,aabirah dump,aadil pervert,aaeedah reveng,aafia venom,aafreeda march,aahila bitch,aalia neighbour,aamir pervert,aasif pimp,...,wife boss,wish best,wit everi,wonder make,world everi,world wide,xxxcom certifi,zeya arsehol,zinna dunc,zubair biggest
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#save bigram to csv

### TfIDf

In [15]:
tfidfVec = TfidfVectorizer(analyzer=comment_clean_cv, max_features=100)
X_tfidf = tfidfVec.fit_transform(dftr['Comments'])
X_tfidf.shape

(140, 100)

In [16]:
X_tfidfDF = pd.DataFrame(X_tfidf.toarray())
X_tfidfDF.columns = tfidfVec.get_feature_names()
X_tfidfDF.head()

Unnamed: 0,Unnamed: 1,akbar,alia,arsehol,asshol,aymaan,back,bad,becom,bitch,...,time,today,trump,unit,vagina,venom,villag,wanker,whore,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.505386,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
X_tfidfDF.columns

Index(['', 'akbar', 'alia', 'arsehol', 'asshol', 'aymaan', 'back', 'bad',
       'becom', 'bitch', 'bodi', 'bomb', 'break', 'choke', 'classmat', 'cloth',
       'come', 'cretin', 'dipstick', 'dislik', 'dont', 'dunc', 'end', 'ever',
       'everyon', 'first', 'friend', 'fuck', 'fucker', 'full', 'go', 'good',
       'group', 'guy', 'hate', 'help', 'hou', 'hurt', 'idiot', 'islam',
       'islamist', 'jerk', 'kill', 'know', 'larg', 'learn', 'let', 'london',
       'make', 'marri', 'mohammedan', 'moslem', 'motherfuck', 'mulla',
       'murder', 'muslim', 'nake', 'neck', 'neighbour', 'new', 'nuke', 'one',
       'peopl', 'pervert', 'pimp', 'put', 'ratiq', 'rehman', 'remain',
       'reveng', 'road', 'say', 'seen', 'sell', 'shia', 'singl', 'sister',
       'spit', 'squar', 'stab', 'starv', 'stone', 'stop', 'street', 'stupid',
       'sunni', 'take', 'tear', 'terrorist', 'till', 'time', 'today', 'trump',
       'unit', 'vagina', 'venom', 'villag', 'wanker', 'whore', 'year'],
      dtype='objec

In [88]:
#save tfidf