In [1]:
from nltk.util import pr
# pr - pretty print a sequence of data items
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

import re
import nltk

stemmer = nltk.SnowballStemmer("english")
# stemming removes extra characters from words, like "ing", "s", "es"

from nltk.corpus import stopwords
import string


In [2]:
# Getting stop words
stopword = set(stopwords.words("English"))
stopword

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [3]:
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [8]:
data["labels"] = data['class'].map({0:"No hate and offensive",
                                    1: "Hate speech",
                                    2: "Offensive Language"})
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,labels
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,Offensive Language
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Hate speech
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Hate speech
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Hate speech
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Hate speech


In [9]:
dataset = data[["tweet", "labels"]]
print(dataset.head())

                                               tweet              labels
0  !!! RT @mayasolovely: As a woman you shouldn't...  Offensive Language
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...         Hate speech
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...         Hate speech
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...         Hate speech
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...         Hate speech


In [10]:
print(dataset['tweet'][0])

!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...


## understanding the regex pattern used for cleaning the data.

In [74]:
text1 = dataset['tweet'][11]
text1 = '''my name is [shivam] sharma] , mt 123  23 3 3333 a3sd233 a3sd [assa] https\\www.facebook, www.wikipedia.com hi 
            <word1> <word2> <<word 3>> sds '''

### 1. searching words in [word] bracket  

In [69]:
print(re.findall('\[.*?\]', text1))
print(re.findall('\[.*\]', text1))
# . matches any character, matches 0 or more preceeding tokens, ? make * lazy and allow it to matches as few as possible/ 

['[shivam]', '[assa]']
['[shivam] sharma] , mt 123  23 3 3333 a3sd233 asd [assa]']


### 2. searching urls in text

In [70]:
re.findall('https?://\S+|www\.\S+',  text1)
# \S - matches any non whitespace character
# | (OR) matches either before it or after it  
# \. act as .

['www.facebook,', 'www.wikipedia.com']

## 3. searching words in corner bracket <words> 

In [71]:
re.findall('<.*?>+', text1)

['<word1>', '<word2>', '<<word 3>>']

## 4. finding punctuation in text

In [72]:
re.findall('[%s]'%re.escape(string.punctuation), text1)

['[',
 ']',
 ']',
 ',',
 '[',
 ']',
 '\\',
 '.',
 ',',
 '.',
 '.',
 '<',
 '>',
 '<',
 '>',
 '<',
 '<',
 '>',
 '>']

## 5. finding numbers in text

In [76]:
re.findall('\w*\d\w*', text1)

['123', '23', '3', '3333', 'a3sd233', 'a3sd', 'word1', 'word2', '3']

In [81]:
[word for word in text1.split(' ') if word not in stopword]

['name',
 '[shivam]',
 'sharma]',
 ',',
 'mt',
 '123',
 '',
 '23',
 '3',
 '3333',
 'a3sd233',
 'a3sd',
 '[assa]',
 'https\\www.facebook,',
 'www.wikipedia.com',
 'hi',
 '\n',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '<word1>',
 '<word2>',
 '<<word',
 '3>>',
 'sds',
 '']

### Cleaning the data

In [82]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]'%re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = " ".join(text)
    return text

In [83]:
dataset['tweet'] = dataset['tweet'].apply(clean)
dataset.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['tweet'] = dataset['tweet'].apply(clean)


Unnamed: 0,tweet,labels
0,rt mayasolov woman shouldnt complain clean ho...,Offensive Language
1,rt boy dat coldtyga dwn bad cuffin dat hoe ...,Hate speech
2,rt urkindofbrand dawg rt ever fuck bitch sta...,Hate speech
3,rt cganderson vivabas look like tranni,Hate speech
4,rt shenikarobert shit hear might true might f...,Hate speech
5,tmadisonx shit blow meclaim faith somebodi sti...,Hate speech
6,brighterday sit hate anoth bitch got much shi...,Hate speech
7,caus im tire big bitch come us skinni,Hate speech
8,amp might get ya bitch back amp that,Hate speech
9,rhythmixx hobbi includ fight mariam\r\rbitch,Hate speech


## Training Decision Tree classifier

In [84]:
x = np.array(dataset['tweet'])
y = np.array(dataset['labels'])

cv = CountVectorizer()
x = cv.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [102]:
x.shape

(24783, 25540)

In [101]:
X_train.shape

(17348, 25540)

In [85]:
clf.score(X_test, y_test)

0.8671149966375252

In [94]:
sample = 'I hate you.'
sample = cv.transform([sample]).toarray()
print(clf.predict(sample))

['Hate speech']
