In [1]:
import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, Dropout, Flatten, Dense, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
from keras.initializers import Constant

Using TensorFlow backend.


## 1. 데이터 로딩 / データのロード

- Data can be downloaded from:
- https://www.kaggle.com/vkrahul/twitter-hate-speech

In [2]:
df = pd.read_csv("train_tweets.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


## 2. 데이터 전처리 / データの前処理

- Create a function to convert contractions.

In [3]:
def convert_contractions(tweet):
    tweet = re.sub("won\'t", "will not", tweet)
    tweet = re.sub("can\'t", "can not", tweet)
    tweet = re.sub("\'re", " are", tweet)
    tweet = re.sub("\'ve", " have", tweet)
    tweet = re.sub("\'ll", " will", tweet)
    tweet = re.sub("\'d", " would", tweet)
    tweet = re.sub("n\'t", " not", tweet)
    tweet = re.sub(r"\'s", " is", tweet)
    tweet = re.sub(r"\'m", " am", tweet)
    return tweet

- Create a pre-processing function to remove unnecessary characters.

In [4]:
def preprocess_tweet(tweet):
    tweet = re.sub("((www\.[^\s]+)|(https?://[^\s]+))", "", tweet.lower())  # remove URLs
    tweet = re.sub("@[^\s]+", '', tweet)  # remove mentions
    tweet = re.sub("#\w*", "", tweet)  # remove hashtags
    tweet = convert_contractions(tweet)  # convert contractions
    tweet = re.sub("[^0-9A-Za-z ]", "", tweet)  # remove special characters and punctuations
    tweet = re.sub("\s{2,}", " ", tweet)  # remove spaces
    return tweet

In [5]:
df["tweet"] = df["tweet"].apply(preprocess_tweet)
df.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so self...
1,2,0,thanks for credit i can not use cause they do...
2,3,0,bihday your majesty
3,4,0,i love u take with u all the time in ur
4,5,0,factsguide society now


## 3. scikit-learn을 이용한 기계학습 / scikit-learnを用いた機械学習

### Feature extraction

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df["tweet"], df["label"], random_state=0)

In [7]:
tfidf = TfidfVectorizer(stop_words="english")
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### Naive Bayes

In [8]:
model = BernoulliNB()
model.fit(X_train_tfidf, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

- The recall of the hate speech class was 0.11 i.e. only about 10% of the actual hate speech could be detected.

In [9]:
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      7460
           1       0.85      0.11      0.20       531

    accuracy                           0.94      7991
   macro avg       0.89      0.56      0.58      7991
weighted avg       0.93      0.94      0.92      7991



### Random Forest

- The f1-score of the hate speech class was increased because the detection rate(=recall) of hate speech was improved.

In [10]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      7460
           1       0.82      0.43      0.56       531

    accuracy                           0.96      7991
   macro avg       0.89      0.71      0.77      7991
weighted avg       0.95      0.96      0.95      7991



## 4. keras를 이용한 기계학습 / kerasを用いた機械学習

### Feature extraction

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df["tweet"], df["label"], random_state=0)

- Vectorize texts and convert to sequences.

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train.values)
x_train = tokenizer.texts_to_sequences(X_train.values)
x_test = tokenizer.texts_to_sequences(X_test.values)
x_train[:3]

[[138, 28, 14, 4686],
 [153, 16, 291],
 [436, 275, 12, 3, 24, 57, 265, 266, 265, 266]]

- Pad the sequences to the same length.

In [8]:
x_train = pad_sequences(x_train)
x_test = pad_sequences(x_test)
x_train[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,  138,   28,   14, 4686],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,  153,   16,  291]])

### LSTM

#### Embedding layer

- The Embedding layer is defined as the first hidden layer of a network.
- input_dim: This is the size of the vocabulary in the text data.
- output_dim: This is the size of the vector space in which words will be embedded.
- input_length: This is the length of input sequences.

#### LSTM layer

- units: This is the number of hidden units of the lstm layer.

In [9]:
vocab_size = len(tokenizer.word_index) + 1
embed_dim = 100
lstm_dim = 50

In [10]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=x_train.shape[1]))
model.add(Dropout(0.1))
model.add(LSTM(units=lstm_dim))
model.add(Dropout(0.1))
model.add(Dense(units=1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 33, 100)           2036300   
_________________________________________________________________
dropout_1 (Dropout)          (None, 33, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 2,066,551
Trainable params: 2,066,551
Non-trainable params: 0
_________________________________________________________________


#### Train a model

In [11]:
batch_size = 32
epochs = 5
early_stopping = EarlyStopping(patience=3, verbose=1) 
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1,
         validation_split=0.1, callbacks=[early_stopping])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 21573 samples, validate on 2398 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 00004: early stopping


<keras.callbacks.callbacks.History at 0x236f4dfac18>

#### Result

- Compared to random forest, recall was increased at the expense of precision.

In [12]:
y_pred = model.predict_classes(x_test, batch_size=batch_size)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      7460
           1       0.66      0.49      0.56       531

    accuracy                           0.95      7991
   macro avg       0.81      0.74      0.77      7991
weighted avg       0.94      0.95      0.95      7991



### CNN using pretrained word embeddings

- See this page.
- https://keras.io/examples/pretrained_word_embeddings/

#### Load the GloVe

In [9]:
embeddings_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

#### Prepare embedding matrix

In [10]:
vocab_size = len(tokenizer.word_index) + 1
embed_dim = 100

In [11]:
embedding_matrix = np.zeros((vocab_size, embed_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

#### Convolutional layer

- filters: This is the number of filters in the convolution. (= the dimensionality of the output)
- kernel_size: This is the length of the 1D convolution window.

In [12]:
filters = 128
kernel_size = 5
dense_hidden_dim = 128

- If you want to connect a dense layer, you must flatten the 2D output matrix to the 1D vector.

In [13]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=x_train.shape[1],
          embeddings_initializer=Constant(embedding_matrix), trainable=False))
model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation="relu"))
model.add(GlobalMaxPooling1D())
# model.add(Flatten())
model.add(Dense(units=dense_hidden_dim, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(units=1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 33, 100)           2036300   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 29, 128)           64128     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 2,117,069
Trainable params: 80,769
Non-trainable params: 2,036,300
_______________________________________

In [14]:
batch_size = 32
epochs = 5
early_stopping = EarlyStopping(patience=3, verbose=1) 
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1,
         validation_split=0.1, callbacks=[early_stopping])

Train on 21573 samples, validate on 2398 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 00004: early stopping


<keras.callbacks.callbacks.History at 0x1b660063f98>

- You can get a model with similar performance much faster.

In [15]:
y_pred = model.predict_classes(x_test, batch_size=batch_size)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      7460
           1       0.65      0.49      0.56       531

    accuracy                           0.95      7991
   macro avg       0.81      0.74      0.77      7991
weighted avg       0.94      0.95      0.95      7991



## 5. 결과 확인 / 結果確認

In [16]:
result = pd.DataFrame(X_test, columns=["tweet"])
result["label"] = y_test
result["label_pred"] = y_pred

- Data predicted as hate speech

In [17]:
pd.set_option('display.max_colwidth', -1)
result.loc[result["label_pred"] == 1].head(10)

Unnamed: 0,tweet,label,label_pred
24123,your comments are reflections of ignorance and,1,1
4534,that rudd woman should think b4 she speaks a statement was asked for all she could do was try and stick the boot,0,1
13251,they are adopting the worst traits of america,0,1
3775,buffalo parents rally to remove trump ally carl paladino from school board after remarks,1,1
151,yes it is when you call a gorilla because racists have long thought of black people as no bet,1,1
10756,hahaha wow that is a statement from a 21 year old browns fan who is never seen a browns playoff game,0,1
13630,why would there be any reason to call out of her name haters,0,1
15624,trump real estate buddy carl paladino wishes obama dead of mad cow disease in 2017,1,1
15420,south sudan allowed soldiers to rape as wages un,0,1
31907,love that your statements came from the man who was rebuked by voters in 2008,0,1


- Data predicted as hate speech, but not actually

In [18]:
result.loc[(result["label_pred"] == 1) & (result["label"] != result["label_pred"])].head(10)

Unnamed: 0,tweet,label,label_pred
4534,that rudd woman should think b4 she speaks a statement was asked for all she could do was try and stick the boot,0,1
13251,they are adopting the worst traits of america,0,1
10756,hahaha wow that is a statement from a 21 year old browns fan who is never seen a browns playoff game,0,1
13630,why would there be any reason to call out of her name haters,0,1
15420,south sudan allowed soldiers to rape as wages un,0,1
31907,love that your statements came from the man who was rebuked by voters in 2008,0,1
24603,the useful idiots on the left are being duped by the alinskyites into giving up the second amendment,0,1
335,watching the leadership embrace amp kiss donald is ass is amp this is now the pay a pay of racism amp hate,0,1
2407,nude rear naughty naked school girls,0,1
11136,nigel farage this is not a happy europe ukip leader nigel farage has suggested that peace in europ,0,1
