In [143]:
import pandas as pd
from sklearn import datasets
import nltk
# nltk.download('punkt') if not installed
import string
import re

#make output folder
import os
os.makedirs("./output")

In [144]:
train = pd.read_csv('dataset_train.csv')

In [145]:
train

Unnamed: 0,id,Tweets,Label
0,7281,The jokes and puns are flying free in this cam...,none
1,7282,#MKR Lets see who the producers think are goin...,none
2,7283,Praying Jac and Shaz do well! They're my faves...,none
3,7284,RT @realityraver: Pete Evans the Paleo Capital...,none
4,7285,If Kat and Andre stay tonight I will stop watc...,none
...,...,...,...
12824,5884,"RT @immichaeldixon: Katie and Nikki, smug, vac...",sexism
12825,15551,I can barely watch the #MKR episode of Katie a...,sexism
12826,15612,Gay fianc� is not going to cope being away fro...,sexism
12827,15638,#MKR you'd think in her downtime Annie would h...,sexism


In [146]:
train_x = train["Tweets"]
train_x 
train_y = train["Label"].factorize()[0]
train_y

array([0, 0, 0, ..., 2, 2, 2], dtype=int64)

In [148]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
ps = PorterStemmer()

from nltk.corpus import stopwords
#nltk.download('stopwords')


# https://www.datacamp.com/community/tutorials/stemming-lemmatization-python
#https://pythonhealthcare.org/2018/12/14/101-pre-processing-data-tokenization-stemming-and-removal-of-stop-words/
#stem train_x and remove punctuations and stop words
def stemSentence(sentence):
    #exclude gender pronouns and negations
    stopwordsRemove = ['he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'against', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    stopwordsFinal = [x for x in stopwords.words('english') if x not in stopwordsRemove]
    
    #tokenise and remove punctuations
    token_words=word_tokenize(sentence.translate(str.maketrans('', '', string.punctuation)))
    stem_sentence=[]
    for word in token_words:
        if word in stopwordsFinal:
            word = ""
        #remove stopwords
        stem_sentence.append(ps.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def preprocess(tweets_df):
    # remove RT, mentions and URL
    tweets_df = tweets_df.replace("RT", "", regex=True).replace(r"@\b\S+", "", regex=True).replace(r"http\S+", "", regex=True)
    #.replace(r"#\b", "", regex=True) to replace hashtags
    return [stemSentence(tweets) for tweets in tweets_df]

train_x = preprocess(train_x)

In [150]:
train_x

['the joke  pun  fli free   camp episod mkr mkr2015 ',
 'mkr let see   produc think  go   better TV kat  nikki  kati ',
 'pray jac  shaz  well theyr  fave mkr ',
 'pete evan  paleo capitalist   hi cookbook pull mkr ',
 'If kat  andr stay tonight I  stop watch My kitchen rule mkr ',
 'mkr kat  defin fair hypocrit ',
 'twist   plot the bottom 2 team leav I think everyon would  happi mkr ',
 'doesnt anyon   gif  manu say   get  co  would  awesom mkr ',
 'how bianca put   drasko  beyond  mkr ',
 'what iren   mkr ',
 'last night mkr look like  crossov  thewalkingdead zombi head toward  food ',
 'high qualiti food amp   frozen beef cheek mkr ',
 'someon today said  pete evan  mkr  never  film put food  hi mouth Is  true ',
 'It    pleasur  catch    mentor  former boss colin fassnidg mkr 4inhand fourfourteen ',
 'I went   differ thought altogeth   didnt fit  I   simplifi mkr mkr2015 ',
 'mkr  goggleboxau can  go fuck  selv ',
 'By fought hard kat mean vote strateg mkr ',
 'team deserv  oscar 

In [149]:
#from https://vgpena.github.io/classifying-tweets-with-keras-and-tensorflow/
import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
import numpy as np

##obtain max
# create a new Tokenizer
tokenizer = Tokenizer()
# feed our tweets to the Tokenizer
tokenizer.fit_on_texts(train_x)
tokenizer

#get the word counts so we can weed out words used less than x times
word_counts = pd.DataFrame.from_dict(tokenizer.word_counts, orient='index', columns=['count'])
word_counts

Unnamed: 0,count
the,716
joke,84
pun,2
fli,21
free,71
...,...
downtim,1
napol�on,1
perdi,1
learnt,1


In [151]:
#word_length = len(tokenizer.word_index)
#get words with at least 2 counts
max_words = sum(word_counts["count"]>1)
max_words

5676

In [185]:
# create a new Tokenizer
tokenizer = Tokenizer(num_words=max_words)
# feed our tweets to the Tokenizer
tokenizer.fit_on_texts(train_x)

# Tokenizers come with a convenient list of words and IDs
dictionary = tokenizer.word_index
# Let's save this out so we can use it later
with open('output/dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)


def convert_text_to_index_array(text, dictionaryIn):
    # one really important thing that `text_to_word_sequence` does
    # is make all texts the same length -- in this case, the length
    # of the longest text in the set.
    return [dictionaryIn[word] for word in kpt.text_to_word_sequence(text) if word in dictionaryIn]

allWordIndices = []
# for each tweet, change each token to its ID in the Tokenizer's word_index
for text in train_x:
    wordIndices = convert_text_to_index_array(text, dictionary)
    allWordIndices.append(wordIndices)

# now we have a list of all tweets converted to index arrays.
# cast as an array for future usage.
allWordIndices = np.asarray(allWordIndices)

# create one-hot matrices out of the indexed tweets
train_x_processed = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
# treat the labels as categories
train_y_processed = keras.utils.to_categorical(train_y)
#[none, racism, sexism]

In [163]:
train_y_processed

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [153]:
#making the model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy'])

In [154]:
model.fit(train_x_processed, train_y_processed,
  batch_size=32,
  epochs=5,
  verbose=1,
  validation_split=0.1,
  shuffle=True)
#accuracy about 0.9677

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x24499fe90d0>

In [156]:
#saving the file
model_json = model.to_json()
with open('output/model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('output/model.h5')
train_x

['the joke  pun  fli free   camp episod mkr mkr2015 ',
 'mkr let see   produc think  go   better TV kat  nikki  kati ',
 'pray jac  shaz  well theyr  fave mkr ',
 'pete evan  paleo capitalist   hi cookbook pull mkr ',
 'If kat  andr stay tonight I  stop watch My kitchen rule mkr ',
 'mkr kat  defin fair hypocrit ',
 'twist   plot the bottom 2 team leav I think everyon would  happi mkr ',
 'doesnt anyon   gif  manu say   get  co  would  awesom mkr ',
 'how bianca put   drasko  beyond  mkr ',
 'what iren   mkr ',
 'last night mkr look like  crossov  thewalkingdead zombi head toward  food ',
 'high qualiti food amp   frozen beef cheek mkr ',
 'someon today said  pete evan  mkr  never  film put food  hi mouth Is  true ',
 'It    pleasur  catch    mentor  former boss colin fassnidg mkr 4inhand fourfourteen ',
 'I went   differ thought altogeth   didnt fit  I   simplifi mkr mkr2015 ',
 'mkr  goggleboxau can  go fuck  selv ',
 'By fought hard kat mean vote strateg mkr ',
 'team deserv  oscar 

In [179]:
test = pd.read_csv('dataset_test.csv')
test_x = preprocess(test["Tweets"])
test_y = test["Label"].factorize()[0]

#open the dictionary previously created
with open('output/dictionary.json') as json_file:
    test_dictionary = json.load(json_file)
    
test_allWordIndices = []
# for each tweet, change each token to its ID in the Tokenizer's word_index
for text in test_x:
    wordIndices = convert_text_to_index_array(text, test_dictionary)
    test_allWordIndices.append(wordIndices)
    
test_allWordIndices = np.asarray(test_allWordIndices)

# create one-hot matrices out of the indexed tweets
test_x_processed = tokenizer.sequences_to_matrix(test_allWordIndices, mode='binary')

# treat the labels as categories
test_y_processed = keras.utils.to_categorical(test_y)
#[none, racism, sexism]

In [186]:
#analyse testing file
model.evaluate(test_x_processed, test_y_processed)
#accuracy 76.67%

model.evaluate(train_x_processed, train_y_processed)



[0.32421088218688965, 0.9282874464988708]