## OLID (Offensive Language Identification Dataset)
### Predicting the Type and Target of Offensive Posts in Social Media

### Import Libraries

In [None]:
import pandas as pd
import string
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import seaborn as sns

from warnings import filterwarnings
filterwarnings("ignore")

### Load the data

In [None]:
# Load the OLID training set (tab-separated file).
train_data = pd.read_csv('data/olid-training-v1.0.tsv', delimiter='\t', encoding='utf-8')

# Explicit copy: clean_tweets() later rewrites this frame's 'tweet' column in
# place, and that must not be tied to train_data.
train_tweets = train_data[['tweet']].copy()

# Extract the labels of each subtask and rename the column to class_<task>.
# rename() is used instead of writing into `columns.values`, which mutates the
# Index's backing array behind pandas' back.
train_task_a_labels = train_data[['subtask_a']].rename(columns={'subtask_a': 'class_a'})
train_task_b_labels = train_data[['subtask_b']].rename(columns={'subtask_b': 'class_b'})
train_task_c_labels = train_data[['subtask_c']].rename(columns={'subtask_c': 'class_c'})

In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a set, for fast membership tests in clean_tweets().
# The corpus reader is imported under an alias so that this assignment does not
# overwrite it; re-running the cell therefore keeps working.
stopwords = set(nltk_stopwords.words("english"))


### Data preprocessing

In [None]:
#Function to clean tweets in a data frame's tweet column
def clean_tweets(df, stop_words=None):
    """Normalise the ``tweet`` column of *df* in place.

    Steps, applied to every row in this order:

    1. drop stop words (case-insensitive match on whitespace-separated tokens)
    2. remove the ``@USER`` and ``URL`` placeholders
    3. replace ``&amp`` with ``and``; remove ``&lt`` and ``&gt``
    4. remove digits
    5. lowercase
    6. remove ASCII punctuation
    7. drop non-ASCII characters (this is what removes emojis)
    8. strip leading/trailing whitespace

    Args:
        df: DataFrame with a ``tweet`` column. Other columns are left untouched.
        stop_words: optional collection of lowercase words to drop. When
            omitted, the module-level ``stopwords`` set is used.

    Returns:
        None. ``df`` is modified in place.
    """
    if stop_words is None:
        stop_words = stopwords

    def _drop_stop_words(text):
        # split()/join() also collapses runs of whitespace inside the tweet.
        return " ".join(
            word for word in text.split() if word.lower() not in stop_words
        )

    # astype(str) guarantees the .str accessor works even if a value is missing.
    tweets = df['tweet'].astype(str).map(_drop_stop_words)

    # Literal substitutions: regex=False so the result does not depend on the
    # pandas version's default for `regex`.
    tweets = tweets.str.replace('@USER', '', regex=False) #Remove mentions (@USER)
    tweets = tweets.str.replace('URL', '', regex=False) #Remove URLs
    tweets = tweets.str.replace('&amp', 'and', regex=False) #Replace ampersand (&) with and
    tweets = tweets.str.replace('&lt', '', regex=False) #Remove &lt
    tweets = tweets.str.replace('&gt', '', regex=False) #Remove &gt

    # This one IS a pattern, so regex must be requested explicitly.
    tweets = tweets.str.replace(r'\d+', '', regex=True) #Remove numbers
    tweets = tweets.str.lower() #Lowercase

    # Remove punctuation in a single pass; translate() treats every character
    # literally, so regex metacharacters such as '.' or '*' are safe.
    tweets = tweets.str.translate(str.maketrans('', '', string.punctuation))

    # Remove emojis (and any other non-ASCII character)
    tweets = tweets.str.encode('ascii', 'ignore').str.decode('ascii')

    df['tweet'] = tweets.str.strip() #Trim leading and trailing whitespaces

In [None]:
# Clean the training tweets; clean_tweets() rewrites the 'tweet' column of the frame it is given.
clean_tweets(train_tweets)

In [None]:
# Attach each subtask's labels to the cleaned tweets (join aligns on the row index).
train_task_a_data = train_tweets.join(train_task_a_labels)
# Drop records with missing values for subtasks B and C.
train_task_b_data = train_tweets.join(train_task_b_labels).dropna()
train_task_c_data = train_tweets.join(train_task_c_labels).dropna()

#Apply quotes to cleaned tweets
# Series.map replaces DataFrame.applymap, which is deprecated in recent pandas.
for task_data in (train_task_a_data, train_task_b_data, train_task_c_data):
    task_data['tweet'] = task_data['tweet'].map('\'{}\''.format)

# train_task_a_data.to_csv('olid_training_a.csv', index=None)
# train_task_b_data.to_csv('olid_training_b.csv', index=None)
# train_task_c_data.to_csv('olid_training_c.csv', index=None)

In [None]:
train_task_a_data.head()

### Preparing the test sets

In [None]:
def _load_test_set(tweets_path, labels_path, label_column):
    """Load one OLID test split as a cleaned, labelled DataFrame.

    Args:
        tweets_path: tab-separated file containing the tweets (with an ``id`` column).
        labels_path: header-less CSV whose two columns are read as ``id`` and
            *label_column*.
        label_column: name given to the label column (e.g. ``'class_a'``).

    Returns:
        DataFrame without the ``id`` column, with the ``tweet`` column cleaned
        by clean_tweets() and wrapped in single quotes.
    """
    #Read tweets and their labels
    tweets = pd.read_csv(tweets_path, delimiter='\t', encoding='utf-8')
    labels = pd.read_csv(labels_path, encoding='utf-8',
                         index_col=False, names=['id', label_column])

    #Merge tweets with labels by id, then drop the id column
    merged = tweets.merge(labels, on='id').drop(columns='id')

    #Clean tweets (in place)
    clean_tweets(merged)

    #Apply quotes to cleaned tweets
    # Series.map replaces DataFrame.applymap, which is deprecated in recent pandas.
    merged['tweet'] = merged['tweet'].map('\'{}\''.format)
    return merged


test_tweet_a = _load_test_set('data/testset-levela.tsv', 'data/labels-levela.csv', 'class_a')
test_tweet_b = _load_test_set('data/testset-levelb.tsv', 'data/labels-levelb.csv', 'class_b')
test_tweet_c = _load_test_set('data/testset-levelc.tsv', 'data/labels-levelc.csv', 'class_c')

#Export to csv file
# test_tweet_a.to_csv('olid_test_a.csv', index=None,header=True)
# test_tweet_b.to_csv('olid_test_b.csv', index=None, header=True)
# test_tweet_c.to_csv('olid_test_c.csv', index=None, header=True)

test_tweet_a.head()

In [None]:
# Number of training tweets per subtask A class, printed and shown as a bar chart

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(6,4))
train_class_counts = train_task_a_data.groupby('class_a').class_a.count()
print("train data", train_class_counts)
train_class_counts.plot.bar(ylim=0)
plt.show()

In [None]:
# Number of test tweets per subtask A class, printed and shown as a bar chart

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(6,4))
test_class_counts = test_tweet_a.groupby('class_a').class_a.count()
print("test data", test_class_counts)
test_class_counts.plot.bar(ylim=0)
plt.show()

### 1st Model - Multinomial Naive Bayes

In [None]:
# Model

#creating labelEncoder
encoder = LabelEncoder()
# Converting string labels into numbers.
# The mapping is learned from the training labels only ...
train_task_a_data["class_a_code"] = encoder.fit_transform(train_task_a_data["class_a"])
# ... and merely applied to the test labels. Re-fitting here could assign
# different codes to the same label and silently corrupt the evaluation.
test_tweet_a["class_a_code"] = encoder.transform(test_tweet_a["class_a"])

In [None]:
# Pair every label with its integer code so the encoding can be inspected
label_and_code = train_task_a_data[['class_a', 'class_a_code']]
train_task_a_data['class-tuple'] = label_and_code.apply(tuple, axis=1)
class_a = train_task_a_data['class-tuple'].unique()
class_a

In [None]:
# Features are the cleaned tweets, targets the encoded subtask A labels;
# the train and test splits come from the separately loaded frames.
X_train, y_train = train_task_a_data['tweet'], train_task_a_data['class_a_code']
X_test, y_test = test_tweet_a['tweet'], test_tweet_a['class_a_code']

print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))

In [None]:
# MultinomialNB Model
# TF-IDF features over unigrams and bigrams feed a multinomial Naive Bayes classifier.
model= MultinomialNB()
pipeline_Mnv = Pipeline([('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=5, 
                                                        norm='l2', encoding='latin-1', 
                                                        ngram_range=(1, 2), stop_words='english')),
                         ('classifier', model)])

pipeline_Mnv.fit(X_train, y_train)
y_pred = pipeline_Mnv.predict(X_test)
accuracy_Mnv = accuracy_score(y_test, y_pred)
print("model accuracy:",accuracy_Mnv)
print("\n")
# Per-class precision / recall / F1 on the test set
print(metrics.classification_report(y_test, y_pred))

### 2nd Model - LSTM Text Classification

In [None]:
train_task_b_data

In [None]:
train_task_b_data.info()

In [None]:
# Class balance of subtask B in the training data.
# The Series is passed as `x=`: newer seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
# `plt` is the matplotlib.pyplot module imported in an earlier cell.
sns.countplot(x=train_task_b_data.class_b)
plt.xlabel('Label')
plt.title('Number of TIN(Targeted Insults) and UNT(Untargeted) tweets')

In [None]:
# Subtask B: tweets as features, class_b as target
X_train = train_task_b_data.tweet
Y_train = train_task_b_data.class_b

X_test = test_tweet_b.tweet
Y_test = test_tweet_b.class_b


# Encode the string labels as integers and reshape them to column vectors.
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_train = Y_train.reshape(-1,1)

# Reuse the mapping learned on the training labels; re-fitting on the test
# labels could assign different codes to the same class.
Y_test = le.transform(Y_test)
Y_test = Y_test.reshape(-1,1)


In [None]:
test_tweet_b

In [None]:
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence
from keras.optimizers import RMSprop
from keras.models import Model


In [None]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the Embedding input size in RNN()
max_words = 1000
# Length every sequence is padded to (also the model's input shape in RNN())
max_len = 150
tok = Tokenizer(num_words=max_words)
# The tokenizer is fitted on the training tweets only
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [None]:
def RNN():
    """Build the (uncompiled) LSTM classifier.

    Reads the module-level ``max_len`` (input length) and ``max_words``
    (embedding vocabulary size). Architecture: Embedding(50) -> LSTM(64) ->
    Dense(256) + ReLU -> Dropout(0.5) -> Dense(1) + sigmoid.

    Returns:
        A Keras ``Model`` mapping the ``inputs`` layer to the sigmoid output.
    """
    token_ids = Input(name='inputs',shape=[max_len])
    embedded = Embedding(max_words,50,input_length=max_len)(token_ids)
    encoded = LSTM(64)(embedded)
    hidden = Dense(256,name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    score = Dense(1,name='out_layer')(hidden)
    probability = Activation('sigmoid')(score)
    return Model(inputs=token_ids,outputs=probability)

In [None]:
# Build the LSTM network, print its layer summary, then compile it for
# binary classification (binary cross-entropy loss, RMSprop, accuracy metric).
# NOTE: this rebinds `model`, which previously held the MultinomialNB classifier.
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [None]:
# Train for at most 10 epochs with 20% of the training data used for validation;
# the callback monitors the validation loss with a minimum improvement of 0.0001.
early_stopping = EarlyStopping(monitor='val_loss',min_delta=0.0001)
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[early_stopping])

In [None]:
# Convert the test tweets with the tokenizer fitted on the training data,
# then pad them to the same length as the training sequences.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [None]:
# Evaluate on the test set; the result is indexed below as [loss, accuracy]
accr = model.evaluate(test_sequences_matrix,Y_test)

In [None]:
# Report test loss (accr[0]) and accuracy (accr[1]) to three decimals
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))