In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### OLID (Offensive Language Identification Dataset)
### Predicting the Type and Target of Offensive Posts in Social Media

### Import Libraries

In [None]:
import pandas as pd
import string
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix,classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale,StandardScaler
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence
from keras.optimizers import RMSprop
from keras.models import Model
import tensorflow as tf

from warnings import filterwarnings
filterwarnings("ignore")

### Load the Data

In [None]:
# Load the OLID training set (tab-separated).
train_data=pd.read_csv('../input/olid-dataset/olid-training-v1.0.tsv', delimiter='\t', encoding='utf-8')

# Work on an independent copy: the tweets are cleaned in place later, and a
# plain column selection would be a slice of train_data.
train_tweets = train_data[['tweet']].copy() #Extract tweets

# Extract each subtask's labels and rename the column through the public
# rename() API. Writing into `columns.values` mutates the Index's backing
# array behind pandas' back and is not guaranteed to take effect.
train_task_a_labels = train_data[['subtask_a']].rename(columns={'subtask_a': 'class_a'}) #subtask_a labels
train_task_b_labels = train_data[['subtask_b']].rename(columns={'subtask_b': 'class_b'}) #subtask_b labels
train_task_c_labels = train_data[['subtask_c']].rename(columns={'subtask_c': 'class_c'}) #subtask_c labels

print(train_data)

### Text Preprocessing

In [None]:
def clean_tweets(df):
    """Normalise the ``tweet`` column of ``df`` in place.

    The steps run in this order:

    1. drop the ``@USER`` and ``URL`` placeholders,
    2. replace ``&amp`` with ``and`` and drop ``&lt`` / ``&gt``,
    3. drop runs of digits,
    4. lowercase,
    5. drop every character in ``string.punctuation``,
    6. drop every non-ASCII character (this is what removes emojis),
    7. trim leading and trailing whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column. Only that column is modified; any
        other column is left untouched.

    Returns
    -------
    None
        The frame is modified in place.
    """
    tweets = df['tweet']

    # Literal substitutions. regex=False is spelled out so the result does not
    # depend on which default the installed pandas version uses.
    literal_substitutions = (
        ('@USER', ''),   # mentions
        ('URL', ''),     # URL placeholder
        ('&amp', 'and'), # ampersand entity
        ('&lt', ''),
        ('&gt', ''),
    )
    for old, new in literal_substitutions:
        tweets = tweets.str.replace(old, new, regex=False)

    # This one IS a regular expression, so it must be requested explicitly;
    # with regex=False the pattern would be searched for literally.
    tweets = tweets.str.replace(r'\d+', '', regex=True)

    tweets = tweets.str.lower()

    # Delete all punctuation in a single pass. A translation table treats every
    # character literally, so regex metacharacters such as '.' or '(' are safe.
    tweets = tweets.str.translate(str.maketrans('', '', string.punctuation))

    # Encoding to ASCII with errors ignored discards emojis and any other
    # non-ASCII character. Only the tweet column is processed.
    tweets = tweets.astype(str).str.encode('ascii', 'ignore').str.decode('ascii')

    tweets = tweets.str.strip()

    df.loc[:, 'tweet'] = tweets

In [None]:
clean_tweets(train_tweets)

In [None]:
# Re-attach each subtask's labels to the cleaned tweets (joined on the row index).
train_task_a_data = train_tweets.join(train_task_a_labels)

train_task_b_data = train_tweets.join(train_task_b_labels)
train_task_b_data = train_task_b_data.dropna() #Drop records with missing values

train_task_c_data = train_tweets.join(train_task_c_labels)
train_task_c_data = train_task_c_data.dropna() #Drop records with missing values

#Apply quotes to cleaned tweets
# Series.map does the element-wise formatting; DataFrame.applymap, used here
# before, is deprecated in recent pandas releases.
quote_tweet = '\'{}\''.format
train_task_a_data.update(train_task_a_data['tweet'].map(quote_tweet))
train_task_b_data.update(train_task_b_data['tweet'].map(quote_tweet))
train_task_c_data.update(train_task_c_data['tweet'].map(quote_tweet))

In [None]:
train_task_a_data.head()

In [None]:
#Read tweets from test sets
test_tweet_a=pd.read_csv('../input/olid-dataset/testset-levela.tsv', delimiter='\t', encoding='utf-8')
test_tweet_b=pd.read_csv('../input/olid-dataset/testset-levelb.tsv', delimiter='\t', encoding='utf-8')
test_tweet_c=pd.read_csv('../input/olid-dataset/testset-levelc.tsv', delimiter='\t', encoding='utf-8')

#Read tweet labels (column names are supplied explicitly through `names`)
test_label_a=pd.read_csv('../input/olid-dataset/labels-levela.csv', encoding='utf-8',
                         index_col=False, names=['id', 'class_a'])
test_label_b=pd.read_csv('../input/olid-dataset/labels-levelb.csv', encoding='utf-8',
                         index_col=False, names=['id', 'class_b'])
test_label_c=pd.read_csv('../input/olid-dataset/labels-levelc.csv', encoding='utf-8',
                         index_col=False, names=['id', 'class_c'])

#Merge tweets with labels by id, then drop the id column (only needed for the merge)
test_tweet_a = test_tweet_a.merge(test_label_a, on='id').drop(columns='id')
test_tweet_b = test_tweet_b.merge(test_label_b, on='id').drop(columns='id')
test_tweet_c = test_tweet_c.merge(test_label_c, on='id').drop(columns='id')

#Clean tweets in test sets, then apply quotes to the cleaned tweets.
# Series.map replaces the deprecated DataFrame.applymap for the quoting step.
for test_frame in (test_tweet_a, test_tweet_b, test_tweet_c):
    clean_tweets(test_frame)
    test_frame.update(test_frame['tweet'].map('\'{}\''.format))

test_tweet_a.head()

In [None]:
def count_classes(df,task):
    """Draw a bar chart of the number of rows per class in column ``task``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the labels.
    task : str
        Name of the label column to count, e.g. ``"class_a"``.

    Returns
    -------
    None
        The chart is drawn on the current matplotlib figure.
    """
    # Pass the column by keyword: in current seaborn the first positional
    # parameter of countplot is `data`, so a bare positional Series is not
    # interpreted as the x variable.
    ax = sns.countplot(x=df[task])
    ax.set_title("Classes in dataset")

count_classes(train_task_a_data,"class_a")

In [None]:
count_classes(test_tweet_a,"class_a")

### Bidirectional LSTM

### TASK A

In [None]:
# Creating labelEncoder
encoder = LabelEncoder()
# Converting string labels into numbers.
# The label -> integer mapping is learned from the training labels only and
# then re-used for the test labels. Refitting on the test set could assign
# different codes if its set of classes differs; transform() instead raises
# on a label that was never seen during fitting.
train_task_a_data["class_a_code"] = encoder.fit_transform(train_task_a_data["class_a"])
test_tweet_a["class_a_code"] = encoder.transform(test_tweet_a["class_a"])

In [None]:
#Create tuple pair for class and class code
# Each row gets a (label, code) tuple, stored in a new 'class-tuple' column,
# so the mapping produced by the encoder can be inspected.
train_task_a_data['class-tuple'] = train_task_a_data[['class_a', 'class_a_code']].apply(tuple, axis=1)
# The distinct pairs act as a legend for the encoded labels; the bare name on
# the last line displays them as the cell output.
class_a = train_task_a_data['class-tuple'].unique()
class_a

In [None]:
# Label the Data

# Features and integer labels for Task A training.
train_tweets = train_task_a_data['tweet']
train_labels = train_task_a_data['class_a_code']

# Features and integer labels of the Task A test set.
X_test = test_tweet_a['tweet']
y_test = test_tweet_a['class_a_code']


# Fixed: the variables are train_tweets / train_labels; the names used here
# before (train_tweet / train_label) were never defined and raised NameError.
print("Shape of train_tweet is {} and shape of train_label is {}".format(train_tweets.shape, train_labels.shape))
print("Shape of test_tweet is {} and shape of test_label is {}".format(X_test.shape, y_test.shape))

In [None]:
# Hold out 30% of the training tweets as a validation set; random_state fixes
# the shuffle so the split is identical on every run.
X_train,X_val,y_train,y_val = train_test_split(train_tweets,train_labels,test_size=0.30,random_state=42)


# X_test / y_test are not touched by the split above.
print("Shape of test_tweets is {} and shape of test_labels is {}".format(X_test.shape, y_test.shape))

In [None]:
# TOKENIZER
max_features = 10000  # passed to the Tokenizer as num_words and to the Embedding layer as its input size
embedding_dim = 128  # second argument of the Embedding layer when the model is built
max_len=500  # length every sequence is padded/truncated to

# The vocabulary is fitted on the training split only; '</OOV>' is the
# placeholder token configured for out-of-vocabulary words.
tokenizer=Tokenizer(num_words=max_features,oov_token='</OOV>')
tokenizer.fit_on_texts(X_train.values)
dic=tokenizer.word_index  # word -> integer index mapping learned by the tokenizer
#print(dic)

In [None]:
# TRAIN
# Turn each training tweet into a list of vocabulary indices, then bring every
# list to length max_len so the result is a rectangular array.
X_train_seq = tokenizer.texts_to_sequences(X_train.values)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)

print("train data tensor:" ,X_train_pad.shape)

In [None]:
# VAL
# Encode the validation tweets with the tokenizer fitted on the training split
# and bring them to length max_len so they match the training tensor width.
X_val_seq = tokenizer.texts_to_sequences(X_val.values)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

# Label fixed: this tensor is the validation split, not the test set.
print("validation data tensor:" ,X_val_pad.shape)

In [None]:
# TEST
# Same encoding and padding as the training split, applied to the test tweets.
X_test_seq = tokenizer.texts_to_sequences(X_test.values)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

print("test data tensor:" ,X_test_pad.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Bidirectional

In [None]:
# Binary classifier: embedding -> bidirectional LSTM -> dense -> dropout -> sigmoid unit.
model = Sequential([
    Embedding(max_features, embedding_dim, input_length=max_len),
    Bidirectional(LSTM(128, dropout=0.3)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

In [None]:
model.summary()

In [None]:
import tensorflow as tf
# Metrics reported next to the loss during training and evaluation.
METRICS = [tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall")]


# Binary cross-entropy pairs with the single sigmoid output unit of the model.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=METRICS)

In [None]:
epochs = 10

# Fit the model on the training split and validate on the held-out validation split.
history = model.fit(X_train_pad,y_train,validation_data=(X_val_pad,y_val), epochs=epochs)

In [None]:
scores = model.evaluate(X_test_pad, y_test, return_dict=True)

print(scores)

In [None]:
y_pred = model.predict(X_test_pad)

In [None]:
# Turn the predicted probabilities into hard 0/1 labels, in place:
# strictly above 0.5 becomes 1, everything else becomes 0.
y_pred[:] = np.where(y_pred > 0.5, 1, 0)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
def plot_conf_matrix(test_label,predicted_label,class_names=None):
    """Plot the confusion matrix of ``predicted_label`` against ``test_label``.

    Parameters
    ----------
    test_label : array-like
        True class labels.
    predicted_label : array-like
        Predicted class labels, one per entry of ``test_label``.
    class_names : sequence, optional
        Tick labels for the rows/columns, one per class in the matrix. When
        omitted, the classes are labelled ``0 .. n_classes - 1``, whatever the
        number of classes is.

    Returns
    -------
    None
        The heatmap is drawn on a new matplotlib figure.

    Raises
    ------
    ValueError
        If ``class_names`` does not have one entry per class in the matrix.
    """
    cm = confusion_matrix(test_label,predicted_label)
    # Derive the tick labels from the matrix itself instead of hard-coding two
    # classes, so the same helper also works for the three-class task.
    if class_names is None:
        class_names = list(range(cm.shape[0]))
    fig, ax = plt.subplots()
    # create heatmap; the frame's index/columns become the tick labels, and the
    # axes are passed explicitly rather than relying on the "current axes".
    sns.heatmap(pd.DataFrame(cm, index=class_names, columns=class_names),
                annot=True, cmap="BuPu" ,fmt='g', ax=ax)
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label');

plot_conf_matrix(y_test,y_pred)

### TASK B

In [None]:
# Our predicted OFF - offensive data
# Attach the Task A predictions to the Task A test frame column-wise (rows are
# paired by index), then keep the rows whose prediction is 1.
# NOTE(review): code 1 is presumably OFF — confirm against the class_a pairs shown earlier.
df_pred = pd.DataFrame(data=y_pred,columns=["predicted"])
predicted_data = pd.concat([test_tweet_a, df_pred],axis=1)
off_data = predicted_data[predicted_data.predicted ==1]
off_data.head()

In [None]:
# prepare test data for task B
# Match the predicted-offensive tweets with the Task B test set on the cleaned
# tweet text, then keep only the text and the Task B label.
# NOTE(review): joining on text rather than an id will multiply rows if the
# same cleaned text occurs more than once on either side — verify.
test_b = off_data.merge(test_tweet_b,on="tweet")
test_b = test_b[["tweet","class_b"]]
print(test_b.shape)
test_b.head()

In [None]:
# prepare train data for task B
# The Task A and Task B frames share the original row index, so the Task B
# label is joined onto the Task A rows directly. Concatenating the two whole
# frames duplicated the 'tweet' column and needed a double transpose to
# remove the duplicate again.
df_a_b = train_task_a_data[["tweet","class_a"]].join(train_task_b_data[["class_b"]])
df_a_b = df_a_b[df_a_b.class_a =="OFF"]
# Keep only rows that actually carry a Task B label.
df_a_b = df_a_b[["tweet","class_b"]].dropna(subset=["class_b"])
train_b = df_a_b.copy()
print("Data shape without undersampling:",train_b.shape)
train_b.head()

# Cap the TIN class at 1000 rows. random_state makes the drawn sample (and so
# every downstream result) reproducible; min() avoids an error should fewer
# than 1000 TIN rows be available.
train_b_unt = train_b[train_b["class_b"] == "UNT"]
train_b_tin = train_b[train_b["class_b"] == "TIN"]
train_b_tin = train_b_tin.sample(min(1000, len(train_b_tin)), random_state=42)
train_b_new = pd.concat([train_b_unt,train_b_tin])
print("Data shape with undersampling",train_b_new.shape)
train_b_new.head()

In [None]:
# Before under-sampling the train data
count_classes(train_b,"class_b")

In [None]:
# After under-sampling the train data
count_classes(train_b_new,"class_b")

In [None]:
# Plot test data 
count_classes(test_b,"class_b")

In [None]:
#creating labelEncoder
encoder = LabelEncoder()
# Converting string labels into numbers.
# The mapping is fitted once, on the full Task B training labels, and then
# applied unchanged to the under-sampled training set and to the test set, so
# a given label always receives the same code. transform() raises on a label
# that was not present when fitting.
train_b["class_b_code"] = encoder.fit_transform(train_b["class_b"])
train_b_new["class_b_code"] = encoder.transform(train_b_new["class_b"])
test_b["class_b_code"] = encoder.transform(test_b["class_b"])

In [None]:
#Create tuple pair for class and class code
# Each row gets a (label, code) tuple in a new 'class-tuple' column; the
# distinct pairs show which integer stands for which Task B label.
train_b['class-tuple'] = train_b[['class_b', 'class_b_code']].apply(tuple, axis=1)
class_b = train_b['class-tuple'].unique()
class_b

In [None]:
# Set the labels and train-test split

# Task B training features/labels come from the under-sampled frame.
train_tweets = train_b_new['tweet']
train_labels = train_b_new['class_b_code']

# Task B test features/labels come from test_b (predicted-offensive tweets
# matched to the Task B test set).
X_test = test_b['tweet']
y_test = test_b['class_b_code']

# Hold out 30% of the training rows for validation; the seed fixes the split.
X_train,X_val,y_train,y_val = train_test_split(train_tweets,train_labels,test_size=0.30,random_state=42)

print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))

# NOTE(review): this repeats the X_test / y_test shapes printed just above.
print("Shape of test_tweets is {} and shape of test_labels is {}".format(X_test.shape, y_test.shape))

In [None]:
# TRAIN
# Encodes with the tokenizer created earlier in the notebook; no cell shown
# here refits it on the Task B tweets, so its vocabulary is the one learned
# from the earlier training split.
X_train_seq = tokenizer.texts_to_sequences(X_train.values)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)

print("train data tensor:" ,X_train_pad.shape)

In [None]:
# VAL
# Encode the validation tweets with the existing tokenizer and bring them to
# length max_len so they match the training tensor width.
X_val_seq = tokenizer.texts_to_sequences(X_val.values)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

# Label fixed: this tensor is the validation split, not the test set.
print("validation data tensor:" ,X_val_pad.shape)

In [None]:
# TEST
# Same encoding and padding as the training split, applied to the Task B test tweets.
X_test_seq = tokenizer.texts_to_sequences(X_test.values)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

print("test data tensor:" ,X_test_pad.shape)

In [None]:
epochs = 10

# Fit the model on the training split, validating on the held-out validation split.
# NOTE(review): no new model is built between the Task A cells and this one,
# so this call appears to continue training the Task A network rather than
# start from fresh weights — confirm this is intended.
history = model.fit(X_train_pad,y_train,validation_data=(X_val_pad,y_val), epochs=epochs)

In [None]:
scores = model.evaluate(X_test_pad, y_test, return_dict=True)

print(scores)

In [None]:
y_pred = model.predict(X_test_pad)

In [None]:
# Turn the predicted probabilities into hard 0/1 labels, in place:
# strictly above 0.5 becomes 1, everything else becomes 0.
y_pred[:] = np.where(y_pred > 0.5, 1, 0)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
plot_conf_matrix(y_test,y_pred)

### TASK C

In [None]:
# Our predicted TIN - Targeted Insult data
df_pred = pd.DataFrame(data=y_pred,columns=["predicted"])
# The predictions were produced from test_b['tweet'], so they must be paired
# with test_b row by row. Pairing them with test_tweet_b (the full Task B test
# set, a different set of rows) would attach predictions to the wrong tweets.
# reset_index guarantees both frames are aligned on 0..n-1.
predicted_data = pd.concat([test_b.reset_index(drop=True), df_pred],axis=1)
tin_data = predicted_data[predicted_data.predicted ==0]
print(tin_data.shape)
tin_data.head()

In [None]:
# prepare test data for task C
# Match the predicted-TIN tweets with the Task C test set on the cleaned tweet
# text, then keep only the text and the Task C label.
# NOTE(review): joining on text rather than an id will multiply rows if the
# same cleaned text occurs more than once on either side — verify.
test_c = tin_data.merge(test_tweet_c,on="tweet")
test_c = test_c[["tweet","class_c"]]
print(test_c.shape)
test_c.head()

In [None]:
# prepare train data for task C
# The Task B and Task C frames share the original row index, so the Task C
# label is joined onto the Task B rows directly. Concatenating the two whole
# frames duplicated the 'tweet' column and needed a double transpose to
# remove the duplicate again.
df_b_c = train_task_b_data[["tweet","class_b"]].join(train_task_c_data[["class_c"]])
df_b_c = df_b_c[df_b_c.class_b =="TIN"]
# Keep only rows that actually carry a Task C label.
df_b_c = df_b_c[["tweet","class_c"]].dropna(subset=["class_c"])
train_c = df_b_c.copy()
print("Data shape without over and undersampling:",train_c.shape)
train_c.head()


# Re-balance the classes: IND is capped at 1500 rows, GRP at 1000, and OTH is
# drawn 1000 times WITH replacement. random_state makes every draw
# reproducible; min() avoids an error if a class has fewer rows than the cap.
# NOTE(review): OTH is drawn with replacement before the train/validation
# split, so copies of the same tweet can land on both sides of that split.
train_c_ind = train_c[train_c["class_c"] == "IND"]
train_c_oth = train_c[train_c["class_c"] == "OTH"]
train_c_grp = train_c[train_c["class_c"] == "GRP"]
train_c_ind = train_c_ind.sample(min(1500, len(train_c_ind)), random_state=42)
train_c_oth = train_c_oth.sample(1000, replace=True, random_state=42)
train_c_grp = train_c_grp.sample(min(1000, len(train_c_grp)), random_state=42)
train_c_new = pd.concat([train_c_ind,train_c_oth,train_c_grp])
print("Data shape with undersampling",train_c_new.shape)
train_c_new.head()

In [None]:
# Plot the train data without over- and under-sampling the classes

count_classes(train_c,"class_c")

In [None]:
# Plot the train data with over- and under-sampling the classes

count_classes(train_c_new,"class_c")

In [None]:
# Plot test data 
count_classes(test_c,"class_c")

In [None]:
#creating labelEncoder
encoder = LabelEncoder()
# Converting string labels into numbers.
# The mapping is fitted once, on the full Task C training labels, and then
# applied unchanged to the re-sampled training set and to the test set. If the
# encoder were refitted on the test set and that set lacked one of the
# classes, the remaining labels would be given different codes there.
train_c["class_c_code"] = encoder.fit_transform(train_c["class_c"])
train_c_new["class_c_code"] = encoder.transform(train_c_new["class_c"])
test_c["class_c_code"] = encoder.transform(test_c["class_c"])

In [None]:
#Create tuple pair for class and class code
# Each row gets a (label, code) tuple in a new 'class-tuple' column; the
# distinct pairs show which integer stands for which Task C label.
train_c['class-tuple'] = train_c[['class_c', 'class_c_code']].apply(tuple, axis=1)
class_c = train_c['class-tuple'].unique()
class_c

In [None]:
# Set the labels and train-test split

# Task C training features/labels come from the re-sampled frame.
train_tweets = train_c_new['tweet']
train_labels = train_c_new['class_c_code']

# Task C test features/labels come from test_c.
X_test = test_c['tweet']
y_test = test_c['class_c_code']

# Hold out 30% of the training rows for validation; the seed fixes the split.
X_train,X_val,y_train,y_val = train_test_split(train_tweets,train_labels,test_size=0.30,random_state=42)

print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
# Label fixed: the values printed on this line are the validation split's shapes.
print("Shape of X_val is {} and shape of y_val is {}".format(X_val.shape, y_val.shape))

print("Shape of test_tweets is {} and shape of test_labels is {}".format(X_test.shape, y_test.shape))

In [None]:
# TRAIN
# Encodes with the tokenizer created earlier in the notebook; no cell shown
# here refits it on the Task C tweets, so its vocabulary is the one learned
# from the earlier training split.
X_train_seq = tokenizer.texts_to_sequences(X_train.values)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)

print("train data tensor:" ,X_train_pad.shape)

In [None]:
# VAL
# Encode the validation tweets with the existing tokenizer and bring them to
# length max_len so they match the training tensor width.
X_val_seq = tokenizer.texts_to_sequences(X_val.values)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

# Label fixed: this tensor is the validation split, not the test set.
print("validation data tensor:" ,X_val_pad.shape)

In [None]:
# TEST
# Same encoding and padding as the training split, applied to the Task C test tweets.
X_test_seq = tokenizer.texts_to_sequences(X_test.values)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

print("test data tensor:" ,X_test_pad.shape)

In [None]:
model= Sequential()
model.add(Embedding(max_features,embedding_dim,input_length=max_len))
model.add(Bidirectional(LSTM(128, dropout=0.3)))
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))

In [None]:
METRICS = [tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall")]


model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=METRICS)

In [None]:
epochs = 10

# Fit the model using the train and test datasets.
history = model.fit(X_train_pad,y_train,validation_data=(X_val_pad,y_val), epochs=epochs)

In [None]:
scores = model.evaluate(X_test_pad, y_test, return_dict=True)

print(scores)

In [None]:
y_pred = model.predict(X_test_pad)

In [None]:
y_pred = y_pred.astype("int64")

In [None]:
plot_conf_matrix(y_test,y_pred)

In [None]:
print(classification_report(y_test,y_pred))