In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import spacy
# Download the large English spaCy model through the shell; this runs every time the cell is executed.
!python -m spacy download en_core_web_lg
# Load the pipeline; `nlp` is used by the spaCy-based `to_wordvec` defined further down.
nlp = spacy.load('en_core_web_lg')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the HASOC 2019 English dataset (tab-separated file).
raw = pd.read_csv("/kaggle/input/hasoc-2019/english_dataset.tsv", sep='\t')

# Keep the text and the three task label columns.
# The columns are selected with a list, not a set: a set has no defined order
# (so the column order could change between runs) and recent pandas versions
# reject sets as indexers altogether.
# .copy() makes `data` an independent frame instead of a view-like slice of `raw`.
data = raw[["text", "task_1", "task_2", "task_3"]].copy()
data.head()



In [None]:
!python -m spacy info
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
tr = SentenceTransformer('paraphrase-distilroberta-base-v1')
tr.encode("hello there").shape

In [None]:
# convert the strings to word vecs

# old def (avg of word embeddings)
def to_wordvec(string):
    """Embed `string` with the spaCy pipeline `nlp` and return its vector.

    The document vector is wrapped in an outer array, so the result has a
    leading axis of length one.

    Note: this definition is replaced by the sentence-transformer version of
    `to_wordvec` that is defined right after it in the same cell.
    """
    with nlp.disable_pipes():
        doc = nlp(string)
        return np.array([doc.vector])

# new sentence embeddings
def to_wordvec(string):
    """Encode `string` with the sentence transformer `tr`.

    Returns a one-element list holding the embedding, which lets the result be
    passed to a scikit-learn `predict` call as a single-sample batch.
    """
    return [tr.encode(string)]

# Embed all dataset texts in one batched call to the sentence transformer.
vecsSeries = tr.encode(data["text"].values)
# Write the dataframe to the working directory; the embeddings held in
# `vecsSeries` are not part of this file.
data.to_csv("text_hate_vecs.csv")
data.head()

In [None]:
# Build the numpy inputs for the classifiers.
# Features: the embeddings returned by `tr.encode` for the dataset texts.
X = vecsSeries
# Targets: the task_1 label of every row.
y = data["task_1"].values
X.shape


In [None]:
# Split into training and test sets; 10% of the samples are held out for testing.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.1,
)

X_train.shape
#X_train



In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression baseline on the training embeddings
# (`fit` returns the estimator itself, so creation and fitting are chained).
model = LogisticRegression(max_iter=1000, random_state=1).fit(X_train, y_train)

# Report the accuracy on the held-out test split.
print(f'Model test accuracy: {model.score(X_test, y_test)*100:.3f}%')

def decode_class(Y):
    """Return the prediction `Y` unchanged.

    The logistic-regression model already predicts the original labels, so no
    decoding is needed; the function exists so the prediction cells can call
    `decode_class` regardless of which model is active.
    """
    return Y

In [None]:
# Classify a handful of hand-written sentences with the current model.
sample_sentences = [
    "This is the end",
    "I love Donald Trump",
    "The Playstations are blocking the Suez Canal",
    "I am scared of this",
    "Yo listen up, here's a story",
    "I cannot believe this",
    "DJ Khaled: suffering from success",
    "Cut my life into pieces, this is my last resort",
]
for sentence in sample_sentences:
    print(decode_class(model.predict(to_wordvec(sentence))))

In [None]:
# 10-fold cross-validation of the logistic-regression baseline.
print("Doing k-fold...")
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled folds with a fixed seed so the reported score is reproducible.
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# Use the same hyperparameters as the baseline trained above, so the
# cross-validated score describes that model (the default max_iter is lower
# than the 1000 iterations the baseline is given).
# A separate name is used on purpose: rebinding `model` here would replace the
# fitted baseline with an unfitted estimator and break a re-run of the
# prediction cell.
cv_model = LogisticRegression(random_state=1, max_iter=1000)

# cross_val_score clones and fits the estimator once per fold.
scores = cross_val_score(cv_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Mean accuracy over the folds, with the standard deviation in parentheses.
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

In [None]:
#encode y as one hot for NN
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Learn the label <-> integer mapping from the training labels
# (`fit` returns the encoder itself).
encoder = LabelEncoder().fit(y_train)

def encode_class(Y):
    """One-hot encode the labels in `Y` using the fitted `encoder`.

    Args:
        Y: array-like of labels known to `encoder`.

    Returns:
        Array with one row per label and one column per class known to
        `encoder`.
    """
    encoded_Y = encoder.transform(Y)
    # Pin the width to the number of classes the encoder knows. Without
    # num_classes, to_categorical derives the width from the largest label
    # present in `Y`, so a batch that lacks the highest class would come out
    # narrower than the network's output layer.
    dummy_y = np_utils.to_categorical(encoded_Y, num_classes=len(encoder.classes_))
    return dummy_y

def decode_class(Y):
    """Map one-hot rows or class-score rows back to the original labels.

    Args:
        Y: either a single score vector of shape (n_classes,) or a batch of
            shape (n_samples, n_classes), e.g. the output of `model.predict`.

    Returns:
        Array of decoded labels, one per sample. A single vector (or a batch
        of one) yields a one-element array, as before.
    """
    # argmax along the class axis, so every row of a batch gets its own class
    # index; a flat argmax over a whole batch would return one index into the
    # flattened array. reshape(-1) turns the scalar obtained for a single
    # vector into a length-1 array for inverse_transform.
    class_ids = np.argmax(Y, axis=-1).reshape(-1)
    return encoder.inverse_transform(class_ids)

# One-hot targets for the neural network, for both splits.
y_train_enc, y_test_enc = encode_class(y_train), encode_class(y_test)

# Round-trip the first training target through the decoder (result is not displayed).
decode_class(y_train_enc[0])
# Shape of a single test sample.
X_test[0].shape


In [None]:
#Lets now try with a NN
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Stop training once the monitored loss stops improving.
early_stopping = callbacks.EarlyStopping(
    patience=20,  # number of epochs without improvement tolerated before stopping
    min_delta=0.001,  # minimum amount of change that counts as an improvement
    restore_best_weights=True,  # roll back to the weights of the best epoch
)

# Fully connected classifier on top of the sentence embeddings: three hidden
# blocks (Dense -> Dropout -> BatchNormalization) and a two-unit output layer.
model = keras.Sequential([
    # The input width follows the embedding size of X_train instead of a
    # hard-coded 768, so a different embedding model needs no change here.
    layers.Dense(1024, activation='relu', input_shape=[X_train.shape[1]]),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    # softmax, not sigmoid: the targets are one-hot vectors over mutually
    # exclusive classes and categorical_crossentropy expects the outputs to
    # form a probability distribution that sums to one.
    layers.Dense(2, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the network with early stopping.
# The validation data is carved out of the training set (last 10% of the
# samples) instead of reusing the test set: early stopping with
# restore_best_weights selects the model on the validation loss, so validating
# on the test set would leak it into model selection and make the final test
# score optimistic.
history = model.fit(
    X_train, y_train_enc,
    validation_split=0.1,
    batch_size=256,
    epochs=100,
    callbacks=[early_stopping],
    verbose=1,
)


# Plot the training and validation loss recorded for every epoch.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot();
# Loss and accuracy of the trained network on the test split.
model.evaluate(X_test, y_test_enc, verbose=1)

In [None]:
#for layer in model.layers:
#    print(layer.input_shape)
# new sentence embeddings
def to_wordvec(string):
    """Encode text with the sentence transformer `tr` as a 2-D batch.

    Args:
        string: a single sentence; a list of sentences also works.

    Returns:
        Array of shape (n_sentences, embedding_size); a single sentence gives
        one row, which is the batch shape `model.predict` expects.
    """
    vectors = tr.encode(string)
    # Keep the embedding axis and fold everything else into rows. The width is
    # taken from the embedding itself rather than hard-coded to 768, so other
    # embedding models work unchanged.
    return vectors.reshape(-1, vectors.shape[-1])


# Classify the same hand-written sentences with the neural network.
sample_sentences = [
    "This is the end",
    "I love Donald Trump",
    "The Playstations are blocking the Suez Canal",
    "I am scared of this",
    "Yo listen up, here's a story",
    "I cannot believe this",
    "DJ Khaled: suffering from success",
    "Cut my life into pieces, this is my last resort",
]
for sentence in sample_sentences:
    print(decode_class(model.predict(to_wordvec(sentence))))

In [None]:
# Probe the network with sentences that differ only in a few words.
probe_sentences = (
    "Hate is a solution",
    "You should go fuck yourself",
    "Trump",
    "Timmy Trumpet",
    "Timmy Trumpet should join them in hell, such a traitor",
    "Donald Trump should join them in hell, such a traitor",
)
for probe in probe_sentences:
    print(decode_class(model.predict(to_wordvec(probe))))

In [None]:
# let us see how the model guessed wrong
class bcolors:
    """ANSI escape sequences for colouring and styling printed text.

    Prefix a string with one of the codes and append `ENDC` to reset the
    terminal to its default style afterwards.
    """

    # Reset to the default style.
    ENDC = '\033[0m'

    # Text styles.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours.
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'


# Print the model's guess next to the true label for every row labelled "HOF"
# among rows 200-299.
# The loop uses its own variable names: reusing `X` and `y` here would
# overwrite the feature matrix and the target array with a single string and a
# single label, breaking any re-run of the split / cross-validation cells.
for i in range(200, 300):
    true_label = data.task_1[i]

    # Only "HOF" rows are shown; skip the others before paying for a prediction.
    if true_label != "HOF":
        continue

    text = data.text[i]
    guess = decode_class(model.predict(to_wordvec(text)))[0]

    # Correct and incorrect guesses are both printed. To list only the
    # mistakes, skip the row here when guess == true_label.
    print("Sentence is: "+bcolors.HEADER+text+bcolors.ENDC)
    print("Model guess: "+bcolors.OKBLUE+guess+bcolors.ENDC)
    print("Answer is: "+bcolors.OKBLUE+true_label)
    print(bcolors.ENDC+"\n")