In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training split and the test split, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv',
        '/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv',
    )
)

In [None]:
def punc(df):
    """Remove every '#' character from the ``tweet`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``tweet``. The column is
        overwritten in place.

    Returns
    -------
    None
        The frame is mutated and then printed.
    """
    # Literal replacement with an explicit ``regex=False``: the result no
    # longer depends on the pandas version's default for ``regex`` (with the
    # pandas >= 2.0 default, the old pattern '[#]' matched nothing).
    df['tweet'] = df['tweet'].str.replace('#', '', regex=False)
    print(df)
# Apply punc to both splits, train first (each call also prints the frame).
for frame in (train, test):
    punc(frame)

In [None]:
import nltk
from nltk.tokenize import TweetTokenizer
def tokenizer(df):
    """Replace each tweet string in ``df['tweet']`` with its list of tokens.

    Tokenisation uses NLTK's ``TweetTokenizer`` built with
    ``strip_handles=True``. The frame is modified in place and printed;
    nothing is returned.
    """
    tweet_splitter = TweetTokenizer(strip_handles=True)
    df['tweet'] = df['tweet'].apply(tweet_splitter.tokenize)
    print(df)
    
# Tokenise both splits, test first (each call also prints the frame).
for frame in (test, train):
    tokenizer(frame)

In [None]:
import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK; stop_words() filters tokens against it.
stop=stopwords.words("english")
def stop_words(df, stopword_list=None):
    """Lower-case every token in ``df['tweet']`` and drop stop words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``tweet`` column holds lists of string tokens. The
        column is overwritten in place.
    stopword_list : iterable of str, optional
        Words to remove. Defaults to the module-level ``stop`` list.

    Returns
    -------
    None
        The frame is mutated and then printed.
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    blocked = set(stop if stopword_list is None else stopword_list)
    # Compare the lower-cased token: checking the original casing let
    # capitalised stop words such as "The" slip through the filter.
    df['tweet'] = df['tweet'].apply(
        lambda tokens: [tok.lower() for tok in tokens if tok.lower() not in blocked]
    )
    print(df)
# Remove stop words from both splits, train first (each call prints the frame).
for frame in (train, test):
    stop_words(frame)

In [None]:
import re
def clean(df):
    """Drop symbol-led and short tokens from ``df['tweet']``.

    A token is kept only if its first character is a word character or
    whitespace and its length is greater than 3.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``tweet`` column holds lists of string tokens. The
        column is overwritten in place.

    Returns
    -------
    None
        The frame is mutated and then printed.
    """
    # Raw string so that \w and \s reach the regex engine without relying on
    # Python's handling of invalid string escapes.
    df['tweet'] = df['tweet'].apply(
        lambda tokens: [
            tok for tok in tokens
            if not re.match(r'[^\w\s]', tok) and len(tok) > 3
        ]
    )
    print(df)
# Filter tokens in both splits, train first (each call prints the frame).
for frame in (train, test):
    clean(frame)

In [None]:
from nltk.stem import PorterStemmer
from textblob import Word
st = PorterStemmer()


def stemnlemm(df):
    """Stem then lemmatize every token and join each tweet back into a string.

    Each token is passed through the module-level Porter stemmer ``st``,
    wrapped in a textblob ``Word`` and lemmatized; the results are joined with
    single spaces. ``df['tweet']`` is overwritten in place and the frame is
    printed; nothing is returned.
    """
    def _normalise(tokens):
        return ' '.join(Word(st.stem(tok)).lemmatize() for tok in tokens)

    df['tweet'] = df['tweet'].apply(_normalise)
    print(df)
# Stem/lemmatize both splits, train first (each call prints the frame).
for frame in (train, test):
    stemnlemm(frame)

In [None]:
# Single-column frames: the processed tweet text (features) and the label (target).
X_train=pd.DataFrame(train['tweet'])
Y_train=pd.DataFrame(train['label'])

In [None]:
from sklearn.model_selection import train_test_split
# Split the labelled data into train and hold-out parts, stratified on the
# label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    Y_train,
    stratify=Y_train,
    random_state=0,
)

In [None]:
# Rebind x_train from the single-column frame to its 'tweet' column.
# NOTE(review): x_train changes type here, so re-running this cell on its own
# is expected to fail — confirm before re-executing out of order.
x_train=x_train['tweet']
x_train

In [None]:
# Rebind y_train from the single-column frame to its 'label' column.
# NOTE(review): y_train changes type here, so re-running this cell on its own
# is expected to fail — confirm before re-executing out of order.
y_train=y_train['label']
y_train


In [None]:
import transformers
from tokenizers import BertWordPieceTokenizer
# First load the reference DistilBERT tokenizer. `do_lower_case` is the
# documented keyword for lower-casing input text.
# NOTE(review): this rebinds the name `tokenizer`, shadowing the
# `tokenizer(df)` helper defined in an earlier cell.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
# Save the loaded tokenizer files into the current directory
tokenizer.save_pretrained('.')
# Reload the saved vocabulary with the huggingface `tokenizers` library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=True)
fast_tokenizer

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=400):
    """Encode ``texts`` into a 2-D array of token ids, ``maxlen`` ids per text.

    Parameters
    ----------
    texts : list or numpy.ndarray of str
        Raw strings to encode; sliced in chunks of ``chunk_size``.
    tokenizer : object
        Tokenizer exposing ``enable_truncation``, ``enable_padding`` and
        ``encode_batch`` (e.g. a ``tokenizers.BertWordPieceTokenizer``).
        Truncation and padding are enabled on it as a side effect.
    chunk_size : int, default 256
        Number of texts passed to each ``encode_batch`` call.
    maxlen : int, default 400
        Length every encoded sequence is truncated and padded to.

    Returns
    -------
    numpy.ndarray
        One row of token ids per input text.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    try:
        # Current `tokenizers` releases name the fixed padding length `length`.
        tokenizer.enable_padding(length=maxlen)
    except TypeError:
        # Older releases used `max_length` for the same setting.
        tokenizer.enable_padding(max_length=maxlen)

    all_ids = []
    for start in range(0, len(texts), chunk_size):
        chunk = texts[start:start + chunk_size]
        # NumPy arrays expose tolist(); plain Python sequences are copied as-is.
        batch = chunk.tolist() if hasattr(chunk, 'tolist') else list(chunk)
        all_ids.extend(enc.ids for enc in tokenizer.encode_batch(batch))

    return np.array(all_ids)

In [None]:
# Rebind x_test from the single-column frame to its 'tweet' column.
# NOTE(review): x_test changes type here, so re-running this cell on its own
# is expected to fail — confirm before re-executing out of order.
x_test=x_test['tweet']
x_test

In [None]:
# Rebind y_test from the single-column frame to its 'label' column.
# NOTE(review): y_test changes type here, so re-running this cell on its own
# is expected to fail — confirm before re-executing out of order.
y_test=y_test['label']
y_test

In [None]:
# Turn the tweet strings of both splits into fixed-length token-id arrays.
# x_train / x_test are rebound from text to encoded ids here.
x_train = fast_encode(texts=x_train.values, tokenizer=fast_tokenizer, maxlen=400)
x_test = fast_encode(texts=x_test.values, tokenizer=fast_tokenizer, maxlen=400)

In [None]:
import tensorflow as tf
from keras.layers import LSTM,Dense,Bidirectional,Input
from keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

def build_model(transformer, max_len=400, learning_rate=2e-5):
    """Build and compile a binary classifier on top of ``transformer``.

    Parameters
    ----------
    transformer : callable
        Model called on the int32 token-id input; element ``[0]`` of its
        output is used (presumably the per-token hidden states of shape
        (batch, max_len, hidden) — confirm against the transformer used).
    max_len : int, default 400
        Length of the token-id sequences the model accepts.
    learning_rate : float, default 2e-5
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    Model
        Keras model compiled with binary cross-entropy loss and an accuracy
        metric, producing one sigmoid output per input sequence.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use only the vector at sequence position 0 as the classifier input.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and rejected by newer Keras releases.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Load the pretrained 'distilbert-base-uncased' TensorFlow model used as the encoder.
bert_model = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
# Wrap the pretrained encoder in the classifier and show its layer summary.
model = build_model(transformer=bert_model, max_len=400)
model.summary()

In [None]:
# Display the encoded training token ids produced by fast_encode.
x_train

In [None]:
# Train for 3 epochs in batches of 32, using the held-out split as validation data.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=32,
    epochs=3,
)


In [None]:
# evaluate() returns a list; index 1 is taken and scaled by 100 (presumably the
# 'accuracy' metric the model was compiled with, as a percentage — confirm).
model.evaluate(x_test,y_test)[1]*100

In [None]:
# Model outputs for the held-out split (one sigmoid value per tweet).
pred=model.predict(x_test)


In [None]:
# Round the model outputs to the nearest integer to obtain 0/1 class labels.
pred = np.round(pred).astype(int)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of true labels versus rounded predictions on the held-out split.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
cm

In [None]:
# Fraction of held-out tweets whose rounded prediction matches the label.
score = accuracy_score(y_true=y_test, y_pred=pred)
print(score)