# Installing and Importing the libraries
We stay on Spark 2.4.x (the install cell below pins `pyspark==2.4.6`) because Spark NLP has not been updated for Spark 3.0.0 yet. Spark 2.4 does not support Java 11, so we also need to downgrade to Java 8; otherwise you will get the following exception: `IllegalArgumentException: 'Unsupported class file major version 55'`.

Java 8 is also required by the `pycontractions` package!

In [None]:
! java -version

In [None]:
! apt remove -y openjdk-11-jre-headless
! apt update
! apt install -y openjdk-8-jdk openjdk-8-jre

In [None]:
! java -version

In [None]:
# you can even quietly install using pip install --quiet package
! pip install pyspark==2.4.6
! pip install spark-nlp==2.5.2
! pip install pycontractions

In [None]:
import numpy as np 
import pandas as pd 

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import re

from pycontractions import Contractions
import gensim.downloader as api

import gc

In [None]:
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as sqlF
from pyspark import SparkContext, SparkConf


import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.embeddings import *

In [None]:
# Initialize the Spark session with the Spark NLP jars; gpu=True is passed to sparknlp.start.
spark = sparknlp.start(gpu=True)

# SQLContext built on the same SparkContext/SparkSession. In the visible part of
# this notebook it is only referenced by the commented-out word-cloud cell.
sqlContext = SQLContext(sparkContext=spark.sparkContext, 
                        sparkSession=spark)

The data needs to be read with pandas because Spark was having some problems with NaNs and I couldn't figure out the actual cause. As a workaround, for now we load the data with pandas, drop the `location` and `keyword` columns, and then convert the result to a Spark DataFrame.

# Data loading and some plots

In [None]:
train = pd.read_csv("../input/real-or-not-data-cleaned/train.csv")

In [None]:
train_spark = spark.createDataFrame(train[["id","text","target","clean_text"]])

In [None]:
# Number of rows per target class (non-null ids counted within each group).
data = train.groupby('target')['id'].count()

# Draw the bars, then label the axes through the Axes object seaborn returns.
ax = sns.barplot(x=data.index, y=data)
ax.set_ylabel("Frequency")
ax.set_xlabel("Target")
plt.show()

Code to clean the training data (the same applies to the test data). The cleaning was done in another notebook to save RAM.

In [None]:
# model = api.load("glove-twitter-25")

# cont = Contractions(kv_model=model)
# cont.load_models()

In [None]:
# subtext_to_clearning = ['\!', '\$', '\(', '\)', '\*', 
#                         '\+', '\-', '\.', '\:', '\;', 
#                         '\=', '\?', '\@', '\[', '\]', 
#                         '\^', '\|', '\_', '\{', '\}']

# def clean(tweet):
#     tweet = re.sub("@[\w]*", "", tweet)    

# #     tweet = tweet.replace("@","")

#     #Remove the double or more punctuations
#     for punc in subtext_to_clearning:
#         tweet = re.sub(f"[{punc}]+",punc.replace("\\",""),tweet)
#     tweet = tweet.replace("#","").strip()
    
#     # Remove the whispaces
#     tweet = " ".join(tweet.split())
    
#     #Expand contractions
#     tweet = list(cont.expand_texts([tweet],
#                                    precise=True))[0]
    
#     if tweet == "":
#         return "."
#     return tweet.lower()

# cleaner_udf = sqlF.udf(clean, StringType())

# train_spark = train_spark.withColumn("clean_text", cleaner_udf("text"))

In [None]:
# gc.collect()

In [None]:
# import wordcloud

# words = train_spark.rdd.flatMap(lambda x: re.split("\s+",x[3]))\
#                   .map(lambda word: (word, 1))\
#                   .reduceByKey(lambda a, b: a + b)

# schema = StructType([StructField("words", StringType(), True),
#                  StructField("count", IntegerType(), True)])

# words_df = sqlContext.createDataFrame(words, schema=schema)

In [None]:
# word_cloud = words_df.orderBy(sqlF.desc("count"))\
#                      .limit(200)\
#                      .toPandas()\
#                      .set_index('words')\
#                      .T\
#                      .to_dict('records')


# wc = wordcloud.WordCloud(background_color="white", max_words=200)
# wc.generate_from_frequencies(dict(*word_cloud))

# plt.figure(figsize=(15,10))
# plt.imshow(wc, interpolation='bilinear')
# plt.show()

# USE Embeddings in Apache Spark

In [None]:
# Stage 1: wrap the "clean_text" column into the "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("clean_text")
    .setOutputCol("document")
)

# Stage 2: download the pretrained Universal Sentence Encoder (USE) embeddings.
encoder = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("embeddings")
)

# Stage 3: classifier trained on the embeddings against the "target" label
# (30 epochs, batch size 32).
clf = (
    ClassifierDLApproach()
    .setInputCols(["embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("target")
    .setMaxEpochs(30)
    .setBatchSize(32)
)

# Chain the three stages above into a single pipeline.
pipeline = Pipeline(stages=[document_assembler, encoder, clf])

In [None]:
# 70/30 train/validation split with a fixed seed.
training, valid = (
    train_spark
    .select("clean_text", "target")
    .randomSplit([0.7, 0.3], seed=41)
)

model = pipeline.fit(training)

# Score the validation rows and bring labels plus predictions back to pandas.
pred = model.transform(valid)
pandpred = pred.select("target", "prediction.result").toPandas()
# Keep only the first element of each row's "result" value.
pandpred["result"] = pandpred["result"].apply(lambda row_result: row_result[0])

from sklearn.metrics import classification_report
print(classification_report(pandpred["target"], pandpred["result"].astype(int)))

In [None]:
# # Retrain with all the training data
# model = pipeline.fit(train_spark)

Test set loading and final predictions!

In [None]:
# Read the pre-cleaned test set and keep only the three columns selected here.
test_p = pd.read_csv("../input/real-or-not-data-cleaned/test.csv")[["id", "text", "clean_text"]]

# Convert the pandas frame into a Spark DataFrame.
test = spark.createDataFrame(test_p)

# test = test.withColumn("clean_text", cleaner_udf("text"))

# test = test.select("id","text","clean_text")

In [None]:
# Run the fitted pipeline on the test set and collect ids and predictions in pandas.
final = model.transform(test).select("id", "prediction.result").toPandas()

# Keep the first element of each "result" value and store both columns as integers.
final["result"] = final["result"].apply(lambda row_result: row_result[0]).astype(int)
final["id"] = final["id"].astype(int)

In [None]:
# Submission frame: columns "id" and "target", written without the index.
sub = final[["id", "result"]].rename(columns={"result": "target"})
sub.to_csv("submission_use.csv", index=False)

In [None]:
#close the spark session when done
spark.stop()

# BERT Embeddings

In [None]:
# Wipe the notebook namespace before the BERT section: every global whose name
# does not start with an underscore is deleted, imported modules included.
for name in dir():
    if not name.startswith('_'):
        del globals()[name]
        
# `gc` was removed by the loop above, so it is imported again before collecting.
import gc
gc.collect()

In [None]:
import numpy as np 
import pandas as pd 
import re
from pycontractions import Contractions
import gensim.downloader as api

train = pd.read_csv("../input/real-or-not-data-cleaned/train.csv")
test = pd.read_csv("../input/real-or-not-data-cleaned/test.csv")

In [None]:
# model = api.load("glove-twitter-25")

# cont = Contractions(kv_model=model)
# cont.load_models()

In [None]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
from transformers import BertTokenizer

# Load the pretrained 'bert-base-uncased' tokenizer with lower-casing enabled.
# NOTE(review): the classifier further down is loaded from 'bert-large-uncased';
# presumably both checkpoints share the same vocabulary — confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# subtext_to_clearning = ['\!', '\$', '\(', '\)', '\*', 
#                         '\+', '\-', '\.', '\:', '\;', 
#                         '\=', '\?', '\@', '\[', '\]', 
#                         '\^', '\|', '\_', '\{', '\}']

# def clean(tweet):
#     # Remove the mentions (aka tags)
#     tweet = re.sub("@[\w]*", "", tweet)    
# #     tweet = tweet.replace("@","")

#     # Remove more punctuations !!! -> !
#     for punc in subtext_to_clearning:
#         tweet = re.sub(f"[{punc}]+",punc.replace("\\",""),tweet)
#     tweet = tweet.replace("#","").strip()
    
#     # Remove the whispaces
#     tweet = " ".join(tweet.split())
    
#     tweet = list(cont.expand_texts([tweet], precise=True))[0]
    
#     if tweet == "":
#         return "."
#     return tweet.lower()

# train["clean_text"] = train["text"].apply(clean) 
# test["clean_text"] = test["text"].apply(clean)

In [None]:
# del [model, cont]
# gc.collect()

In [None]:
def tokenize_map(sentence, labs='None'):
    """Tokenize all of the sentences and map the tokens to their word IDs.

    Uses the module-level ``tokenizer``. Every text is encoded with special
    tokens added and padded/truncated to ``max_length=64``.

    Parameters
    ----------
    sentence : iterable of str
        The texts to encode.
    labs : array-like, optional
        Labels belonging to ``sentence``. Leave it at the default (the string
        ``'None'``; a real ``None`` is accepted as well) for unlabeled data
        such as the test set.

    Returns
    -------
    tuple
        ``(input_ids, attention_masks)`` for unlabeled data, or
        ``(input_ids, attention_masks, labels)`` when ``labs`` is given.
        The first two are the per-sentence tensors concatenated along
        dimension 0; ``labels`` is ``torch.tensor(labs)``.
    """
    input_ids = []
    attention_masks = []

    for text in sentence:

        encoded_dict = tokenizer.encode_plus(
                            text,                        # Sentence to encode.
                            add_special_tokens = True,   # Add '[CLS]' and '[SEP]'
                            truncation='longest_first',  # Activate and control truncation
                            max_length = 64,             # Max length according to our text data.
                            pad_to_max_length = True,    # Pad & truncate all sentences.
                            return_attention_mask = True,# Construct attn. masks.
                            return_tensors = 'pt',       # Return pytorch tensors.
                       )

        # Add the encoded sentence to the id list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Decide "no labels" with identity/type checks. Comparing an array with the
    # string 'None' via != is an element-wise operation, not a yes/no answer.
    no_labels = labs is None or (isinstance(labs, str) and labs == 'None')
    if no_labels:
        return input_ids, attention_masks

    # Convert the labels that were actually passed in; the function no longer
    # reads or rebinds a global named `labels`.
    return input_ids, attention_masks, torch.tensor(labs)

In [None]:
# Cleaned training texts and their targets as arrays taken from the pandas frame.
train_sentences = train.clean_text.values
labels = train.target.values

# Cleaned test texts (no labels available).
test_sentences = test.clean_text.values

# Encode both sets. `labels` is rebound here to the third value returned by
# tokenize_map; the call without labels returns only ids and attention masks.
input_ids, attention_masks, labels = tokenize_map(train_sentences, labels)
test_input_ids, test_attention_masks= tokenize_map(test_sentences)

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle ids, attention masks and labels so they are split and batched together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 99% of the samples go to training; the remainder is held out for validation.
train_size = int(0.99 * len(dataset))
val_size = len(dataset) - train_size

# Seed PyTorch's global RNG so the random split (and everything drawn from the
# same RNG afterwards) is repeatable across runs. 41 is the seed the Spark
# randomSplit earlier in this notebook uses.
torch.manual_seed(41)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Training loader: batches of 32 drawn through a RandomSampler.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    sampler=RandomSampler(train_dataset),
)

# Validation loader: batches of 32 read through a SequentialSampler.
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=32,
    sampler=SequentialSampler(val_dataset),
)

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load the pretrained 'bert-large-uncased' checkpoint for sequence classification
# with two labels; returning attentions and hidden states is switched off.
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',
    num_labels = 2, 
    output_attentions = False, 
    output_hidden_states = False
)

# Move the model to the device chosen earlier (GPU when available, otherwise CPU).
model.to(device)

In [None]:
gc.collect()

In [None]:
from transformers import get_linear_schedule_with_warmup

EPOCHS = 3

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=6e-6, eps=1e-8)

# Total number of training batches over all epochs.
total_steps = EPOCHS * len(train_dataloader)

# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
import time 
import datetime

def format_time(elapsed):
    '''
    Convert a duration given in seconds into its hh:mm:ss string form.
    '''
    # timedelta does the formatting; it is fed whole seconds only.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max over axis 1 equals the label."""
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

# Fine-tuning loop: one training pass and one validation pass per epoch.
for epoch_i in range(0, EPOCHS):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))

    # ---------------- Training ----------------
    t0 = time.time()
    model.train()
    num_train_batches = len(train_dataloader)
    for step, batch in enumerate(train_dataloader):

        # Progress report every 20 batches and on the last batch of the epoch.
        if (step + 1) % 20 == 0 or (step + 1) == num_train_batches:
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step+1, num_train_batches, elapsed))

        # Batch layout follows the TensorDataset: ids, attention mask, labels.
        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        # Index the output instead of tuple-unpacking it: indexing works both for
        # plain tuples and for the output objects newer transformers releases
        # return. With labels supplied, element 0 is the loss.
        loss = outputs[0]

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    # ---------------- Validation ----------------
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        # Same device/dtype handling as in the training pass.
        b_input_ids = batch[0].to(device).to(torch.int64)
        b_input_mask = batch[1].to(device).to(torch.int64)
        b_labels = batch[2].to(device).to(torch.int64)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        # With labels supplied: element 0 is the loss, element 1 the logits.
        loss, logits = outputs[0], outputs[1]

        total_eval_loss += loss.item()

        # Move logits and labels to CPU:
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run (mean over batches).
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('  Validation Accuracy: {0:.3f}'.format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print('  Validation Loss: {0:.3f}'.format(avg_val_loss))

In [None]:
# Test-set tensors only: ids and attention masks, no labels.
prediction_data = TensorDataset(test_input_ids, test_attention_masks)

# Batches of 64 read through a SequentialSampler.
prediction_dataloader = DataLoader(
    prediction_data,
    batch_size=64,
    sampler=SequentialSampler(prediction_data),
)

In [None]:
# Switch the model to evaluation mode before predicting.
model.eval()

# One array of logits per batch is collected here.
predictions = []

for batch in prediction_dataloader:

    # Each batch carries exactly two tensors: ids and attention mask.
    b_input_ids, b_input_mask = (tensor.to(device) for tensor in batch)

    # Forward pass only; gradients are not tracked.
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # The first output element holds the logits; store them as a NumPy array.
    logits = outputs[0].detach().cpu().numpy()
    predictions.append(logits)

In [None]:
# Stack the per-batch logit arrays along the row axis, then take the arg-max
# over axis 1 as the predicted class of every row.
flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

# Fail with a clear message if the prediction count does not match the template.
# NOTE(review): this assumes the template rows are in the same order as the
# cleaned test.csv the predictions were computed from — confirm.
if len(flat_predictions) != len(submission):
    raise ValueError(
        "Got %d predictions for %d submission rows."
        % (len(flat_predictions), len(submission))
    )
submission['target'] = flat_predictions

In [None]:
submission.to_csv('submission_bert.csv', index=False, header=True)