In [None]:
!pip install transformers > /dev/null

In [None]:
import pandas as pd
import numpy as np
import os
import torch
import string
import re
import math
from collections import Counter
import matplotlib.pyplot as plt

import transformers
from transformers import BertTokenizer
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

from xgboost import XGBRegressor

from tqdm import tqdm
tqdm.pandas()

import spacy
nlp = spacy.load('en_core_web_lg')

In [None]:
class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector with a pretrained BERT model.

    :meth:`vectorize` returns the final-layer hidden state at the first token
    position of the encoded sentence (the ``[CLS]`` slot when the tokenizer
    adds special tokens, which is its default).
    """

    def __init__(self, model_name: str = 'bert-base-uncased', max_len: int = 128):
        """Load tokenizer and model, and move the model to the GPU if present.

        Args:
            model_name: Hugging Face model id (needs internet access) or a
                local directory such as '../input/bert-base-uncased', which
                works without an internet connection.
            max_len: Number of token ids fed to the model per sentence; longer
                inputs are truncated and shorter ones are padded. Expected to
                be at least 2.
        """
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # The model is only used for inference, so keep dropout switched off.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Encode `sentence` and return the hidden state of its first token.

        Args:
            sentence: Raw text to embed.

        Returns:
            1-D numpy array holding ``last_hidden_state[0][0]`` on the CPU.
        """
        token_ids = self.tokenizer.encode(sentence)

        if len(token_ids) > self.max_len:
            # Plain slicing would cut off the closing [SEP]; keep it so the
            # truncated sequence is still well-formed for the model.
            token_ids = token_ids[:self.max_len - 1] + [self.tokenizer.sep_token_id]

        n_real = len(token_ids)
        n_pad = self.max_len - n_real
        # 0 is used as the padding id (presumably [PAD] in the BERT vocab --
        # verify if a different vocabulary is loaded); padded positions are
        # masked out so they do not take part in attention.
        inputs = token_ids + [0] * n_pad
        masks = [1] * n_real + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long, device=self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long, device=self.device)

        # No gradients are needed for feature extraction; skipping autograd
        # avoids building a graph for every sentence.
        with torch.no_grad():
            bert_out = self.bert_model(input_ids=inputs_tensor,
                                       attention_mask=masks_tensor)

        seq_out = bert_out['last_hidden_state']
        # .cpu() is a no-op when the tensor already lives on the CPU, so one
        # code path serves both devices.
        return seq_out[0][0].detach().cpu().numpy()

In [None]:
# Read the COVID-19 tweets CSV and preview its first two rows.
data0 = pd.read_csv('../input/covid19-tweets/covid19_tweets.csv')
data0.head(2)

In [None]:
# Total number of rows in the raw dataset.
n = data0.shape[0]
print(n)

In [None]:
# Work on the first tenth of the rows only.
data1 = data0.iloc[:n // 10]

In [None]:
# Separate the regression target (user_favourites) from the tweet text; both
# are independent copies so later column additions do not touch data1.
label2 = data1.loc[:, ['user_favourites']].copy()
data2 = data1.loc[:, ['text']].copy()
data2.head(2)

In [None]:
def removeStopwords(text):
    """Return `text` with stop-word tokens removed.

    The text is tokenised with the module-level spaCy pipeline ``nlp``. Tokens
    whose ``is_stop`` flag is false are kept, in order, and joined with single
    spaces. The result has no leading padding; an input with no surviving
    tokens yields the empty string.
    """
    doc = nlp(text)
    # A single join avoids both the repeated string concatenation and the
    # stray leading spaces that a seeded accumulator would produce.
    return " ".join(str(token) for token in doc if not token.is_stop)

def removePunctuations(text):
    """Return `text` with every character from ``string.punctuation`` dropped."""
    punctuation_chars = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation_chars)

def removeLinks(text):
    """Return `text` with http(s):// URLs and www.-prefixed links removed.

    A link is matched up to the next whitespace character; surrounding
    whitespace is left untouched.
    """
    # Raw string: `\S` and `\.` are regex escapes, not Python string escapes.
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def removeNumbers(text):
    """Return `text` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [None]:
def clean(text):
    """Normalise a tweet for embedding.

    Steps, in order: lowercase, strip links, drop stop words, strip
    punctuation, strip digits.

    Links are removed first on purpose: once punctuation and digits are gone a
    URL such as ``https://t.co/ab1`` has already collapsed into
    ``httpstcoab`` and the link pattern can no longer match it.
    """
    text = text.lower()
    text = removeLinks(text)
    text = removeStopwords(text)
    text = removePunctuations(text)
    text = removeNumbers(text)
    return text

In [None]:
# Run the cleaning pipeline on every tweet and preview the first two rows.
data2['text_clean'] = data2['text'].map(clean)
data2.head(2)

In [None]:
# Build the vectorizer once: the constructor loads the tokenizer and the BERT
# weights, so a single instance is shared by every vectorize() call below.
BSV = BertSequenceVectorizer()

In [None]:
# Embed each cleaned tweet; progress_apply (registered by tqdm.pandas()) shows
# a progress bar while the model runs row by row.
data2['text_bert'] = data2['text_clean'].progress_apply(BSV.vectorize)

In [None]:
# Pull the per-tweet embedding vectors out of the frame into a plain list.
text_bert2 = list(data2['text_bert'])

In [None]:
# Number of embedded tweets; used below to size the train / hold-out split.
m=len(text_bert2)
print(m)

In [None]:
# Positional hold-out split: the first (m // 10) * 8 rows (about 80%) are used
# for training, everything after that is kept for testing.
train_end = (m // 10) * 8

X_train0, X_test0 = text_bert2[:train_end], text_bert2[train_end:]
y_train0, y_test0 = label2[:train_end], label2[train_end:]

In [None]:
# Sanity check: held-out features and labels must have the same length.
print(len(X_test0), len(y_test0), sep='\n')

In [None]:
# Stack the training embeddings into a 2-D array and keep only the first 20
# components of each vector as model features.
X = np.array(X_train0)[:, :20]
y = np.array(y_train0)
print(X.shape, y.shape, sep='\n')

In [None]:
# Same 20-component feature slice for the held-out embeddings.
held_out_matrix = np.array(X_test0)
X_test2 = held_out_matrix[:, :20]
print(X_test2.shape)

In [None]:
# Gradient-boosted regressor: many shallow trees with a small learning rate.
clf = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
)

In [None]:
# Repeated random hold-out validation: five independent 80/20 shuffles of the
# training rows, seeded so the splits are reproducible.
ss = ShuffleSplit(n_splits=5, train_size=0.8, test_size=0.2, random_state=0)

for train_index, test_index in ss.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = y[train_index], y[test_index]
    clf.fit(X_train, Y_train)
    # score() is the regressor's default metric on the validation part of the
    # split (R^2 for scikit-learn style regressors).
    print(clf.score(X_test, Y_test))

# The loop leaves `clf` fitted on the last split's 80% only. Refit on every
# training row so the model used for the final predictions does not depend on
# which fold happened to run last.
clf.fit(X, y)

In [None]:
# Predict on the held-out features; input and output shapes are printed as a
# quick consistency check.
print(X_test2.shape)
y_pred = clf.predict(X_test2)
print(y_pred.shape)

In [None]:
# Favourite counts are whole numbers, so truncate each prediction to an int.
y_pred2 = [int(value) for value in y_pred]
print(len(y_pred2))

In [None]:
# Put actual and predicted favourites side by side.
# Start from an explicit copy: wrapping y_test0 in pd.DataFrame(...) does not
# copy by default, so adding a column could leak into y_test0 (depending on
# the pandas version) and make this cell fail when it is run a second time.
y_test = y_test0.copy()
y_test['pred_favourites'] = y_pred2
y_test.columns = ['user_favourites', 'pred_favourites']
y_test[0:10]

In [None]:
# Actual vs. predicted favourites on a log1p scale.
fig, ax = plt.subplots()
# Dedicated names: reusing `x` / `y` here would overwrite the training labels
# `y` and break the modelling cells if they are re-run afterwards.
actual_favourites = y_test['user_favourites']
# The regressor is unconstrained and can predict below zero, where log1p is
# undefined (NaN / -inf); clip so every point stays on the plot.
predicted_favourites = y_test['pred_favourites'].clip(lower=0)
ax.scatter(
    np.log1p(actual_favourites),
    np.log1p(predicted_favourites),
    c='blue',
    s=20,
    alpha=0.3,
    edgecolors='none',
    label='held-out tweets',  # gives ax.legend() an entry to show
)
ax.set_xlabel('log1p_user_favourites')
ax.set_ylabel('log1p_pred_favourites')
ax.legend()
ax.grid(True)
plt.show()