In [None]:
!pip install transformers > /dev/null

import pandas as pd
import numpy as np
import os
import torch
import random
import matplotlib.pyplot as plt

import transformers
from transformers import BertTokenizer
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from xgboost import XGBRegressor

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Load the review CSV into a DataFrame and show it as the cell output.
csv_path = '../input/reviewuniversalstudio/universal_studio_branches.csv'
data0 = pd.read_csv(csv_path)
data0

In [None]:
# Count how many reviews carry each distinct rating value.
data0['rating'].value_counts()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data0.info()

In [None]:
class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector using a pretrained BERT model.

    The returned vector is the final-layer hidden state at the first token
    position (the special token the tokenizer prepends to every sequence).

    Args:
        model_name: Model id or local directory handed to ``from_pretrained``.
            On a machine without internet access, point this at a local copy
            such as ``'../input/bert-base-uncased'``.
        max_len: Number of token ids fed to the model per sentence. Shorter
            inputs are padded, longer ones are truncated to this length.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, which would leave no
            room for both the leading and the closing special token.
    """

    # Id used to fill the padded tail. Padded positions are also switched off
    # through the attention mask, so the model does not attend to them.
    PAD_TOKEN_ID = 0

    def __init__(self, model_name: str = 'bert-base-uncased', max_len: int = 128):
        if max_len < 2:
            raise ValueError("max_len must be at least 2")

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Inference only: make sure dropout is disabled so the same sentence
        # always maps to the same vector.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence : str) -> np.ndarray:
        """Return the first-token hidden state for ``sentence``.

        Args:
            sentence: Raw text to embed.

        Returns:
            1-D NumPy array holding the last hidden state at position 0.
        """
        token_ids = self.tokenizer.encode(sentence)

        if len(token_ids) > self.max_len:
            # A plain ``[:max_len]`` slice would cut off the closing special
            # token that ``encode`` appends; keep it as the final id so long
            # inputs have the same layout as short ones.
            token_ids = token_ids[:self.max_len - 1] + token_ids[-1:]

        n_real = len(token_ids)
        n_pad = self.max_len - n_real
        inputs = token_ids + [self.PAD_TOKEN_ID] * n_pad
        masks = [1] * n_real + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # No gradients are needed for feature extraction; skipping autograd
        # avoids building a graph for every single review.
        with torch.no_grad():
            bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out = bert_out['last_hidden_state']

        # ``.cpu()`` is a no-op for tensors already on the CPU, so a single
        # return path covers both devices.
        return seq_out[0][0].detach().cpu().numpy()

In [None]:
# Number of rows in the review table.
n = len(data0)
print(n)

# Shuffled row positions 0..n-1. A dedicated, seeded generator makes the
# resulting order identical on every Restart & Run All without touching the
# global `random` state.
SPLIT_SEED = 0
N = list(range(n))
random.Random(SPLIT_SEED).shuffle(N)

In [None]:
# Build the vectorizer once, then embed every review with a progress bar.
BSV = BertSequenceVectorizer()
data0['text_bert'] = data0['review_text'].progress_apply(BSV.vectorize)

# Show the first two rows to confirm the new column was added.
data0.head(2)

In [None]:
# Collect the per-review vectors into a plain list, then stack them into a
# single array with one row per review.
review = [vec for vec in data0['text_bert']]
review2 = np.array(review)

# Ratings as a NumPy array, aligned row-for-row with review2.
datay = np.array(data0['rating'])

for arr in (review2, datay):
    print(arr.shape)

In [None]:
# The first (n // 4) * 3 entries of the shuffled index list N form the
# training set; the remaining entries form the test set.
cut = (n // 4) * 3
train_idx, test_idx = N[:cut], N[cut:]

X_train0, y_train0 = review2[train_idx], datay[train_idx]
X_test0, y_test0 = review2[test_idx], datay[test_idx]

# Only the first 200 columns of each vector are kept as model features.
X = np.array(X_train0)[:, :200]
y = np.array(y_train0)
testX = np.array(X_test0)[:, :200]

print(X.shape)
print(y.shape)
print(testX.shape)

In [None]:
# Regressor and the 5-way random 80/20 splitter used to evaluate it.
clf = XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=3)
ss = ShuffleSplit(n_splits=5, test_size=0.2, train_size=0.8, random_state=0)

# Fit on each split's training part and print the score on its held-out part.
# After the loop, `clf` holds the fit from the last split only.
for train_index, test_index in ss.split(X):
    X_train, Y_train = X[train_index], y[train_index]
    X_test, Y_test = X[test_index], y[test_index]
    clf.fit(X_train, Y_train)
    fold_score = clf.score(X_test, Y_test)
    print(fold_score)

In [None]:
# Predict ratings for the held-out test features. `clf` is the model left
# over from the final ShuffleSplit iteration, i.e. it was fitted on that
# split's training portion only.
y_pred = clf.predict(testX)

In [None]:
# Predictions and true ratings for the test rows, as plain Python lists.
PRED, ANS = list(y_pred), list(y_test0)

# Print the first ten of each for a quick side-by-side look.
print(PRED[:10])
print(ANS[:10])

In [None]:
import matplotlib.pyplot as plt

# Square scatter plot of predicted rating (y axis) against true rating (x axis).
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_xlabel('ANS', fontsize=12)
ax.set_ylabel('PRED', fontsize=12)
ax.set_title('RATING PREDICTION', fontsize=20)
ax.scatter(ANS, PRED)