In [None]:
# Install the Hugging Face `transformers` package into the notebook environment;
# stdout is redirected to /dev/null so the installer log does not fill the cell output.
!pip install transformers > /dev/null

In [None]:
import pandas as pd
import numpy as np
import os
import torch

import transformers
from transformers import BertTokenizer
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from xgboost import XGBClassifier

from tqdm import tqdm
tqdm.pandas()

In [None]:
class BertSequenceVectorizer:
    """Embed a text as a fixed-size vector using a pretrained BERT encoder.

    The returned vector is the final-layer hidden state of the first token
    of the encoded sequence (the position where the tokenizer places [CLS]).

    Parameters
    ----------
    model_name : str, default 'bert-base-uncased'
        Name passed to ``from_pretrained`` for both the tokenizer and the model.
    max_len : int, default 128
        Number of token ids fed to the model; longer inputs are truncated,
        shorter ones are right-padded.
    """

    def __init__(self, model_name: str = 'bert-base-uncased', max_len: int = 128):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # The model is only used for feature extraction, so make inference mode explicit.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the first-token embedding of ``sentence`` as a 1-D numpy array.

        Parameters
        ----------
        sentence : str
            Text to embed.

        Returns
        -------
        np.ndarray
            The last hidden state at sequence position 0, moved to the CPU.
        """
        # Let the tokenizer truncate: it keeps the closing special token in place,
        # whereas slicing the id list by hand would cut it off for long texts.
        token_ids = self.tokenizer.encode(
            sentence, max_length=self.max_len, truncation=True
        )
        pad_len = self.max_len - len(token_ids)

        # Right-pad with id 0; padded positions are excluded through the attention mask.
        inputs = token_ids + [0] * pad_len
        masks = [1] * len(token_ids) + [0] * pad_len

        inputs_tensor = torch.tensor([inputs], dtype=torch.long, device=self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long, device=self.device)

        # No gradients are needed for feature extraction; skipping the autograd
        # graph saves memory and time on every call.
        with torch.no_grad():
            bert_out = self.bert_model(
                input_ids=inputs_tensor, attention_mask=masks_tensor
            )

        # `.cpu()` is a no-op for tensors already on the CPU, so one code path
        # serves both devices.
        return bert_out['last_hidden_state'][0, 0].detach().cpu().numpy()

In [None]:
# Load the raw presidential-speeches table; the file is decoded as ISO-8859-1.
speech0 = pd.read_csv(
    '../input/united-states-presidential-speeches/presidential_speeches.csv',
    encoding="ISO-8859-1",
)

In [None]:
# Preview the first rows of the raw table.
speech0.head()

In [None]:
# Randomise the row order. The train/test split further down is positional
# (first ~80% of rows vs. the rest), so this shuffle determines which speeches
# land in each part; a fixed seed keeps the split and the reported scores
# reproducible across kernel restarts.
speech0 = shuffle(speech0, random_state=0)

In [None]:
# Drop the columns that are not used below, then discard every row that still
# has a missing value and renumber the index from 0.
speech1 = speech0.drop(columns=['Party', 'URL', 'Date', 'Speech Title', 'Summary'])
speech = speech1.dropna().reset_index(drop=True)

In [None]:
# Embed every transcript with the BERT vectorizer (with a tqdm progress bar)
# and store the resulting vector in a new 'ts_feature' column.
BSV = BertSequenceVectorizer()
speech['ts_feature'] = speech['Transcript'].progress_apply(BSV.vectorize)
speech.head()

In [None]:
# Sanity check: size of the cleaned table and shape of the first embedding.
print(speech.shape, speech['ts_feature'][0].shape, sep='\n')

In [None]:
# Distinct presidents in order of first appearance, and how many there are.
Name = speech['President'].unique()
print(Name, len(Name), sep='\n')

In [None]:
# Integer class ids 0..len(Name)-1, one per president.
N = list(range(len(Name)))

# president name -> class id, and class id -> president name
normal_mapping = {president: idx for idx, president in zip(N, Name)}
reverse_mapping = {idx: president for idx, president in zip(N, Name)}


def mapper(value):
    """Translate an integer class id back to the president's name."""
    return reverse_mapping[value]

In [None]:
# Encode each row's president as its integer class id.
label = [normal_mapping[president] for president in speech['President']]
print(label[:3])

In [None]:
# Expand the per-speech embedding vectors stored in speech['ts_feature'] into a
# numeric feature table: one row per speech, one integer-labelled column per
# embedding dimension.
# All vectors are stacked in a single call instead of writing the frame one
# cell at a time with `.loc`, which enlarges the DataFrame on every assignment
# and is extremely slow. The width now comes from the vectors themselves rather
# than a hard-coded 768. Values keep the dtype of the embedding arrays.
ts3 = pd.DataFrame(np.vstack(speech['ts_feature'].to_list()))

In [None]:
# Show the first two rows of the feature table.
ts3[:2]

In [None]:
# Feature table used for modelling and its number of rows.
data = ts3
n = data.shape[0]

In [None]:
# Positional hold-out split: the first (n // 10) * 8 rows are used for training,
# everything after that for testing. Features and labels are cut at the same row.
split_at = (n // 10) * 8
train_data, test_data = data[:split_at], data[split_at:]
train_label, test_label = label[:split_at], label[split_at:]

In [None]:
# Aliases for the training labels/features and the hold-out features.
y_train0, X_train0, X_test0 = train_label, train_data, test_data

In [None]:
# Convert the training features and labels to numpy arrays so they can be
# indexed with the integer index arrays produced by the splitter below.
X, y = map(np.array, (X_train0, y_train0))

In [None]:
# Gradient-boosted tree classifier over the embedding features.
# The target has one class per president, so the objective is declared as
# multiclass ('multi:softprob') rather than 'binary:logistic', which does not
# describe this task.
clf = XGBClassifier(
    objective='multi:softprob',
    max_depth=3,
    n_estimators=1000,
    learning_rate=0.01,
)

In [None]:
# Repeated random 80/20 splits of the training set to estimate accuracy.
ss = ShuffleSplit(n_splits=5, train_size=0.8, test_size=0.2, random_state=0)

for train_index, test_index in ss.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = y[train_index], y[test_index]
    clf.fit(X_train, Y_train)
    print(clf.score(X_test, Y_test))

# Each fit above replaces the previous one, so after the loop `clf` would only
# have seen the last fold's 80% subset. Refit on the whole training set so the
# model used for the hold-out prediction below is trained on all available rows.
# NOTE(review): classes that are rare may be absent from a fold or from the
# training slice; confirm the installed xgboost version accepts that.
clf.fit(X, y)

In [None]:
# Predict class ids for the held-out rows.
holdout_features = np.array(X_test0)
y_pred = clf.predict(holdout_features)

In [None]:
# Sanity check: hold-out feature shape next to the prediction shape.
print(X_test0.shape, y_pred.shape, sep='\n')

In [None]:
# Ground-truth class ids for the hold-out rows and the matching predictions.
ANS, PRED = np.array(test_label), y_pred

In [None]:
# Fraction of hold-out speeches attributed to the correct president.
accuracy = accuracy_score(y_true=ANS, y_pred=PRED)
print(accuracy)