In [77]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, BertConfig
import torch.utils.data as data_utils
import re
#from pytorch_pretrained_bert import BertTokenizer, BertModel, BertConfig
# def pad_sequence(sequence,max_length):
#     if len(sequence)<max_length:
#         sequence += (max_length-len(sequence))*[0]
#     return sequence



def embeddings_from_dataset(X, tokenizer, bert_model):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    #X = [tokenizer.tokenize('[CLS] ' + sent + ' [SEP]') for sent in X] # Appending [CLS] and [SEP] tokens - this probably can be done in a cleaner way
    #X_test = [tokenizer.tokenize('[CLS] ' + sent + ' [SEP]') for sent in X_test] # Appending [CLS] and [SEP] tokens - this probably can be done in a cleaner way
    #X = [text[:512] if len(text)>512 else text for text in X]
    #X_test = [text[:512] if len(text)>512 else text for text in X_test]
    #X_tokens = [tokenizer.convert_tokens_to_ids(sent) for sent in X]
    encoded_dict = tokenizer.batch_encode_plus([text_preprocessing(post) for post in X],
            padding=True,
            add_special_tokens=True,
            max_length= MAX_LEN,             
            pad_to_max_length=True,
            truncation = True,
            return_tensors='pt',
            return_attention_mask=True)

    token_ids = encoded_dict.get('input_ids')
    attention_mask = encoded_dict.get('attention_mask')
    tokenization_data = data_utils.TensorDataset(token_ids, attention_mask)
    batch_size = 256
    token_data_loader = data_utils.DataLoader(tokenization_data, batch_size=batch_size)

    train_embeddings = []
    #test_embeddings = []

    #results = torch.zeros((len(X_train_tokens), bert_model.config.hidden_size)).long()
    with torch.no_grad():
        for batch_no, data in enumerate(token_data_loader):
            if batch_no%10 == 0:
                print(f"Processed_data : {batch_no*batch_size}")
            
            ids = data[0].to(device)
            masks = data[1].to(device)
            outputs = bert_model(input_ids = ids, attention_mask = masks)
            embeddings = outputs[0][:, 0, :]
            #embeddings = outputs[0][0][0] #This only takes CLS embedding
            train_embeddings.append(embeddings.cpu())
            #results[stidx] = embeddings.cpu()
        
        # for stidx in range(len(X_test)):
        #     tokens = X_test_tokens[stidx]
        #     tokens_t = torch.LongTensor(tokens)#.to(device)
        #     segment_t = torch.LongTensor([1] * len(tokens))#.to(device)
        #     outputs = bert_model(tokens_t.unsqueeze(0),segment_t.unsqueeze(0))
        #     embeddings = outputs[0][0][0] #This only takes CLS embedding
        #     test_embeddings.append(embeddings.cpu())
    return torch.cat(train_embeddings)

def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text
# Load the BERT tokenizer



def train_model(model,optimizer,criterion,loader):
    model.train()
    total_loss = 0
    correct_preds = 0
    for data in loader:
        x = data[0]
        label = data[1]
        optimizer.zero_grad()
        pred = model(x)
        loss = criterion(pred,label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        prediction = np.where(pred>0.5, 1,0)
        correct_preds += torch.sum(torch.tensor(prediction) == label).item()
    
    return total_loss/len(loader.dataset), correct_preds/len(loader.dataset)    

def test_model(model,criterion,loader):
    model.eval()
    total_loss = 0
    correct_preds = 0
    for data in loader:
        x = data[0]
        label = data[1]
        pred = model(x)
        loss = criterion(pred,label)
        total_loss += loss.item()
        prediction = np.where(pred>0.5, 1,0)
        correct_preds += torch.sum(torch.tensor(prediction) == label).item()
    
    return total_loss/len(loader.dataset), correct_preds/len(loader.dataset)
    
class Bert_Net(nn.Module):
    def __init__(self, bert_model, tokenizer, input_size = 768):
        super().__init__()
        self.bert_model = bert_model
        self.tokenizer = tokenizer
        self.fc1 = nn.Linear(input_size,100)
        self.fc2 = nn.Linear(100,50)
        self.fc3 = nn.Linear(50,1)

    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

    def predict(self,x):
        embs = embeddings_from_dataset([x],self.tokenizer, self.bert_model)
        y_hat = self.forward(embs)
        preds = np.where(y_hat>0.5, 1, 0)
        return preds[0][0]





In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_LEN = 64


def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data, tokenizer, max_len=MAX_LEN):
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length= max_len,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            truncation = True,
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs, len_train, freeze_bert=False, pretrained_model = None, side_feature_size = 0):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    if pretrained_model == None:
        bert_classifier = BertClassifier(freeze_bert=freeze_bert, side_feature_size=side_feature_size)
    else:
        bert_classifier = pretrained_model

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len_train * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader,optimizer,scheduler, val_dataloader=None, epochs=4, evaluation=False, use_sf = False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...

        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            model.zero_grad()

            if use_sf:
                b_input_ids, b_attn_mask, b_sf, b_labels = tuple(t.to(device) for t in batch)
                logits = model(b_input_ids, b_attn_mask, b_sf)
            else:
                b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
                logits = model(b_input_ids, b_attn_mask)

            # Zero out any previously calculated gradients
            

            # Perform a forward pass. This will return logits.
            

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader, use_sf)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader, use_sf):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        if use_sf:
            b_input_ids, b_attn_mask, b_sf, b_labels = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                logits = model(b_input_ids, b_attn_mask, b_sf)
        else:
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                logits = model(b_input_ids, b_attn_mask)


        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy


# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, side_feature_size=0, freeze_bert=False):

        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768+side_feature_size, 50, 2

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask, side_features=None):

        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]
        if side_features != None:
            classifier_input = torch.cat((last_hidden_state_cls, side_features),dim=1)
        else:
            classifier_input = last_hidden_state_cls
        # Feed input to classifier to compute logits
        logits = self.classifier(classifier_input)

        return logits



In [3]:
#CLASSİCAL METHODS ONLY FEATURES START HERE
dataset = pd.read_csv("LIWC-results.csv")
dataset = dataset.drop("Unnamed: 0",1)
dataset = dataset.drop("Unnamed: 0.1",1)
#dataset = dataset.drop("text",1)
dataset = dataset.dropna()
labels = dataset["class"].values
#datapoints = dataset['text'].values
labels = labels.astype("int")
datapoints = dataset[['text','Clout','WC','WPS','death','Tone']]

from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(datapoints,labels,test_size=0.3,shuffle= True, stratify=labels)
print("len train: ",len(X_train),"len test: ",len(X_val))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train['text'].values,tokenizer)
val_inputs, val_masks = preprocessing_for_bert(X_val['text'].values,tokenizer)

side_features_train = X_train.drop('text',1)
side_features_val = X_val.drop('text',1)

train_labels = torch.LongTensor(y_train)
val_labels = torch.LongTensor(y_val)



  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


len train:  162439 len test:  69618
Tokenizing data...




In [78]:
#CLASSİCAL METHODS ONLY FEATURES START HERE
dataset = pd.read_csv("LIWC-results.csv")
dataset = dataset.drop("Unnamed: 0",1)
dataset = dataset.drop("Unnamed: 0.1",1)
#dataset = dataset.drop("text",1)
dataset = dataset.dropna()
labels = dataset["class"].values
#datapoints = dataset['text'].values
labels = labels.astype("int")
datapoints = dataset

from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(datapoints,labels,test_size=0.3,shuffle= True, stratify=labels)
print("len train: ",len(X_train),"len test: ",len(X_val))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = 'bert-base-uncased'
best_bert_model = torch.load('trained_all_side5.pt')
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)    
bert_model = BertModel.from_pretrained(model_name, config=config)
bert_model = bert_model.to(device)
bert_model.load_state_dict(best_bert_model.bert.state_dict())
bert_model.eval()

# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_embeddings = embeddings_from_dataset(X_train['text'].values,tokenizer, bert_model)
val_embeddings = embeddings_from_dataset(X_val['text'].values,tokenizer, bert_model)

side_features_train = X_train.drop('text',1)
side_features_val = X_val.drop('text',1)

train_labels = torch.LongTensor(y_train)
val_labels = torch.LongTensor(y_val)


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


len train:  162439 len test:  69618
Tokenizing data...
Processed_data : 0
Processed_data : 2560
Processed_data : 5120
Processed_data : 7680
Processed_data : 10240
Processed_data : 12800
Processed_data : 15360
Processed_data : 17920
Processed_data : 20480
Processed_data : 23040
Processed_data : 25600
Processed_data : 28160
Processed_data : 30720
Processed_data : 33280
Processed_data : 35840
Processed_data : 38400
Processed_data : 40960
Processed_data : 43520
Processed_data : 46080
Processed_data : 48640
Processed_data : 51200
Processed_data : 53760
Processed_data : 56320
Processed_data : 58880
Processed_data : 61440
Processed_data : 64000
Processed_data : 66560
Processed_data : 69120
Processed_data : 71680
Processed_data : 74240
Processed_data : 76800
Processed_data : 79360
Processed_data : 81920
Processed_data : 84480
Processed_data : 87040
Processed_data : 89600
Processed_data : 92160
Processed_data : 94720
Processed_data : 97280
Processed_data : 99840
Processed_data : 102400
Processe



In [79]:
torch.save({'train_inputs':train_inputs,'train_masks':train_masks,'train_labels':train_labels,'side_features_train':side_features_train,\
    'val_inputs':val_inputs,'val_masks':val_masks,'val_labels':val_labels,'side_features_val':side_features_val},'processed_dataset_sf.pt')

In [80]:
torch.save({'train_embeddings':train_embeddings, 'train_labels':train_labels,'side_features_train':side_features_train,\
    'val_embeddings':val_embeddings,'val_labels':val_labels,'side_features_val':side_features_val},'embedding_dataset_sf.pt')

In [81]:
data = torch.load('embedding_dataset_sf.pt')
train_embeddings = data['train_embeddings']
train_labels = data['train_labels']
side_features_train = data['side_features_train']
val_embeddings = data['val_embeddings']
val_labels = data['val_labels']
side_features_val = data['side_features_val']

In [3]:
data = torch.load('processed_dataset_sf.pt')
train_inputs = data['train_inputs']
train_masks = data['train_masks']
train_labels = data['train_labels']
side_features_train = data['side_features_train']
val_inputs = data['val_inputs']
val_masks = data['val_masks']
val_labels = data['val_labels']
side_features_val = data['side_features_val']

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32
# Create the DataLoader for our training set
train_data = data_utils.TensorDataset(train_inputs, train_masks, torch.Tensor(side_features_train.values), train_labels)
train_sampler = data_utils.RandomSampler(train_data)
train_dataloader = data_utils.DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = data_utils.TensorDataset(val_inputs, val_masks, torch.Tensor(side_features_val.values), val_labels)
val_sampler = data_utils.SequentialSampler(val_data)
val_dataloader = data_utils.DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [4]:
best_trained = torch.load('trained_all_best.pt')
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=1,len_train=len(train_dataloader),  freeze_bert=False, side_feature_size=5)
bert_classifier.bert.load_state_dict(best_trained.bert.state_dict())
train(bert_classifier, train_dataloader, optimizer, scheduler, val_dataloader, epochs=1, evaluation=True, use_sf=True)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.469362   |     -      |     -     |   6.95   
   1    |   40    |   0.176841   |     -      |     -     |   6.06   
   1    |   60    |   0.110799   |     -      |     -     |   6.16   
   1    |   80    |   0.087686   |     -      |     -     |   6.22   
   1    |   100   |   0.115535   |     -      |     -     |   6.23   
   1    |   120   |   0.060649   |     -      |     -     |   6.26   
   1    |   140   |   0.102887   |     -      |     -     |   6.31   
   1    |   160   |   0.153905   |     -      |     -     |   6.22   
   1    |   180   |   0.090460   |     -      |     -     |   6.32   
   1    |   200   |   0.082137   |     -      |     -     |   6.32   
   1    |   220   |   0.089096   |     -      |     -     |   6.51   
   1    |   240   |   0.111459   |     -      |     -     |   6.49   


In [5]:
torch.save(bert_classifier, 'trained_all_side5.pt')

In [7]:
torch.save(bert_classifier,'trained_all_t.pt')

In [15]:
torch.save({'train_inputs':train_inputs,'train_masks':train_masks,'train_labels':train_labels, 'val_inputs':val_inputs,'val_masks':val_masks,'val_labels':val_labels},'processed_dataset.pt')

In [6]:
corpus_reddit = pd.read_csv('reddit_corpus_agree.csv')
corpus_reddit['label'] = 0
corpus_reddit['label'].loc[(corpus_reddit['cls']=='Risk')] = 1 
del corpus_reddit['cls']

batch_size = 1
texts, labels =  corpus_reddit['text'], corpus_reddit['label']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

test_inputs, test_masks = preprocessing_for_bert(texts,tokenizer,max_len=512)
test_labels = torch.LongTensor(labels)

test_data = data_utils.TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = data_utils.SequentialSampler(test_data)
test_dataloader = data_utils.DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [11]:
best_without_sf = torch.load('trained_all_best.pt')
best_without_sf.bert.load_state_dict(bert_classifier.bert.state_dict())

<All keys matched successfully>

In [17]:
evaluate(best_without_sf, test_dataloader, use_sf=False)

(0.33895164355635643, 88.88888888888889)

In [18]:
with torch.no_grad():
    false_p = 0
    for data,mask, label in test_dataloader:
        logit = best_without_sf(data.to(device), mask.to(device))
        pred = torch.argmax(logit, dim=1).flatten()

        if pred.cpu() != label:
            false_p+=1
            print(f'prediction: {pred}')
            print(f'true_value: {label}')
            print(tokenizer.decode(data.cpu()[0]))

tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], device='cuda:0')
tensor([1])
tensor([1], devi

In [91]:
dataset = pd.read_csv("test_LIWC.csv")
#dataset = dataset.drop("Unnamed: 0",1)
#dataset = dataset.drop("Unnamed: 0.1",1)
#dataset = dataset.drop("text",1)
dataset = dataset.dropna()
labels = dataset["cls"].values
#datapoints = dataset['text'].values
labels = labels.astype("int")
side_features = dataset[['Clout','WC','WPS','death','Tone']]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#best_bert_model = torch.load('trained_all_side5.pt')
best_bert_model = torch.load('trained_all_best.pt')
sf_bert_model = torch.load('trained_all_side5.pt')
best_bert_model.bert.load_state_dict(sf_bert_model.bert.state_dict())
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

test_inputs, test_masks = preprocessing_for_bert(dataset['text'],tokenizer,max_len=512)
test_labels = torch.LongTensor(labels)
batch_size = 16
#test_data = data_utils.TensorDataset(test_inputs, test_masks, torch.Tensor(side_features.values), test_labels)
test_data = data_utils.TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = data_utils.SequentialSampler(test_data)
test_dataloader = data_utils.DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)



In [92]:
evaluate(best_bert_model, test_dataloader, use_sf=False)

(0.49326380163431166, 81.6)

In [35]:

corpus_reddit = pd.read_csv('reddit_corpus_agree.csv')
corpus_reddit['label'] = 0
corpus_reddit['label'].loc[(corpus_reddit['cls']=='Risk')] = 1 
del corpus_reddit['cls']

batch_size = 1
texts, labels =  corpus_reddit['text'], corpus_reddit['label']
#texts = [" ".join(text.split()[:512]) if len(text.split())>512 else text for text in texts]


X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.1, stratify=corpus_reddit['label'])
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)    
bert_model = BertModel.from_pretrained(model_name, config=config)
bert_model = bert_model.to(device)
bert_model.eval()

train_embeddings = embeddings_from_dataset(X_train, tokenizer, bert_model)
test_embeddings = embeddings_from_dataset(X_test, tokenizer, bert_model)

train_dataset = data_utils.TensorDataset(train_embeddings , torch.FloatTensor(y_train.values).view(-1,1))
test_dataset = data_utils.TensorDataset(test_embeddings , torch.FloatTensor(y_test.values).view(-1,1))

train_loader = data_utils.DataLoader(train_dataset,batch_size=10,shuffle=True)
test_loader = data_utils.DataLoader(test_dataset,batch_size=1)

my_NN = Bert_Net(bert_model, tokenizer)
optimizer = torch.optim.Adam(my_NN.parameters(),lr=0.01)
criterion = nn.BCELoss()

for epoch in range(10):
    train_loss, train_accuracy = train_model(my_NN,optimizer,criterion,train_loader)
    print(f"Epoch {epoch+1}, train_loss: {train_loss}, train_accuracy: {train_accuracy}")
    test_loss, test_accuracy = test_model(my_NN,criterion,test_loader)
    print(f"test_loss: {test_loss}, test_accuracy: {test_accuracy}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


TypeError: __init__() got multiple values for argument 'batch_size'

In [2]:
#CLASSİCAL METHODS ONLY FEATURES START HERE
dataset = pd.read_csv("LIWC-results.csv")
dataset = dataset.drop("Unnamed: 0",1)
dataset = dataset.drop("Unnamed: 0.1",1)
#dataset = dataset.drop("text",1)
dataset = dataset.dropna()
labels = dataset["class"]
datapoints = dataset['text']
labels = labels.astype("int")


from sklearn.model_selection import train_test_split
train_set,test_set,train_labels,test_labels = train_test_split(datapoints,labels,test_size=0.2,shuffle= True, stratify=labels)
print("len train: ",len(train_set),"len test: ",len(test_set))

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


len train:  185645 len test:  46412


tensor([[1., 2., 3.],
        [4., 5., 6.],
        [1., 2., 1.],
        [2., 1., 1.],
        [1., 1., 1.],
        [0., 0., 0.]])

In [55]:
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)    
bert_model = BertModel.from_pretrained(model_name, config=config)
bert_model = bert_model.to(device)
bert_model.eval()

train_embeddings = embeddings_from_dataset(train_set.iloc[:2000], tokenizer, bert_model)
test_embeddings = embeddings_from_dataset(test_set.iloc[:400], tokenizer, bert_model)

train_dataset = data_utils.TensorDataset(train_embeddings , torch.FloatTensor(train_labels.iloc[:2000].values).view(-1,1))
test_dataset = data_utils.TensorDataset(test_embeddings , torch.FloatTensor(test_labels.iloc[:400].values).view(-1,1))

train_loader = data_utils.DataLoader(train_dataset,batch_size=10,shuffle=True)
test_loader = data_utils.DataLoader(test_dataset,batch_size=1)



data no : 0
data no : 100
data no : 200
data no : 300
data no : 400
data no : 500
data no : 600
data no : 700
data no : 800
data no : 900
data no : 1000
data no : 1100
data no : 1200
data no : 1300
data no : 1400
data no : 1500
data no : 1600
data no : 1700
data no : 1800
data no : 1900
data no : 0
data no : 100
data no : 200
data no : 300


device(type='cpu')

In [58]:
my_NN = Bert_Net(bert_model, tokenizer)
optimizer = torch.optim.Adam(my_NN.parameters(),lr=0.01)
criterion = nn.BCELoss()

for epoch in range(30):
    train_loss, train_accuracy = train_model(my_NN,optimizer,criterion,train_loader)
    print(f"Epoch {epoch+1}, train_loss: {train_loss}, train_accuracy: {train_accuracy}")
    test_loss, test_accuracy = test_model(my_NN,criterion,test_loader)
    print(f"test_loss: {test_loss}, test_accuracy: {test_accuracy}")

Epoch 1, train_loss: 0.03305102943629026, train_accuracy: 0.868
test_loss: 0.26898272114773136, test_accuracy: 0.89
Epoch 2, train_loss: 0.022328932769130914, train_accuracy: 0.917
test_loss: 0.23238260208097927, test_accuracy: 0.8975
Epoch 3, train_loss: 0.02041545647964813, train_accuracy: 0.922
test_loss: 0.2599868453106285, test_accuracy: 0.9075
Epoch 4, train_loss: 0.018962690729647874, train_accuracy: 0.933
test_loss: 0.28062685343033084, test_accuracy: 0.8975
Epoch 5, train_loss: 0.01739352177793626, train_accuracy: 0.9345
test_loss: 0.2798020233371517, test_accuracy: 0.9
Epoch 6, train_loss: 0.015553986871382222, train_accuracy: 0.9455
test_loss: 0.24166877980842172, test_accuracy: 0.9
Epoch 7, train_loss: 0.013869951889500954, train_accuracy: 0.9465
test_loss: 0.301289434963616, test_accuracy: 0.8875
Epoch 8, train_loss: 0.014024376036133617, train_accuracy: 0.9485
test_loss: 0.2713669410789562, test_accuracy: 0.9125
Epoch 9, train_loss: 0.014319379289678181, train_accuracy: 0

In [41]:
torch.sum(torch.tensor(ne)==to).item()

2

In [16]:
for data, label in test_loader:
    print(label.item()==[[1]].item())

AttributeError: 'list' object has no attribute 'item'

In [29]:
train_labels.iloc[2100]

0

In [217]:
X_train.iloc[8]

'vote up if you think i should live (ha)\r\ntitle is some humour because intergalactic laws dont matter right now\r\n\r\ni dont think i will be alive much longer, a month away seems very far. my last week has been very up and down and isolated and i dont want to be around anyone\r\n\r\nthe one thing that calms me is when i think to myself and decide that i will kill myself, it is strange because then things are very temporarily a little easier to handle.\r\n\r\ni had written up a very long explanation of my life before but it somehow got lost.\r\n\r\nthe only reason i havent yet is guilt of how it will make my parents feel and sister feel. i could see one or more of them killing themselves after or at minimum the rest of their lives being horrible. this world has hurt me so much and my family and they dont deserve to be hurt even more.\r\n\r\ni wont be letting anyone know. part of what hurt me in the past were medications doctors experimented on me. everything made things worse and the

In [218]:
my_NN.predict(X_train.iloc[8])

array([[1]])

In [185]:

deneme = embeddings_from_dataset([X_test.iloc[-6]],tokenizer,bert_model)
with torch.no_grad():
    print(my_NN(deneme))



tensor([[0.9769]])


120    1
54     1
48     1
159    1
135    1
      ..
145    0
59     1
32     1
62     0
24     1
Name: label, Length: 153, dtype: int64

In [173]:
X_train.values[22]

'My secret suicidal thoughts are getting worse. It\'s getting harder to disobey. And I dont want to tell anyone\r\nI\'m one of those super annoying people pleaser types. I\'m constantly doing favors. Listening to other people\'s problems. And forever smiling and trying to keep the positive vibes going.\r\n\r\nI\'m worse than just all that though. Because im an airhead and I dont have good social skills. I\'m always saying or doing something stupid. And it makes me more awkward. I act like it\'s all good and I\'m happy and it doesnt phase me. But that is so far from the truth. \r\n\r\nEvery single day I\'m plagued by thoughts of how weak I am and how I\'m annoying and how I\'m a bitch and "I need to die" over and over and over. Which turns into "I wanna kill myself" over and over. It\'s getting worse every day. I dont expect help. I just needed a place to vent.'

In [172]:
y_train.values[22]

1