In [1]:
import psutil
import humanize
import os
import GPUtil as GPU

import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 
import sklearn
from sklearn.metrics import log_loss
from sklearn.metrics import *
from sklearn.model_selection import *
import re
import random
import torch
pd.options.display.max_colwidth = 200

In [2]:
def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

seed_all(2)

In [3]:
import pandas as pd

In [4]:
democrats = pd.read_parquet("../10_datasets/democrats")
republicans = pd.read_parquet("../10_datasets/neutral.parquet")
neutral = pd.read_parquet("../10_datasets/republican.parquet")
df = pd.concat([democrats, republicans, neutral]).reset_index(drop=True)

In [5]:
train = df[["total_post","subreddit"]]

In [6]:
train.subreddit.unique()

array(['democrats', 'NeutralPolitics', 'Republican'], dtype=object)

In [7]:
label_cols = ['democrats', 'NeutralPolitics', 'Republican']
l = ['democrats', 'NeutralPolitics', 'Republican']
train['label']=train.subreddit.astype('category')
Y = train.label.cat.codes
train['label']=Y
print(Y.shape)


(14506,)


In [8]:
def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype='uint8')[y]


In [9]:
Y = to_categorical(Y, 3)

In [10]:
Y.shape

(14506, 3)

In [11]:
print(Y[0:5])

[[0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]]


In [12]:
for i in range(len(l)) :     
     train[l[i]] = Y[:,i]

In [14]:
%%time
err=[]
y_pred_tot=[]

fold=StratifiedKFold(n_splits=5, shuffle=True, random_state=1997)
i=1
for train_index, test_index in fold.split(train,train['label']):
    train1_trn, train1_val = train.iloc[train_index], train.iloc[test_index]
    model = ClassificationModel('roberta', 'roberta-base', use_cuda=False,num_labels=4, args={
                                                                         'train_batch_size':16,
                                                                         'reprocess_input_data': True,
                                                                         'overwrite_output_dir': True,
                                                                         'fp16': False,
                                                                         'do_lower_case': False,
                                                                         'num_train_epochs': 4,
                                                                         'max_seq_length': 128,
                                                                         'regression': False,
                                                                         'manual_seed': 1997,
                                                                         "learning_rate":2e-5,
                                                                         'weight_decay':0,
                                                                         "save_eval_checkpoints": True,
                                                                         "save_model_every_epoch": False,
                                                                         "silent": True})
    model.train_model(train1_trn)
    raw_outputs_val = model.eval_model(train1_val)[1]
    raw_outputs_vals = softmax(raw_outputs_val,axis=1)
    print(f"Log_Loss: {log_loss(train1_val['label'], raw_outputs_vals)}")
    err.append(log_loss(train1_val['label'], raw_outputs_vals))

Downloading: 100%|██████████| 478M/478M [00:19<00:00, 25.5MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base a

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

ValueError: too many dimensions 'str'

In [16]:
train1_trn

Unnamed: 0,total_post,subreddit,label,democrats,NeutralPolitics,Republican
0,"There's no such thing as free healthcare. Someone has to pay for it. There is, however, something called single-payer healthcare, which is what Medicare For All is - all healthcare is paid for by ...",democrats,2,0,0,1
1,Figures about the deployment of tests ? Trump's lies about the seriousness of this virus ? \n\nOr just say you vehemently disagree and will avoid discussing for sake of harmony,democrats,2,0,0,1
2,"Money. Personally, I believe the Republicans have been setting this exact system up for the past thirty years. I don't think they care at all about your right to vote or your body or your marriage...",democrats,2,0,0,1
3,"I have family members who get their hair cut.\n\nDoes that mean they don't wear a mask, and think the coronavirus is a hoax???\n\nHow are they hypocrites?\n\nIn MOST of the country salons are open...",democrats,2,0,0,1
4,"Stuttering doesn’t make someone a bad person. Also, if look in the DSM under Narcissistic Personality Disorder, Trump fits all the points and therefore does have an “unsound mind.” Obama/Biden wer...",democrats,2,0,0,1
...,...,...,...,...,...,...
14500,"Dark side After many years, my wife finally decided, without me prompting her, to leave the dark side and register Republican.",Republican,1,0,1,0
14501,"What is it with liberals calling out certain conservatives to go fight in the military, when they would never even step foot in the recruiters office?",Republican,1,0,1,0
14503,The Left Lack of Understanding to the Situation in Iraq is Abhorrent Just head on over to the r/politics sub-reddit and see for yourself.\n\nThey think Trump and anyone who supports or votes for h...,Republican,1,0,1,0
14504,"(Serious) 18 year old here, can you guys tell me about what makes the republican party better than the democrats?",Republican,1,0,1,0


In [17]:
train1_val

Unnamed: 0,total_post,subreddit,label,democrats,NeutralPolitics,Republican
6,"Dude, none of us are impressed by this kind of nonsense. If you and AOC want to convince us, do so by actually getting shit done instead of landing sick burns on twitter.",democrats,2,0,0,1
9,"I'm not really disagreeing with you, but exit polling has shown that even in this election roughly 30-35% of Democratic voters care more about someone who aligns with their views instead of the pe...",democrats,2,0,0,1
18,"""You don't need that money for food, only nobles such as I should have money."" \n\n\nGood God, I hope this is a ""Let Them Eat Cake"" moment for someone",democrats,2,0,0,1
25,It’ll be even emptier next time when half those people get sick and are unable to attend his next bullshit session.,democrats,2,0,0,1
32,Even the woman laughed. Stop making things out of nothing. There are plenty of other real things you can harp on.,democrats,2,0,0,1
...,...,...,...,...,...,...
14472,"Just wanted you all to know if you say anything remotely negative or off color about the Obamas in r /pics, you will get permanently banned like me.",Republican,1,0,1,0
14483,What is the GOP’s justification for denying witnesses and withholding evidence at Trump’s impeachment?,Republican,1,0,1,0
14487,Very mad about Colin Kaepernick disrespecting our BRAVE TROOPS.... Quit whining! Just stand up and sing the National Anthem!,Republican,1,0,1,0
14498,What is the most evil thing or the worst thing Obama's ever done? You can list more things than just one. We all know he has done many stupid things. \nAnd also the best thing or the least bad thi...,Republican,1,0,1,0
