In [1]:
from visdom import Visdom

class VisdomLinePlotter(object):
    """Plots to Visdom"""
    def __init__(self, env_name='main'):
        self.viz = Visdom()
        self.env = env_name
        self.plots = {}
    def plot(self, var_name, split_name, title_name, x, y):
        if var_name not in self.plots:
            self.plots[var_name] = self.viz.line(X=np.array([x,x]), Y=np.array([y,y]), env=self.env, opts=dict(
                legend=[split_name],
                title=title_name,
                xlabel='Epochs',
                ylabel=var_name
            ))
        else:
            self.viz.line(X=np.array([x]), Y=np.array([y]), env=self.env, win=self.plots[var_name], name=split_name, update = 'append')
            
            
    
vis = VisdomLinePlotter()

Setting up a new session...


In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from pytorch_transformers import AdamW
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

Using TensorFlow backend.


In [3]:
torch.cuda.is_available()

True

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)
# device="cpu"

'Tesla V100-PCIE-16GB'

In [5]:
import numpy as np
import math
import pandas as pd

import pandas as pd
import numpy as np
from scipy.stats import randint
import seaborn as sns # used for plot interactive graph. 
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from sklearn.feature_extraction.text import *

from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier

In [6]:
df = pd.read_csv('reddit_train.csv')
df2 = pd.read_csv('reddit_test.csv')
df = df.sample(1000, random_state=1).copy()
df2 = df2.sample(1000, random_state=1).copy()
df.head()

Unnamed: 0,id,comments,subreddits
6670,6670,Yeah but euron's about to bring cersei tyrion ...,gameofthrones
49567,49567,All of his videos are sarcastic and funny...hi...,conspiracy
50796,50796,I love those scenes but it wouldn't have made ...,movies
22310,22310,You do get a smidge of hp for every point of c...,wow
54037,54037,New MMORPG lets you play as someone playing a ...,wow


In [7]:
df['category_id'], mapping = df['subreddits'].factorize()

In [8]:
# Create sentence and label lists
sentences = df.comments.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = [sentence + " [SEP] [CLS]" for sentence in sentences]
labels = df.category_id.values

In [9]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased', do_lower_case=True)

In [10]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['▁yeah', '▁but', '▁euro', 'n', "'", 's', '▁about', '▁to', '▁bring', '▁', 'cer', 's', 'ei', '▁', 'ty', 'rion', '▁as', '▁a', '▁gift', '▁[', 's', 'ep', ']', '▁[', 'cl', 's', ']']


In [11]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 512

In [12]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [13]:
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [14]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [15]:
# Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.05)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.05)

In [16]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [17]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 2

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [18]:
model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased", num_labels=20)
model.cuda()

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 1024)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=1024, out_features=4096, bias=True)
          (layer_2): Linear(in_features=4096, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((1024,),

In [19]:

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]



In [20]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=2e-5)

In [21]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [22]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4
step = 0

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    
    
    # Training
    
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    # Train the data for one epoch
    for batch in tqdm(train_dataloader,desc='batch',leave=False):
        step = step + 1
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        train_loss_set.append(loss.item()) 
        vis.plot('loss', 'train_loss', 'Loss',step,loss.item())   
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        
        
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))
        
        
    # Validation

    if(step%500==0):

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        # Tracking variables 
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # Telling the model not to compute or store gradients, saving memory and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                logits = output[0]
            
            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

        print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
        vis.plot('accuracy', 'val_acc', 'val_acc',step,eval_accuracy/nb_eval_steps)
        torch.save(model, 'random_model.pt')

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]
batch:   0%|          | 0/475 [00:00<?, ?it/s][A
batch:   0%|          | 1/475 [00:00<04:56,  1.60it/s][A
batch:   0%|          | 2/475 [00:01<04:54,  1.61it/s][A
batch:   1%|          | 3/475 [00:01<04:54,  1.60it/s][A
batch:   1%|          | 4/475 [00:02<04:53,  1.61it/s][A
batch:   1%|          | 5/475 [00:03<04:53,  1.60it/s][A
batch:   1%|▏         | 6/475 [00:03<04:52,  1.60it/s][A
batch:   1%|▏         | 7/475 [00:04<04:49,  1.62it/s][A
batch:   2%|▏         | 8/475 [00:04<04:49,  1.61it/s][A
batch:   2%|▏         | 9/475 [00:05<04:47,  1.62it/s][A
batch:   2%|▏         | 10/475 [00:06<04:46,  1.62it/s][A
batch:   2%|▏         | 11/475 [00:06<04:46,  1.62it/s][A
batch:   3%|▎         | 12/475 [00:07<04:45,  1.62it/s][A
batch:   3%|▎         | 13/475 [00:08<04:46,  1.61it/s][A
batch:   3%|▎         | 14/475 [00:08<04:45,  1.61it/s][A
batch:   3%|▎         | 15/475 [00:09<04:45,  1.61it/s][A
batch:   3%|▎         | 16/47

batch:  29%|██▉       | 137/475 [01:24<03:30,  1.61it/s][A
batch:  29%|██▉       | 138/475 [01:25<03:29,  1.61it/s][A
batch:  29%|██▉       | 139/475 [01:26<03:29,  1.60it/s][A
batch:  29%|██▉       | 140/475 [01:26<03:28,  1.60it/s][A
batch:  30%|██▉       | 141/475 [01:27<03:29,  1.60it/s][A
batch:  30%|██▉       | 142/475 [01:27<03:29,  1.59it/s][A
batch:  30%|███       | 143/475 [01:28<03:27,  1.60it/s][A
batch:  30%|███       | 144/475 [01:29<03:26,  1.60it/s][A
batch:  31%|███       | 145/475 [01:29<03:25,  1.60it/s][A
batch:  31%|███       | 146/475 [01:30<03:25,  1.60it/s][A
batch:  31%|███       | 147/475 [01:31<03:24,  1.60it/s][A
batch:  31%|███       | 148/475 [01:31<03:23,  1.61it/s][A
batch:  31%|███▏      | 149/475 [01:32<03:23,  1.60it/s][A
batch:  32%|███▏      | 150/475 [01:32<03:23,  1.60it/s][A
batch:  32%|███▏      | 151/475 [01:33<03:21,  1.61it/s][A
batch:  32%|███▏      | 152/475 [01:34<03:21,  1.60it/s][A
batch:  32%|███▏      | 153/475 [01:34<0

batch:  57%|█████▋    | 273/475 [02:50<02:23,  1.41it/s][A
batch:  58%|█████▊    | 274/475 [02:51<02:24,  1.39it/s][A
batch:  58%|█████▊    | 275/475 [02:51<02:30,  1.33it/s][A
batch:  58%|█████▊    | 276/475 [02:52<02:28,  1.34it/s][A
batch:  58%|█████▊    | 277/475 [02:53<02:25,  1.36it/s][A
batch:  59%|█████▊    | 278/475 [02:54<02:27,  1.34it/s][A
batch:  59%|█████▊    | 279/475 [02:54<02:25,  1.35it/s][A
batch:  59%|█████▉    | 280/475 [02:55<02:25,  1.34it/s][A
batch:  59%|█████▉    | 281/475 [02:56<02:26,  1.32it/s][A
batch:  59%|█████▉    | 282/475 [02:57<02:22,  1.35it/s][A
batch:  60%|█████▉    | 283/475 [02:57<02:19,  1.38it/s][A
batch:  60%|█████▉    | 284/475 [02:58<02:16,  1.40it/s][A
batch:  60%|██████    | 285/475 [02:59<02:19,  1.36it/s][A
batch:  60%|██████    | 286/475 [03:00<02:24,  1.31it/s][A
batch:  60%|██████    | 287/475 [03:00<02:20,  1.34it/s][A
batch:  61%|██████    | 288/475 [03:01<02:14,  1.39it/s][A
batch:  61%|██████    | 289/475 [03:02<0

batch:  86%|████████▌ | 409/475 [04:24<00:43,  1.51it/s][A
batch:  86%|████████▋ | 410/475 [04:25<00:42,  1.51it/s][A
batch:  87%|████████▋ | 411/475 [04:26<00:44,  1.44it/s][A
batch:  87%|████████▋ | 412/475 [04:26<00:43,  1.44it/s][A
batch:  87%|████████▋ | 413/475 [04:27<00:42,  1.46it/s][A
batch:  87%|████████▋ | 414/475 [04:28<00:43,  1.42it/s][A
batch:  87%|████████▋ | 415/475 [04:28<00:42,  1.42it/s][A
batch:  88%|████████▊ | 416/475 [04:29<00:42,  1.37it/s][A
batch:  88%|████████▊ | 417/475 [04:30<00:41,  1.40it/s][A
batch:  88%|████████▊ | 418/475 [04:31<00:40,  1.42it/s][A
batch:  88%|████████▊ | 419/475 [04:31<00:39,  1.41it/s][A
batch:  88%|████████▊ | 420/475 [04:32<00:40,  1.35it/s][A
batch:  89%|████████▊ | 421/475 [04:33<00:39,  1.38it/s][A
batch:  89%|████████▉ | 422/475 [04:33<00:37,  1.43it/s][A
batch:  89%|████████▉ | 423/475 [04:34<00:37,  1.39it/s][A
batch:  89%|████████▉ | 424/475 [04:35<00:36,  1.41it/s][A
batch:  89%|████████▉ | 425/475 [04:36<0

Train loss: 3.0982445480949



batch:   0%|          | 1/475 [00:00<05:23,  1.46it/s][A
batch:   0%|          | 2/475 [00:01<05:20,  1.48it/s][A
batch:   1%|          | 3/475 [00:01<05:15,  1.50it/s][A
batch:   1%|          | 4/475 [00:02<05:12,  1.50it/s][A
batch:   1%|          | 5/475 [00:03<05:37,  1.39it/s][A
batch:   1%|▏         | 6/475 [00:04<05:33,  1.40it/s][A
batch:   1%|▏         | 7/475 [00:04<05:24,  1.44it/s][A
batch:   2%|▏         | 8/475 [00:05<05:16,  1.47it/s][A
batch:   2%|▏         | 9/475 [00:06<05:11,  1.50it/s][A
batch:   2%|▏         | 10/475 [00:06<05:14,  1.48it/s][A
batch:   2%|▏         | 11/475 [00:07<05:16,  1.47it/s][A
batch:   3%|▎         | 12/475 [00:08<05:16,  1.46it/s][A
batch:   3%|▎         | 13/475 [00:08<05:12,  1.48it/s][A
batch:   3%|▎         | 14/475 [00:09<05:09,  1.49it/s][A
batch:   3%|▎         | 15/475 [00:10<05:08,  1.49it/s][A
batch:   3%|▎         | 16/475 [00:10<05:07,  1.49it/s][A
batch:   4%|▎         | 17/475 [00:11<05:08,  1.48it/s][A
batch

batch:  29%|██▉       | 139/475 [01:36<04:23,  1.27it/s][A
batch:  29%|██▉       | 140/475 [01:37<04:24,  1.27it/s][A
batch:  30%|██▉       | 141/475 [01:38<04:16,  1.30it/s][A
batch:  30%|██▉       | 142/475 [01:39<04:04,  1.36it/s][A
batch:  30%|███       | 143/475 [01:39<04:04,  1.36it/s][A
batch:  30%|███       | 144/475 [01:40<04:07,  1.34it/s][A
batch:  31%|███       | 145/475 [01:41<04:04,  1.35it/s][A
batch:  31%|███       | 146/475 [01:41<04:01,  1.36it/s][A
batch:  31%|███       | 147/475 [01:42<04:07,  1.32it/s][A
batch:  31%|███       | 148/475 [01:43<04:14,  1.28it/s][A
batch:  31%|███▏      | 149/475 [01:44<04:13,  1.29it/s][A
batch:  32%|███▏      | 150/475 [01:45<04:04,  1.33it/s][A
batch:  32%|███▏      | 151/475 [01:45<04:11,  1.29it/s][A
batch:  32%|███▏      | 152/475 [01:46<04:13,  1.28it/s][A
batch:  32%|███▏      | 153/475 [01:47<04:03,  1.32it/s][A
batch:  32%|███▏      | 154/475 [01:48<04:01,  1.33it/s][A
batch:  33%|███▎      | 155/475 [01:48<0

KeyboardInterrupt: 

In [21]:
df2.comments.values.shape

(30000,)

In [22]:
# df2 = df2.iloc[:1000]

In [23]:
for i in range(df2.comments.values.shape[0]):
    df2.comments.values[i] = df2.comments.values[i][:512]

In [24]:
len(sentences)

1000

In [25]:
# Create sentence and label lists
sentences = df2.comments.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]

In [26]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 



In [27]:
input_ids.shape

(30000, 512)

In [28]:
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
# prediction_labels = torch.tensor(labels)
  
batch_size = 32


prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [30]:
# Prediction on test set

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions = []

# Predict 
for batch in tqdm(prediction_dataloader,desc='batch',leave=False):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = output[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()

    # Store predictions and true labels
    predictions.append(logits)

batch:  41%|████      | 381/938 [07:08<10:58,  1.18s/it]

KeyboardInterrupt: 

In [31]:
main_preds = []
for i in range(len(predictions)):
    main_preds += list(np.argmax(predictions[i], axis=1))
    
print(len(main_preds))

12192


In [32]:
main_preds

[19,
 16,
 1,
 16,
 4,
 14,
 10,
 6,
 4,
 16,
 4,
 12,
 13,
 17,
 10,
 14,
 18,
 8,
 19,
 14,
 5,
 17,
 15,
 19,
 15,
 4,
 3,
 12,
 8,
 13,
 1,
 9,
 1,
 14,
 12,
 12,
 10,
 9,
 19,
 17,
 0,
 12,
 16,
 19,
 12,
 1,
 17,
 15,
 4,
 11,
 2,
 6,
 17,
 5,
 10,
 17,
 18,
 13,
 16,
 0,
 8,
 4,
 4,
 13,
 17,
 7,
 1,
 4,
 13,
 9,
 2,
 13,
 6,
 8,
 17,
 17,
 16,
 0,
 4,
 6,
 14,
 17,
 2,
 8,
 17,
 11,
 6,
 11,
 0,
 19,
 13,
 18,
 17,
 17,
 6,
 15,
 17,
 8,
 4,
 0,
 8,
 8,
 19,
 15,
 8,
 11,
 17,
 11,
 2,
 16,
 3,
 19,
 14,
 1,
 11,
 18,
 9,
 17,
 17,
 3,
 10,
 14,
 10,
 16,
 16,
 15,
 15,
 13,
 2,
 18,
 19,
 0,
 10,
 9,
 18,
 18,
 13,
 0,
 8,
 3,
 12,
 16,
 3,
 4,
 16,
 11,
 4,
 9,
 12,
 16,
 3,
 13,
 3,
 10,
 19,
 17,
 12,
 5,
 17,
 6,
 6,
 7,
 4,
 11,
 14,
 12,
 15,
 10,
 18,
 9,
 8,
 0,
 15,
 17,
 7,
 3,
 12,
 12,
 5,
 0,
 15,
 5,
 10,
 10,
 5,
 0,
 4,
 17,
 16,
 11,
 17,
 10,
 2,
 6,
 16,
 4,
 2,
 9,
 12,
 11,
 10,
 4,
 19,
 17,
 19,
 10,
 9,
 8,
 12,
 19,
 18,
 4,
 13,
 2,
 6,
 10,
 10,
 19,

In [46]:
test_preds = pd.DataFrame()
test_preds['Id'] = df2['id']
test_preds['Category'] = mapping[main_preds]

In [47]:
test_preds

Unnamed: 0,Id,Category
0,0,baseball
1,1,europe
2,2,anime
3,3,worldnews
4,4,funny
...,...,...
29995,29995,movies
29996,29996,movies
29997,29997,Overwatch
29998,29998,gameofthrones


In [48]:
test_preds.to_csv("test2.csv", index=False)
from IPython.display import FileLink, FileLinks
FileLink('test2.csv')

In [20]:
model.load_state_dict(a.state_dict())

<All keys matched successfully>

In [19]:
a = torch.load('xlnet_model.pt')