# Imports

In [1]:
import re, unicodedata, numpy as np, pandas as pd, pickle, os, math

from numpy import random
from google.colab import files

from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Download from kaggle

In [None]:
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

Saving kaggle.json to kaggle.json


In [None]:
! chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d mswarbrickjones/reddit-selfposts

Downloading reddit-selfposts.zip to /content
 98% 345M/352M [00:04<00:00, 97.4MB/s]
100% 352M/352M [00:04<00:00, 86.4MB/s]


In [None]:
! unzip reddit-selfposts.zip -d data

Archive:  reddit-selfposts.zip
  inflating: data/rspct.tsv          
  inflating: data/subreddit_info.csv  


# Data Processing Functions

In [2]:
def preprocess_text(text):
    text = text.lower()
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    text = re.sub(r'[^A-Za-z\'\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

In [None]:
def get_data(path):
  print('Getting data')
  raw_df = pd.read_table(path+'rspct.tsv')
  sub_df = pd.read_csv(path+'subreddit_info.csv')

  print('data read in. starting dataframe join:')

  df = pd.merge(raw_df, sub_df, how='inner',on='subreddit')\
            [['category_1','selftext']]\

  print('dataframes joined. starting text preprocessing:')

  df.selftext = df.selftext.apply(preprocess_text)

  df.category_1 = df.category_1.apply(lambda x: x.replace('/','&').replace(' ','').replace(' lb',''))

  print('Text processed.')
  return df


In [7]:
cats = ['writing&stories',
'tv_show',
'autos',
'hardware&tools',
'electronics']
# 'video_game',
# 'crypto',
# 'sports',
# 'hobby',
# 'appearance'
# ,
# 'card_game',
# 'drugs',
# 'advice&question',
# 'social_group',
# 'anime&manga',
# 'sex&relationships',
# 'software',
# 'health',
# 'animals',
# 'arts',
# 'programming',
# 'rpg',
# 'books',
# 'parenting',
# 'education',
# 'company&website',
# 'profession',
# 'music',
# 'politics&viewpoint',
# 'stem',
# 'travel',
# 'geo',
# 'religion&supernatural',
# 'board_game',
# 'movies',
# 'food&drink',
# 'finance&money',
# 'meta']

# Set paths

In [3]:
raw_data_path = 'data/'
drive_path = '/content/gdrive/MyDrive/CIS 522 Final Project/'

# Writing data to directory

In [None]:
def prep_data(df, drive_path):
  print('Saving processed data:')

  num_cats = len(cats)

  for i, cat in enumerate(cats):
    print('Category {}/{}:{}'.format(i,num_cats,cat))

    data = df[df.category_1 == cat].selftext

    data.to_csv(drive_path + 'data/' + cat + '.csv',index=False,header=True)
  print('Done')

In [None]:
df = get_data(raw_data_path)
prep_data(df, drive_path)

Getting data
data read in. starting dataframe join:
dataframes joined. starting text preprocessing:
Text processed.
Saving processed data:
Category 0/5:writing&stories
Category 1/5:tv_show
Category 2/5:autos
Category 3/5:hardware&tools
Category 4/5:electronics
Done


# Defining training

In [4]:
#Downloading proper software
!pip install transformers
!pip install datasets

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 19.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 52.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 49.0MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting datasets
[?25l  Downloading https://files.pythonhoste

In [5]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling,AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import Dataset

def train_models(cats, drive_path):
  print('Training models:')
 
  num_cats = len(cats)

  for i, cat in enumerate(cats, 1):
    print('Category {}/{}:{}'.format(i,num_cats,cat))
    tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained('distilgpt2').to('cuda')
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


    data = Dataset.from_csv(drive_path + 'data/'+ cat + '.csv', split='train')

    def encode(batch): 
      return tokenizer([x[:500] for x in batch['selftext']], truncation=True, padding=True)

    processed = data.map(encode, batched=True, with_indices= False, batch_size=len(data))
    processed.set_format('torch', columns=['input_ids', 'attention_mask'])

    training_args = TrainingArguments(
        output_dir='./content/',
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        logging_steps=100,
        weight_decay=0.01,
        logging_dir='./logs/'
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        data_collator=data_collator,
        train_dataset=processed,
    )

    trainer.train()
    trainer.save_model(drive_path + 'GPT_generators/' + cat + '/')

# Training models

In [None]:
train_models([cats[0]],drive_path)

Training models:
Category 1/1:writing&stories


Using custom data configuration default-447a5a1dae6b9d9d
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-447a5a1dae6b9d9d/0.0.0)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-447a5a1dae6b9d9d/0.0.0/cache-e479a7c8abf6731f.arrow


Step,Training Loss
100,4.5883
200,4.4426
300,4.3683
400,4.3165
500,4.3062
600,4.2911
700,4.2875
800,4.2361
900,4.2189
1000,4.2425


In [None]:
train_models(cats,drive_path)

Training models:
Category 1/5:writing&stories


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=352833716.0, style=ProgressStyle(descri…




Using custom data configuration default-447a5a1dae6b9d9d


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-447a5a1dae6b9d9d/0.0.0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-447a5a1dae6b9d9d/0.0.0. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Step,Training Loss
100,4.5896
200,4.4454
300,4.3737
400,4.3245
500,4.3168
600,4.3044
700,4.3068
800,4.2592
900,4.2508
1000,4.2818


Category 2/5:tv_show


Using custom data configuration default-c83b27eec64cf0dd


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-c83b27eec64cf0dd/0.0.0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-c83b27eec64cf0dd/0.0.0. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Step,Training Loss
100,4.6422
200,4.4975
300,4.4452
400,4.3957
500,4.3808
600,4.3914
700,4.3482
800,4.3406
900,4.3362
1000,4.3264


Category 3/5:autos


Using custom data configuration default-5ac6b40addd60e02


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-5ac6b40addd60e02/0.0.0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-5ac6b40addd60e02/0.0.0. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Step,Training Loss
100,4.5297
200,4.3163
300,4.23
400,4.193
500,4.1541
600,4.1478
700,4.1182
800,4.1185
900,4.1052
1000,4.0924


Category 4/5:hardware&tools


Using custom data configuration default-654bb8bb35167114


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-654bb8bb35167114/0.0.0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-654bb8bb35167114/0.0.0. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Step,Training Loss
100,4.6058
200,4.4197
300,4.3421
400,4.2671
500,4.2831
600,4.2316
700,4.2218
800,4.2309
900,4.1876
1000,4.1261


Category 5/5:electronics


Using custom data configuration default-d437147d7628a3ee


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-d437147d7628a3ee/0.0.0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-d437147d7628a3ee/0.0.0. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Step,Training Loss
100,4.5788
200,4.4027
300,4.3312
400,4.2718
500,4.2838
600,4.2327
700,4.2257
800,4.2144
900,4.2077
1000,4.1867


In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


# Text generation

In [None]:
for cat in cats[:10]:
  print('Category:',cat)
  mdl = pipeline('text-generation', 
                  model=drive_path + 'GPT_generators/' + cat + '/', 
                  device=0)
  print('Text:',mdl('What I think ')[0]['generated_text'],'\n')

Category: writing&stories


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Text: What I think ive read so far is a fantastic description of the project i'm really looking forward to doing lb lb i'm looking for a writer to fill the role of the girl lb lb lb if you have any other ideas in mind please drop 

Category: tv_show


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Text: What I think ive heard has been a relatively new phenomenon i've been following for awhile and i feel like the rise in frequency of stories on the web and in my experience a lot of people want to share in the world lb lb for example the 

Category: autos


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Text: What I think ive been driving a lot ever since the 's was an excellent toyota for years and a few times i had a hard time getting past the clutches and it seemed obvious that the gs needed for larger vehicles like the i 

Category: hardware&tools


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Text: What I think ive just made a mistake i'm planning on replacing the scooter i've owned since i was a kid but im looking for something that's both fun and fun and i need to carry a scooter i've never had a sc 

Category: electronics


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Text: What I think ive tried and tried the zx gopro and the xy and ive tried them as a side note all my favorite features were the motion detection for the xy and the motion detection for the xy and xy 



#Perplexity Calculation

In [83]:
import torch
import numpy as np
text = "I just finished Scandal and I'm looking for a new show."
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token
    
for cat in cats:
  print("Category:",cat)
  model = AutoModelForCausalLM.from_pretrained(drive_path + 'GPT_generators/' + cat + '/')
  encodings = tokenizer(text, return_tensors='pt')

  max_length = model.config.n_positions
  stride = 512

  lls = []
  for i in range(0, encodings.input_ids.size(1), stride):
      begin_loc = max(i + stride - max_length, 0)
      end_loc = min(i + stride, encodings.input_ids.size(1))
      trg_len = end_loc - i    # may be different from stride on last loop
      input_ids = encodings.input_ids[:,begin_loc:end_loc]
      target_ids = input_ids.clone()
      target_ids[:,:-trg_len] = -100

      with torch.no_grad():
          outputs = model(input_ids, labels=target_ids)
          log_likelihood = outputs[0] * trg_len

      lls.append(log_likelihood)

  ppl = torch.exp(torch.stack(lls).sum() / end_loc)
  print(ppl.item(),'\n')

Category: writing&stories
254.6935272216797 

Category: tv_show
51.117374420166016 

Category: autos
177.2567901611328 

Category: hardware&tools
122.44670104980469 

Category: electronics
115.01026153564453 

