In [1]:
import re
import os
import nltk
import tensorflow as tf
import numpy as np
import pandas as pd
from nltk.corpus import reuters
import collections
from transformers import pipeline

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
documents = reuters.fileids()
train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
categories = reuters.categories()

<IPython.core.display.Javascript object>

In [3]:
title_re = "([A-Z].*[a-z]$)"


def title_split(doc):
    prev_word = ""
    for word in doc:
        if bool(re.match(title_re, word)):
            title_pos = doc.index(word)
            if prev_word == "A" or word == "USA":
                title_pos = title_pos - 1
            return " ".join(doc[:title_pos])
        prev_word = word


def words_split(doc):
    prev_word = ""
    for word in doc:
        if bool(re.match(title_re, word)):
            title_pos = doc.index(word)
            if prev_word == "A" or word == "USA":
                title_pos = title_pos - 1
            return " ".join(doc[title_pos:])
        prev_word = word

<IPython.core.display.Javascript object>

In [4]:
train_list = [int(x.split("/")[1]) for x in train_docs]
cat_list = [reuters.categories(x) for x in train_docs]
title_list = [title_split(reuters.words(x)) for x in train_docs]
words_list = [words_split(reuters.words(x)) for x in train_docs]

<IPython.core.display.Javascript object>

In [12]:
news_df = pd.DataFrame(
    {
        "DocID": train_list,
        "Categories": cat_list,
        "Title": title_list,
        "Content": words_list,
    }
).sort_values(by="DocID")

<IPython.core.display.Javascript object>

In [13]:
def title_split(row):
    split_word = row["Content"].split()[0]
    title_new = split_word + row["Content"].split(split_word)[1]
    row["Title"] = title_new
    row["Content"] = title_new
    return row

<IPython.core.display.Javascript object>

In [14]:
title_split_df = news_df[news_df["Title"] == ""].apply(title_split, axis=1)
cols = list(news_df.columns)
news_df.loc[news_df.DocID.isin(title_split_df.DocID), cols] = title_split_df[cols]

<IPython.core.display.Javascript object>

In [8]:
cat_dict = {
    "acq": "acuisition",
    "alu": "aluminum",
    "bop": "bottom of the pyramid",
    "castor-oil": "castor oil",
    "carcass": "livestock",
    "cpu": "computershare",
    "coconut-oil": "coconut oil",
    "copra-cake": "copra cake",
    "cpi": "consumer price index",
    "dfl": "deutsche fussball liga",
    "dlr": "digital realty trust",
    "dmk": "dravida munnetra kazhagam",
    "earn": "ellington residential mortgage reit stock",
    "gnp": "grupo nacional provincial",
    "groundnut-oil": "ground nut oil",
    "instal-debt": "instalco intressenter debt",
    "ipi": "intrepid potash",
    "iron-steel": "iron steel",
    "l-cattle": "cattle",
    "lei": "leading economic index",
    "lin-oil": "linseed oil",
    "meal-feed": "meal feed",
    "money-fx": "money foreign exchange",
    "money-supply": "money supply",
    "nat-gas": "natural gas",
    "nkr": "nokia",
    "palm-oil": "palm oil",
    "palmkernel": "palm kernel",
    "pet-chem": "petronas chemicals",
    "rape-oil": "rapeseed oil",
    "soy-meal": "soy meal",
    "soy-oil": "soy oil",
    "strategic-metal": "strategic metal",
    "sun-meal": "sunflower meal",
    "sun-oil": "sunflower oil",
    "sunseed": "sunflower seed",
    "veg-oil": "vagetable oil",
    "wpi": "waterfront philippines",
}

<IPython.core.display.Javascript object>

In [15]:
new_cats = [
    "acuisition",
    "aluminum",
    "bottom of the pyramid",
    "barley",
    "castor oil",
    "carcass",
    "cattle",
    "cocoa",
    "coconut",
    "coconut oil",
    "coffee",
    "consumer price index",
    "copper",
    "copra cake",
    "corn",
    "cotton",
    "computershare",
    "crude",
    "deutsche fussball liga",
    "digital realty trust",
    "dravida munnetra kazhagam",
    "ellington residential mortgage reit stock",
    "fuel",
    "gas",
    "gold",
    "grain",
    "groundnut",
    "ground nut oil",
    "grupo nacional provincial",
    "heat",
    "hog",
    "housing",
    "income",
    "instalco intressenter debt",
    "interest",
    "intrepid potash",
    "iron steel",
    "jet",
    "jobs",
    "lead",
    "leading economic index",
    "linseed oil",
    "livestock",
    "lumber",
    "meal feed",
    "money foreign exchange",
    "money supply",
    "natural gas",
    "naphtha",
    "nickel",
    "nokia",
    "new zealand dollar",
    "oat",
    "orange",
    "palladium",
    "palm oil",
    "palm kernel",
    "petronas chemicals",
    "platinum",
    "potato",
    "propane",
    "rand",
    "rapeseed",
    "rapeseed oil",
    "reserves",
    "retail",
    "rice",
    "rubber",
    "rye",
    "ship",
    "silver",
    "sorghum",
    "soybean",
    "soy meal",
    "soy oil",
    "strategic metal",
    "sugar",
    "sunflower meal",
    "sunflower oil",
    "sunflower seed",
    "tea",
    "tin",
    "trade",
    "vagetable oil",
    "waterfront philippines",
    "wheat",
    "yen",
    "zinc",
]

<IPython.core.display.Javascript object>

In [10]:
classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at roberta-large-mnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


<IPython.core.display.Javascript object>

In [39]:
news_df["top_label"] = news_df["Categories"].apply(lambda x: x[0])
news_df["title_length"] = news_df["Title"].apply(lambda x: len(x) if x != None else 0)

<IPython.core.display.Javascript object>

In [42]:
news_filt_df = news_df[news_df["title_length"] > 40]
sample_df = (
    news_filt_df.groupby("top_label")
    .apply(lambda x: x.sample(3, replace=True))
    .reset_index(drop=True)
)
sample_df = sample_df.drop(columns=["Categories"]).drop_duplicates()

<IPython.core.display.Javascript object>

In [43]:
class_title_list = list(news_df["Title"])

Unnamed: 0,DocID,Title,Content,top_label,title_length
0,12225,UPLAND MINERALS ACQUIRES BRAZIL MINING FIRM,Upland Minerals and Chemicals Corp said it agr...,acq,43
1,4204,GUARDIAN - MORTON SHULMAN AGREES TO TAKEOVER B...,Guardian - Morton Shulman Precious Metals Inc ...,acq,55
2,741,FOOTE MINERAL & lt ; FTE > SELLS CAMBRIDGE PLANT,Foote Mineral Co said it signed a letter of in...,acq,48
3,9738,INDIAN PLANT SIGNS FIRST ALUMINA EXPORT CONTRACT,"An unnamed Norwegian firm agreed to buy 100 , ...",alum,48
4,10593,NEW LME ALUMINIUM CONTRACT WELCOMED BY TRADE,"The London Metal Exchange ' s , LME , decision...",alum,44
...,...,...,...,...,...
212,1211,OILS / FATS STOCKS SEEN FALLING SHARPLY IN 198...,Visible stocks of 17 oils and fats are probabl...,veg-oil,52
213,4787,NORWAY ' S WHOLESALE PRICES RISE 0 . 5 PCT IN ...,Norway ' s wholesale price index ( base 1981 )...,wpi,54
214,13232,SOUTH KOREAN WHOLESALE PRICES UP 0 . 2 PCT IN ...,"South Korea ' s wholesale price index , base 1...",wpi,51
216,3163,"HUDSON BAY MINING CUTS U . S ., CANADA ZINC PR...",Hudson Bay Mining and Smelting Co Ltd said it ...,zinc,50


<IPython.core.display.Javascript object>

In [None]:
clfd_dict_list = [
    classifier(title, new_cats, multi_label=True) for title in class_title_list
]
categorized_df = pd.DataFrame.from_records(clfd_dict_list)
categorized_df.to_pickle("C:/Users/rparg/Documents/Data/Reuters/categorized.pkl")