In [1]:
import re
import os
import nltk
import tensorflow as tf
import numpy as np
import pandas as pd
from nltk.corpus import reuters
import collections
from transformers import pipeline

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
documents = reuters.fileids()
train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
categories = reuters.categories()

<IPython.core.display.Javascript object>

In [3]:
title_re = "([A-Z].*[a-z]$)"


def title_split(doc):
    prev_word = ""
    for word in doc:
        if bool(re.match(title_re, word)):
            title_pos = doc.index(word)
            if prev_word == "A" or word == "USA":
                title_pos = title_pos - 1
            return " ".join(doc[:title_pos])
        prev_word = word


def words_split(doc):
    prev_word = ""
    for word in doc:
        if bool(re.match(title_re, word)):
            title_pos = doc.index(word)
            if prev_word == "A" or word == "USA":
                title_pos = title_pos - 1
            return " ".join(doc[title_pos:])
        prev_word = word

<IPython.core.display.Javascript object>

In [4]:
train_list = [int(x.split("/")[1]) for x in train_docs]
cat_list = [reuters.categories(x) for x in train_docs]
title_list = [title_split(reuters.words(x)) for x in train_docs]
words_list = [words_split(reuters.words(x)) for x in train_docs]

<IPython.core.display.Javascript object>

In [5]:
news_df = pd.DataFrame(
    {
        "DocID": train_list,
        "Categories": cat_list,
        "Title": title_list,
        "Content": words_list,
    }
).sort_values(by="DocID")

<IPython.core.display.Javascript object>

In [6]:
def title_split(row):
    split_word = row["Content"].split()[0]
    title_new = split_word + row["Content"].split(split_word)[1]
    row["Title"] = title_new
    row["Content"] = title_new
    return row

<IPython.core.display.Javascript object>

In [7]:
title_split_df = news_df[news_df["Title"] == ""].apply(title_split, axis=1)
cols = list(news_df.columns)
news_df.loc[news_df.DocID.isin(title_split_df.DocID), cols] = title_split_df[cols]

<IPython.core.display.Javascript object>

In [9]:
new_cats = [
    "acuisition",
    "aluminum",
    "bottom of the pyramid",
    "barley",
    "castor oil",
    "carcass",
    "cattle",
    "cocoa",
    "coconut",
    "coconut oil",
    "coffee",
    "consumer price index",
    "copper",
    "copra cake",
    "corn",
    "cotton",
    "computershare",
    "crude",
    "deutsche fussball liga",
    "digital realty trust",
    "dravida munnetra kazhagam",
    "ellington residential mortgage reit stock",
    "fuel",
    "gas",
    "gold",
    "grain",
    "groundnut",
    "ground nut oil",
    "grupo nacional provincial",
    "heat",
    "hog",
    "housing",
    "income",
    "instalco intressenter debt",
    "interest",
    "intrepid potash",
    "iron steel",
    "jet",
    "jobs",
    "lead",
    "leading economic index",
    "linseed oil",
    "livestock",
    "lumber",
    "meal feed",
    "money foreign exchange",
    "money supply",
    "natural gas",
    "naphtha",
    "nickel",
    "nokia",
    "new zealand dollar",
    "oat",
    "orange",
    "palladium",
    "palm oil",
    "palm kernel",
    "petronas chemicals",
    "platinum",
    "potato",
    "propane",
    "rand",
    "rapeseed",
    "rapeseed oil",
    "reserves",
    "retail",
    "rice",
    "rubber",
    "rye",
    "ship",
    "silver",
    "sorghum",
    "soybean",
    "soy meal",
    "soy oil",
    "strategic metal",
    "sugar",
    "sunflower meal",
    "sunflower oil",
    "sunflower seed",
    "tea",
    "tin",
    "trade",
    "vagetable oil",
    "waterfront philippines",
    "wheat",
    "yen",
    "zinc",
]

<IPython.core.display.Javascript object>

In [10]:
classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at roberta-large-mnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


<IPython.core.display.Javascript object>

In [11]:
news_df["top_label"] = news_df["Categories"].apply(lambda x: x[0])
news_df["title_length"] = news_df["Title"].apply(lambda x: len(x) if x != None else 0)
news_df["con_length"] = news_df["Content"].apply(lambda x: len(x) if x != None else 0)

<IPython.core.display.Javascript object>

In [12]:
news_filt_df = news_df[news_df["title_length"] > 40]
sample_df = (
    news_filt_df.groupby("top_label")
    .apply(lambda x: x.sample(15, replace=True))
    .reset_index(drop=True)
)
sample_df = sample_df.drop(columns=["Categories"]).drop_duplicates()

<IPython.core.display.Javascript object>

In [13]:
class_title_list = list(sample_df["Title"])

<IPython.core.display.Javascript object>

In [14]:
clfd_dict_list2 = [
    classifier(title, new_cats, multi_label=True) for title in class_title_list
]
categorized_df = pd.DataFrame.from_records(clfd_dict_list2)
categorized_df.to_pickle("C:/Users/rparg/Documents/Data/Reuters/categorized2.pkl")

<IPython.core.display.Javascript object>

In [13]:
news_con_filt_df = news_df[
    (news_df["con_length"] > 200) & (news_df["con_length"] < 1000)
]

<IPython.core.display.Javascript object>

In [14]:
sample_con_df = (
    news_con_filt_df.groupby("top_label")
    .apply(lambda x: x.sample(4, replace=True))
    .reset_index(drop=True)
)
sample_con_df = sample_con_df.drop(columns=["Categories"]).drop_duplicates()

<IPython.core.display.Javascript object>

In [15]:
class_con_list = list(sample_con_df["Content"])

<IPython.core.display.Javascript object>

In [16]:
clfd_dict_list2 = [
    classifier(title, new_cats, multi_label=True) for title in class_con_list
]
categorized_df = pd.DataFrame.from_records(clfd_dict_list2)
categorized_df.to_pickle("C:/Users/rparg/Documents/Data/Reuters/con_cat.pkl")

<IPython.core.display.Javascript object>

In [17]:
test_df = pd.read_pickle("C:/Users/rparg/Documents/Data/Reuters/con_cat.pkl")
test_df

Unnamed: 0,sequence,labels,scores
0,United Medical Corp said it has reached a defi...,"[acuisition, trade, interest, crude, lead, res...","[0.8082118630409241, 0.6202386021614075, 0.555..."
1,Scan - Graphics Inc > said it has completed a ...,"[interest, reserves, jobs, rye, trade, acuisit...","[0.9111109972000122, 0.7634175419807434, 0.699..."
2,Allwaste Inc said it entered into an agreement...,"[acuisition, interest, reserves, crude, trade,...","[0.8057773113250732, 0.521743893623352, 0.3815..."
3,Waste Management Inc said it amended its offer...,"[reserves, jobs, jet, coconut, trade, ship, ac...","[0.6777425408363342, 0.6480847001075745, 0.631..."
4,Non - Communist daily average unwrought alumin...,"[aluminum, interest, leading economic index, h...","[0.7101340889930725, 0.631232500076294, 0.2816..."
...,...,...,...
201,Belgian wholesale prices fell by 5 . 9 pct in ...,"[retail, reserves, leading economic index, tra...","[0.5760638117790222, 0.5740767121315002, 0.568..."
202,The wholesale price index rose 1 . 1 pct month...,"[leading economic index, reserves, trade, new ...","[0.03100494109094143, 0.018080731853842735, 0...."
203,Producers zinc stocks in Organization for Econ...,"[zinc, interest, strategic metal, leading econ...","[0.9961469173431396, 0.9147019982337952, 0.634..."
204,Hudson Bay Mining and Smelting Co Ltd said it ...,"[zinc, strategic metal, retail, interest, jobs...","[0.99750816822052, 0.8938364386558533, 0.83465..."


<IPython.core.display.Javascript object>