In [1]:
import os
from pathlib import Path
import re

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from models import AttentionLSTM, BertForClassification, TextCNN, XLNetForClassification
from utils.preprocess import load_emb_matrix, Tokenizer, GloveTokenizer, WordPieceTokenizer
from utils.common import seed_everything, init_logger, to_device
import warnings
warnings.simplefilter("ignore")

In [2]:
def load_ceo_datasets(ceo_dir):
    ceo_csvs = ceo_dir.iterdir()
    for i, csv in enumerate(ceo_csvs):
        ceo_file = pd.read_csv(csv, encoding="euc-kr").dropna(axis=0)
        yield csv.name, ceo_file


def process_df(df):
    # find ceo name
    ceo_name = ""
    for line in df[df.role=="Executives"].text:
        for role in line.split("\n"):
            if "ceo" in role.lower() or "Chief Executive Officer".lower() in role.lower():
                twoparts = role.split(" - ")
                if len(twoparts) == 2:
                    ceo_name = twoparts[0].strip()
                break
        break
    if ceo_name == "":
        for line in df[df.role=="Executives"].text:
            for role in line.split("\n"):
                if "ceo" in role.lower() or "Chief Executive Officer".lower() in role.lower():
                    twoparts = role.split("?")
                    if len(twoparts) == 2:
                        ceo_name = twoparts[0].strip()
                    break
            break
            
    if ceo_name == "":
        for line in df[df.role=="Executives"].text:
            for role in line.split("\n"):
                if "ceo" in role.lower() or "Chief Executive Officer".lower() in role.lower():
                    twoparts = role.split(" -- ")
                    if len(twoparts) == 2:
                        ceo_name = twoparts[0].strip()
                    break
            break
    if "." in ceo_name:
        ceo_name = " ".join([ceo_name.split()[0], ceo_name.split()[-1]])

    df2use = df[df.role.str.lower()==ceo_name.lower()]
    if len(df2use) == 0:
        raise ValueError("Ceo not found")
    
    df2use.companyTicker = df2use.companyTicker.str.split(":").str[-1]
    
    df2use["year"] = pd.to_datetime(df["startDate"]).dt.year.astype(str)
    # business quarter is different from natrual quarter, thus map natural quarter to business quarter
    # df2use["quarter"] = "Q" + pd.to_datetime(df["startDate"]).dt.quarter.astype(str) --error!
    # use regex to search it from title
    # search pattern F1Q10
    
    def extract_quarter(title):
        pattern1_result = re.search(r"F[1-4]Q[0-9]+", title)
        if pattern1_result is not None:
            quarter = pattern1_result.group(0)
        else:
            # search pattern Q1 Q2..
            pattern2_result = re.search(r"Q[1-4]", title)
            if pattern2_result is not None:
                quarter = pattern2_result.group(0)
            else:
                quarter = ""
        return quarter
    
    df2use["quarter"] = df2use["title"].apply(extract_quarter)
    
    df2use["year_quarter"] = df2use["year"] + df2use["quarter"]
    periods = sorted(list(set(df2use["year_quarter"].tolist())))
    outputs = []
    for p in periods:
        part = df2use[df2use["year_quarter"]==p]
        Id = part.id.tolist()[0]
        title = part.title.tolist()[0]
        year = part.year.tolist()[0]
        quarter = part.quarter.tolist()[0]
        company_name = part.companyName.tolist()[0]
        company_ticker = part.companyTicker.tolist()[0]
        description = part.description.tolist()[0]
        role = "Chief Executive Officer"
        name = ceo_name
        texts = part.text.tolist()
        text = "[SEP]".join(texts)
        outputs.append(
            (Id, title, year, quarter, company_name, company_ticker, description, role, ceo_name, text)
        )
    output_df = pd.DataFrame(outputs, columns=["id", "title", "year", "quarter", "companyName", "companyTicker", "description", "role", "ceo_name", "text"])
    return output_df
        

class CEODataset(Dataset):
    def __init__(self, 
                 csv_file: pd.DataFrame, 
                 tokenizer: Tokenizer
                ):
        super().__init__()
        self.csv_file = csv_file
        self.tokenizer = tokenizer
    
    def __getitem__(self, i):
        line = self.csv_file.iloc[i]
        sent = line.text.split("[SEP]")
        sent = ".".join([s[:150] for s in sent])
        
        input_ids, mask = self.tokenizer([sent])
        outputs = {
            "input_ids": torch.LongTensor(input_ids).squeeze(0),
            "mask": torch.LongTensor(mask).squeeze(0)
        }
        return outputs
        
    def __len__(self):
        return self.csv_file.shape[0]

def make_prediction(df, model, tokenizer, device):
    model.eval()
    with torch.no_grad():
        processed_df = process_df(df)
        dataset = CEODataset(processed_df, tokenizer)
        dataloader = DataLoader(dataset, batch_size=4)
        outputs = []
        for batch in dataloader:
            batch = to_device(batch, device)
            y_pred = model(batch["input_ids"], batch["mask"])
            outputs += torch.sigmoid(y_pred).flatten().cpu().tolist()
    output_df = processed_df.copy()
    output_df["model"] = model.__class__.__name__
    output_df["y_pred"] = outputs
    return output_df

In [3]:
glove_path = Path(os.path.curdir) / "pretrain_weight/glove.6B.200d.txt" 
emb_matrix = torch.FloatTensor(load_emb_matrix(glove_path))
ceo_dir = Path(os.path.curdir) / "raw/CEO/Dataset/2010"

In [4]:
os.path.curdir
Path(os.path.curdir)

WindowsPath('.')

In [5]:
ceo_dir = Path(os.path.curdir) / "raw/CEO/Dataset/2010"

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
attention_lstm_ckpt = str(Path(os.path.curdir) / "outputs/checkpoints/AttentionLSTM/epoch_3_stem_False_lemmatize_False.pt" )
textcnn_ckpt = str(Path(os.path.curdir) / "outputs/checkpoints/TextCNN/epoch_3_stem_False_lemmatize_False.pt")
bert_ckpt = str(Path(os.path.curdir) / "outputs/checkpoints/BertForClassification/epoch_3_stem_False_lemmatize_False.pt")
xlnet_ckpt = str(Path(os.path.curdir) / "outputs/checkpoints/XLNetForClassification/epoch_3_stem_False_lemmatize_False.pt")
    
# model = AttentionLSTM(
#     vocab_size=emb_matrix.shape[0], 
#     emb_dim=200, 
#     hidden_size=256, 
#     output_size=1,
#     num_layers=1,
#     emb_matrix=emb_matrix
# ).to(device)
# model.load_state_dict(torch.load(attention_lstm_ckpt))

# model = TextCNN(
#     vocab_size=emb_matrix.shape[0], 
#     emb_dim=200,
#     kernel_size=3,
#     out_channels=256,
#     output_size=1,
#     num_layers=1,
#     emb_matrix=emb_matrix
# ).to(device)
# model.load_state_dict(torch.load(textcnn_ckpt))
# tokenizer = GloveTokenizer(pretrain_path=glove_path, seq_len=300) # GloveTokenizer is used for AttentionLSTM and TextCNN only.

# pretrain = "bert-base-uncased"
# model = BertForClassification(
#     pretrain_dir="bert-base-uncased", output_size=1
# ).to(device)
# model.load_state_dict(torch.load(bert_ckpt))

pretrain = "xlnet-base-cased"
model = XLNetForClassification(
    pretrain_dir="xlnet-base-cased",
    output_size=1
).to(device)
model.load_state_dict(torch.load(xlnet_ckpt))

tokenizer = WordPieceTokenizer(pretrain_dir=pretrain, seq_len=300) # WordPieceTokenizer is used for XLNet and BERT because of the model structure.


100%|██████████| 760/760 [00:00<00:00, 189742.32B/s]
100%|██████████| 467042463/467042463 [01:21<00:00, 5757780.20B/s]


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [26]:
output_dir = Path(os.path.curdir) / "outputs/modified_pred"
final_df = pd.DataFrame()
for i, (df_name, df) in enumerate(load_ceo_datasets(ceo_dir)):
    if len(df) > 0:
        print(df_name)
        try:
            pred_df = make_prediction(df, model, tokenizer, device)
            final_df = final_df.append(pred_df)
            print("{} has been processed".format(df_name))
        except ValueError:
            print("CEO not Found: {}".format(df_name))

final_df.to_csv(output_dir / "{}_output.csv".format(model.__class__.__name__), index=False)

Actions Semiconductor Co., Ltd..csv
CEO not Found: Actions Semiconductor Co., Ltd..csv
Active Power, Inc..csv
Active Power, Inc..csv has been processed
Activision Blizzard, Inc..csv
Activision Blizzard, Inc..csv has been processed
ADC Telecommunications Inc..csv
ADC Telecommunications Inc..csv has been processed
Adecco Group AG.csv
Adecco Group AG.csv has been processed
Adept Technology, Inc..csv
Adept Technology, Inc..csv has been processed
adidas AG.csv
adidas AG.csv has been processed
Adobe Inc..csv
Adobe Inc..csv has been processed
Adtalem Global Education Inc..csv
Adtalem Global Education Inc..csv has been processed
ADTRAN, Inc..csv
ADTRAN, Inc..csv has been processed
Advance America, Cash Advance Centers, Inc..csv
Advance America, Cash Advance Centers, Inc..csv has been processed
Advance Auto Parts, Inc..csv
Advance Auto Parts, Inc..csv has been processed
Advanced Analogic Technologies, Inc..csv
Advanced Analogic Technologies, Inc..csv has been processed
Advanced Energy Industrie

In [None]:
# Sean Boyd - , Vice-Chairman &amp; CEO

# 