In [1]:
from pathlib import Path
import os
import sys
import gc
import re
import shutil
import json
import math
import jinja2
from collections import defaultdict
import numpy as np
import pandas as pd
import bitsandbytes
import accelerate
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, NamedTuple, Callable, Iterable, Set, Optional, Any
import scml
from scml import pandasx as pdx
import lalaes2 as mylib
# Record the versions of accelerate and bitsandbytes in the notebook output
# so the run can be matched to its library environment later.
print(f"accelerate={accelerate.__version__}, bitsandbytes={bitsandbytes.__version__}")

accelerate=0.30.1, bitsandbytes=0.43.1


In [2]:
# --- Run configuration -------------------------------------------------------
# `version` and `corpus_key` together form the name of the parquet file
# written at the end of the notebook.
version = "02"
corpus_key = "comp"

# Available corpora, keyed by the values `corpus_key` may take.
corpus_map = {
    "comp": Path("input/train.csv"),
    "persuade": Path("input/persuade20/persuade_2.0_human_scores_demo_id_github.csv"),
}

# Text preprocessors from the project library: `basic` is applied to the essay
# text, `bow` (with stop-word dropping enabled) to the topic label.
basic = mylib.BasicPreprocessor()
bow = mylib.BowPreprocessor(drop_stopword=True)

# model for pseudo labelling Persuade 2.0 topic
directory = Path("models/persuade/deberta_v3_base/20240626_132806")
# Passed to the tokenizer as its `model_max_length`.
model_max_length = 512

In [3]:
# Start the timer first; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()

# Percentile grid kept for ad-hoc describe() calls (not used in the cells shown here).
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# NOTE(review): presumably silences the HF tokenizers fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Lift the pandas display limits so wide frames and long essays print in full.
for option_name in (
    "max_info_columns",
    "display.max_columns",
    "display.max_rows",
    "max_colwidth",
):
    pd.set_option(option_name, 9999)

# Registers `progress_apply` on pandas objects (used by the preprocessing cells).
tqdm.pandas()
scml.seed_everything()

In [4]:
# Pick the compute device for inference.
# The second GPU (index 1) is preferred, but a hard-coded "cuda:1" is invalid on
# a single-GPU machine and only fails later when tensors are moved, so fall
# back to GPU 0 when index 1 does not exist.
preferred_gpu_index = 1
device = torch.device("cpu")
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    gpu_index = preferred_gpu_index if preferred_gpu_index < gpu_count else 0
    device = torch.device(f"cuda:{gpu_index}")
    # Report every visible GPU together with its current memory usage.
    for i in range(gpu_count):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
else:
    print("cpu")

device=0, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB
device=1, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [5]:
# Read the corpus selected by `corpus_key` and print its schema.
corpus_path = corpus_map[corpus_key]
df = pd.read_csv(corpus_path, low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   17307 non-null  object
 1   full_text  17307 non-null  object
 2   score      17307 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 405.8+ KB


In [6]:
# Persuade 2.0 uses different column names; map them onto the names the rest
# of the notebook expects.
persuade_column_names = {
    "essay_id_comp": "essay_id",
    "holistic_essay_score": "score",
    "prompt_name": "topic",
    "assignment": "prompt",
}
if corpus_key == "persuade":
    df = df.rename(columns=persuade_column_names)

# Store the score column as int8.
cols = ["score"]
df[cols] = df[cols].astype(np.int8)

In [7]:
def preprocess_text(fn, col) -> Callable:
    """Build a row-level adapter around a single-value preprocessor.

    Args:
        fn: callable applied to one field value.
        col: key used to look the field up in each row.

    Returns:
        A function that takes a row (anything indexable by ``col``) and
        returns ``fn(row[col])``.
    """

    def apply_to_row(record) -> str:
        field_value = record[col]
        return fn(field_value)

    return apply_to_row


# Run the `basic` preprocessor over each listed text column.
# The transform only needs the column's own value, so it is applied
# column-wise (Series.progress_apply) instead of row-wise over the whole
# frame (DataFrame.progress_apply(axis=1)), which builds a Series object for
# every row just to read one field. The resulting values are the same.
cols = ["full_text"]
for col in cols:
    print(col)
    df[col] = df[col].progress_apply(basic)

full_text


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17307/17307 [00:05<00:00, 3029.10it/s]


# Pseudo Labelling Topic

Classify each essay into one of the 15 topics from the Persuade 2.0 corpus.

In [8]:
if corpus_key == "comp":
    # Load the tokenizer from the same directory as the topic classifier,
    # capping sequences at `model_max_length`.
    tokenizer = AutoTokenizer.from_pretrained(
        directory,
        model_max_length=model_max_length,
    )
    # Summarise the tokenizer together with its input names and padding id.
    tokenizer_summary = f"""{tokenizer}
    model_input_names={tokenizer.model_input_names}
    pad_token_id={tokenizer.pad_token_id}
    """
    print(tokenizer_summary)

DebertaV2TokenizerFast(name_or_path='models/persuade/deberta_v3_base/20240626_132806', vocab_size=128000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
    model_input_names=['input_ids', 'token_type_ids



In [9]:
if corpus_key == "comp":
    # Load the fine-tuned topic classifier from `directory`.
    model = AutoModelForSequenceClassification.from_pretrained(directory)
    # Print the architecture, then its configuration, one after the other.
    print(model, model.config, sep="\n")

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [10]:
if corpus_key == "comp":
    # Wrap the essay texts in the dataset type the classifier helper expects.
    essay_dataset = mylib.PersuadeDataset(
        tokenizer=tokenizer,
        texts=df["full_text"].tolist(),
    )
    # NOTE(review): presumably returns one row of topic logits per essay — confirm in mylib.
    topic_logits = mylib.classify_persuade_topic_logits(
        ds=essay_dataset,
        model=model,
        batch_size=128,
        device=device,
        progress_bar=True,
    )
    # Index of the largest logit along axis 1, flattened to a 1-D array of ids.
    y_pred = np.argmax(topic_logits, axis=1).flatten()
    # Translate the predicted ids into topic labels.
    id_to_label = mylib.PersuadeDataset.ID_TO_LABEL
    df["topic"] = [id_to_label[i] for i in y_pred]

persuade topic classification: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 136/136 [03:33<00:00,  1.57s/it]


In [11]:
# Run the `bow` preprocessor over each listed label column.
# The transform only needs the column's own value, so it is applied
# column-wise (Series.progress_apply) instead of row-wise over the whole
# frame (DataFrame.progress_apply(axis=1)), which builds a Series object for
# every row just to read one field. The resulting values are the same.
cols = ["topic"]
for col in cols:
    print(col)
    df[col] = df[col].progress_apply(bow)

topic


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17307/17307 [00:08<00:00, 2071.00it/s]


In [12]:
# Show how the essays are distributed across topics (counts and shares).
topic_labels = df["topic"]
pdx.value_counts(topic_labels)

Unnamed: 0_level_0,count,percent
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
driverless cars,3499,0.202173
facial action coding system,3043,0.175825
exploring venus,3015,0.174207
face mars,2094,0.120992
electoral college work,2046,0.118218
car free cities,1961,0.113307
cowboy rode waves,1648,0.095222
mandatory extracurricular activities,1,5.8e-05


# Review Data

In [13]:
# Keep only the columns that are written to the output file, in this order.
cols = ["essay_id", "score", "topic", "full_text"]
df = df.loc[:, cols]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   17307 non-null  object
 1   score      17307 non-null  int8  
 2   topic      17307 non-null  object
 3   full_text  17307 non-null  object
dtypes: int8(1), object(3)
memory usage: 422.7+ KB


In [14]:
# Preview the first five rows of the final frame.
df.head(5)

Unnamed: 0,essay_id,score,topic,full_text
0,000d118,3,car free cities,"Many people have car where they live. The thing they don't know is that when you use a car alot of thing can happen like you can get in accidet or the smoke that the car has is bad to breath on if someone is walk but in VAUBAN,Germany they dont have that proble because 70 percent of vauban's families do not own cars,and 57 percent sold a car to move there. Street parkig ,driveways and home garages are forbidden on the outskirts of freiburd that near the French and Swiss borders. You probaly won't see a car in Vauban's streets because they are completely ""car free"" but If some that lives in VAUBAN that owns a car ownership is allowed,but there are only two places that you can park a large garages at the edge of the development,where a car owner buys a space but it not cheap to buy one they sell the space for you car for $40,000 along with a home. The vauban people completed this in 2006 ,they said that this an example of a growing trend in Europe,The untile states and some where else are suburban life from auto use this is called ""smart planning."" The current efforts to drastically reduce greenhouse gas emissions from tailes the passengee cars are responsible for 12 percent of greenhouse gas emissions in Europe and up to 50 percent in some car intensive in the United States. I honeslty think that good idea that they did that is Vaudan because that makes cities denser and better for walking and in VAUBAN there are 5,500 residents within a rectangular square mile. 
In the artical David Gold berg said that ""All of our development since World war 2 has been centered on the cars,and that will have to change"" and i think that was very true what David Gold said because alot thing we need cars to do we can go anyway were with out cars beacuse some people are a very lazy to walk to place thats why they alot of people use car and i think that it was a good idea that that they did that in VAUBAN so people can see how we really don't need car to go to place from place because we can walk from were we need to go or we can ride bycles with out the use of a car. It good that they are doing that if you thik about your help the earth in way and thats a very good thing to. In the United states ,the Environmental protection Agency is promoting what is called ""car reduced""communtunties,and the legislators are starting to act,if cautiously. Maany experts expect pubic transport serving suburbs to play a much larger role in a new six years federal transportation bill to approved this year. In previous bill,80 percent of appropriations have by law gone to highways and only 20 percent to other transports. There many good reason why they should do this."
1,000fe60,3,face mars,"I am a scientist at NASA that is discussing the ""face"" on mars. I will be explaining how the ""face"" is a land form. By sharing my information about this isue i will tell you just that. First off, how could it be a martions drawing. There is no plant life on mars as of rite now that we know of, which means so far as we know it is not possible for any type of life. That explains how it could not be made by martians. Also why and how would a martion build a face so big. It just does not make any since that a martian did this. Next, why it is a landform. There are many landforms that are weird here in America, and there is also landforms all around the whole Earth. Many of them look like something we can relate to like a snake a turtle a human... So if there are landforms on earth dont you think landforms are on mars to? Of course! why not? It's just unique that the landform on Mars looks like a human face. Also if there was martians and they were trying to get our attention dont you think we would have saw one by now? Finaly, why you should listen to me. You should listen to me because i am a member of NASA and i've been dealing with all of this stuff that were talking about and people who say martians did this have no relation with NASA and have never worked with anything to relate to this landform. One last thing is that everyone working at NASA says the same thing i say, that the ""face"" is just a landform. To sum all this up the ""face"" on mars is a landform but others would like to beleive it's a martian sculpture. Which every one that works at NASA says it's a landform and they are all the ones working on the planet and taking pictures."
2,001ab80,4,driverless cars,"People always wish they had the same technology that they have seen in movies, or the best new piece of technology that is all over social media. However, nobody seems to think of the risks that these kinds of new technologies may have. Cars have been around for many decades, and now manufacturers are starting to get on the bandwagon and come up with the new and improved technology that they hope will appeal to everyone. As of right now, it seems as though the negative characteristics of these cars consume the positive idea that these manufacturers have tried to convey. Currently, this new technology in cars has a very long way to go before being completely ""driverless."" Drivers still need to be on alert when they are driving, as well as control the car near any accidents or complicated traffic situations. This seems to totally defeat the purpose of the ""driverless"" car. Eventually the technology may improve, but nobody can be certain that the driverless car will eventually become completely ""driverless."" This idea just seems like a lot of hard work and money for something that is not very neccessary. If someone does not want to drive their car they can just take a city bus or a subway. There are so many options of transportation that can already solve this problem. Even if masnufacturers are trying to make driving more ""fun,"" driving is not meant to be ""fun"" it is meant to get people where they need to go. Playing around in a car just to have ""fun"" is just a recipe for disaster. The idea of the driverless car also raises many questions about who will be liable when someone gets into an accident in one of these new cars. Many states do not even let people drive semi-automatic cars because there are not even laws that pertain to the liability of anyone who get into an accident while driving these type of cars. If these cars become more popular, states may pass new laws. 
However, this topic also raises questions about who is able to dictate whether or not it was the car or the human's fault for an accident. Since this technology is so new, there could be many problems with the car's system that nobody has even discovered since they have not drove the car themselves. If someone test drives this kind of car or even purchases one and they get into a crash not knowing what could possibly happen to them, they will want to sue the car manufacturer since they were not aware of any bugs in the car's system. These lawsuits can add up and eventually the manufactuers will be in a bunch of debt, which could cost them their whole idea of the driverless car. The technology car manufacturers are trying to develope may just be a diasaster in the making. There are many alternative options of transportations if you do not feel like driving yourself, and these options are way less expensive than buying a brand new car. Although this technology is relatively new, we can not be certain that this new idea will even pay off in the end, it may just be a waste of money and time. Sometimes the newest technology is not the most benefical."
3,001bdc0,4,exploring venus,"We all heard about Venus, the planet without almost oxygen with earthquakes, erupting volcanoes and temperatures average over 800 degrees Fahrenheit but what if scientist project the futur into this planet ? Through this article, the author uses evidences appealing to reason and concession to make us realize why we should care about studying this planet so that people must give a chance to Venus. Venus is the closest planet to Earth in terms density and size but has a really different climate. As it is evoked by the author: (3) ""A thick atmosphere of almost 97 percent carbon dioxide blankets Venus. Even more challenging are the clouds of highly corrosive sulfuric acid in Venus's atmosphere. On the planet's surface, temperatures average over 800 degrees Fahrenheit....Beyond high pressure and heat, Venusian geology and weather present additional impediments like erupting volcanoes, powerful earthquakes, and frequent lightning strikes to probes seeking to land on its surface."" The author uses of concession tell us what is the actual climate of Venus. He tell us the truth, he support the fact that studying Venus is a worthy pursuit but he also recognize that there's a danger present continuously. And because he support the idea of studying Venus, he proposes ways to study the planet and find a solution at a reasonable distance. Venus is an inhospitable planet that can almost be considered as an impossible dwelling for humans. But for the author, that can't keep scientist to do their job (studying Venus): (5) ""NASA's possible solution to the hostile conditions on the surface of Venus would allow scientists to float above the fray. Imagine a blimp-like vehicle hovering 30 or so miles above the roiling Venusian landscape. Just as our jet airplanes travel at a higher altitude to fly over many storms, a vehicle hovering over Venus would avoid the unfriendly ground conditions by staying up and out of their way. 
At thirty-plus miles above the surface, temperatures would still be toasty at around 170 degrees Fahrenheit, but the air pressure would be close to that of sea level on Earth."" Through this evidence, appealing to reasonning, the author talk about the NASA's possible solution to raise his credibility. But also, to convince the audience that we can only find a solution by studying the planet. In conclusion, despite of Venus hostility put in advance by the concession, the author makes the audience realize that there's a solution but that we can find it only if we study the planet. He make us find out that challenge and curiosity is part of human life. But also that danger and fear should not stop us from discovering new things. After all, we are Humans."
4,002ba53,3,electoral college work,"Dear, State Senator This is a letter to argue in favor of keeping the Electoral College.""There are many reasons to keep the Electoral College"" one reason is because it is widely regarded as an anachronism, a dispute over the outcome of an Electoral College vote is possible, but it is less likely than a dispute over the popular vote, and the Electoral College restores some of the weight in the political balance that large states (by population) lose by virue of the mal apportionment of the Senate decreed in the Constitution. I am in favor of keeping the Electoral College because,it is widely regarded as an anachronism. A non-democratic method of selecting a president that ought to be [overruled] by declaring the canaditdate who receives the most populare votes the winner. The advocates of this position are correct in arguing that the Electoral College method is not democratic in a method sense.It is the electors who elect the the president ,not the people. But each party selects a slate of electors trusted to vote for the party's nominee (and that trust is rarely betrayed). Another, reason I am in favor of keeping the Electoral College is because, a dispute over the outcome of an Electoral College vote is possible. But it is less likely than a dispute over the popular vote. But it is less likely than a dispute over the popular vote. The reason is that the winning canadate's share of the Electoral College invariably exceeds his share of the popular vote. Last but not least, I am in favor of keeping the Electoral College is because, the Electoral College restores some of the weight in the political balance that large states (by population) lose by virue of the mal apportionment of the Senate decreed in the Constitution. A larger state gets more attintion from presidential canadidates in a campaign than a small state does. 
It can be argued that Electoral College methods of selecting the president may turn off potential voters for a canadidates who has no hope of carrying their state. But of course no voter's vote swings a national election, and in spite of that, about 1/2 the eligible American population did vote in the [2012's] election. From, PROPER_NAME"


In [15]:
# Validate BEFORE persisting: previously the check ran after the write, so a
# frame containing missing values was already on disk when the assert fired.
assert df.notna().all(axis=None), "df contains missing values; refusing to write output"

# Same destination as before (output/<corpus_key>_<version>.parquet); the
# directory is created if it does not exist so a fresh checkout does not fail.
output_path = Path("output") / f"{corpus_key}_{version}.parquet"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path, index=False)

In [16]:
# Stop the timer started in the setup cell and report the elapsed time.
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:03:55.130938
