In [1]:
from pathlib import Path
import os
import sys
import gc
import re
import shutil
import json
import math
import jinja2
from collections import defaultdict
import numpy as np
import pandas as pd
import bitsandbytes
import accelerate
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, NamedTuple, Callable, Iterable, Set, Optional, Any
import scml
from scml import pandasx as pdx
import lalaes2 as mylib
# Record the installed accelerate and bitsandbytes versions in the notebook output.
print(f"accelerate={accelerate.__version__}, bitsandbytes={bitsandbytes.__version__}")

accelerate=0.30.1, bitsandbytes=0.43.1


In [2]:
# Run configuration: `version` and `corpus_key` are combined into the output
# file name, and `corpus_key` also selects which CSV in `corpus_map` is loaded.
version = "02"
corpus_key = "persuade"
corpus_map = dict(
    comp=Path("input/train.csv"),
    persuade=Path("input/persuade20/persuade_2.0_human_scores_demo_id_github.csv"),
)
# Preprocessors from the project library; `basic` is applied to the essay
# text and `bow` (with stopword dropping enabled) to the topic names.
basic = mylib.BasicPreprocessor()
bow = mylib.BowPreprocessor(drop_stopword=True)

In [3]:
# Start the notebook-wide timer; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# NOTE(review): presumably silences the tokenizers fork/parallelism warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Use fully qualified option names throughout: the short forms only work via
# pandas' partial-name matching, which raises if the pattern ever becomes ambiguous.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
# Registers `progress_apply` on pandas objects (used by the preprocessing cell).
tqdm.pandas()
scml.seed_everything()

In [4]:
# Pick the compute device. The preferred GPU index is 1 (the second card), but
# the index is clamped to the GPUs actually present so that a single-GPU host
# gets "cuda:0" instead of a device that does not exist.
preferred_gpu = 1
device = torch.device("cpu")
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    device = torch.device(f"cuda:{min(preferred_gpu, gpu_count - 1)}")
    # Report name and current memory usage of every visible GPU.
    for i in range(gpu_count):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')
else:
    print("cpu")

device=0, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB
device=1, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [5]:
# Load the CSV registered for the selected corpus and summarise its columns.
corpus_path = corpus_map[corpus_key]
df = pd.read_csv(corpus_path, low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25996 entries, 0 to 25995
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   essay_id_comp               25996 non-null  object 
 1   full_text                   25996 non-null  object 
 2   holistic_essay_score        25996 non-null  int64  
 3   word_count                  25996 non-null  int64  
 4   prompt_name                 25996 non-null  object 
 5   task                        25996 non-null  object 
 6   assignment                  25996 non-null  object 
 7   source_text                 12875 non-null  object 
 8   gender                      25996 non-null  object 
 9   grade_level                 24828 non-null  float64
 10  ell_status                  24787 non-null  object 
 11  race_ethnicity              25996 non-null  object 
 12  economically_disadvantaged  20759 non-null  object 
 13  student_disability_status   208

In [6]:
# The PERSUADE corpus uses its own column names; map them onto the names the
# rest of the notebook works with.
if corpus_key == "persuade":
    persuade_renames = {
        "essay_id_comp": "essay_id",
        "holistic_essay_score": "score",
        "prompt_name": "topic",
        "assignment": "prompt",
    }
    df = df.rename(columns=persuade_renames)
# Store the score as int8.
df = df.astype({"score": np.int8})
# Keep only the columns written to the output file.
cols = ["essay_id", "score", "topic", "full_text"]
df = df.loc[:, cols].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25996 entries, 0 to 25995
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   25996 non-null  object
 1   score      25996 non-null  int8  
 2   topic      25996 non-null  object
 3   full_text  25996 non-null  object
dtypes: int8(1), object(3)
memory usage: 634.8+ KB


In [7]:
def preprocess_text(fn, col) -> Callable:
    """Build a row-level adapter around ``fn``.

    Args:
        fn: callable applied to a single value.
        col: key used to look the value up in each row.

    Returns:
        A function that takes a row, reads ``row[col]`` and returns
        ``fn`` applied to that value.
    """
    def apply_to_row(record) -> str:
        value = record[col]
        return fn(value)

    return apply_to_row


# Clean each text column in place. The preprocessor is applied directly to the
# column (Series.progress_apply) rather than row-wise over the whole frame:
# each callable receives the same cell value, but pandas no longer has to
# build a row Series for every record.
cols = ["full_text"]
for col in cols:
    print(col)
    df[col] = df[col].progress_apply(basic)
cols = ["topic"]
for col in cols:
    print(col)
    df[col] = df[col].progress_apply(bow)

full_text


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25996/25996 [00:09<00:00, 2759.86it/s]


topic


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25996/25996 [00:13<00:00, 1954.95it/s]


In [8]:
# Tabulate the preprocessed topics (the output lists a count and a percent per topic).
pdx.value_counts(df["topic"])

Unnamed: 0_level_0,count,percent
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
facial action coding system,2167,0.083359
distance learning,2157,0.082974
electoral college work,2046,0.078704
car free cities,1959,0.075358
driverless cars,1886,0.07255
exploring venus,1862,0.071626
summer projects,1750,0.067318
mandatory extracurricular activities,1670,0.064241
cell phones school,1656,0.063702
grades extracurricular activities,1626,0.062548


# Review Data

In [9]:
# Re-check column dtypes and non-null counts after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25996 entries, 0 to 25995
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   25996 non-null  object
 1   score      25996 non-null  int8  
 2   topic      25996 non-null  object
 3   full_text  25996 non-null  object
dtypes: int8(1), object(3)
memory usage: 634.8+ KB


In [10]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,essay_id,score,topic,full_text
0,423A1CA112E2,3,phones driving,"Phones Modern humans today are always on their phone. They are always on their phone more than 5 hours a day no stop .All they do is text back and forward and just have group Chats on social media. They even do it while driving. They are some really bad consequences when stuff happens when it comes to a phone. Some certain areas in the United States ban phones from class rooms just because of it. When people have phones, they know about certain apps that they have .Apps like Facebook Twitter Instagram and Snapchat. So like if a friend moves away and you want to be in contact you can still be in contact by posting videos or text messages. People always have different ways how to communicate with a phone. Phones have changed due to our generation. Driving is one of the way how to get around. People always be on their phones while doing it. Which can cause serious Problems. That's why there's a thing that's called no texting while driving. That's a really important thing to remember. Some people still do it because they think It's stupid. No matter what they do they still have to obey it because that's the only way how did he save. Sometimes on the news there is either an accident or a suicide. It might involve someone not looking where they're going or tweet that someone sent. It either injury or death. If a mysterious number says I'm going to kill you and they know where you live but you don't know the person's contact ,It makes you puzzled and make you start to freak out. Which can end up really badly. Phones are fine to use and it's also the best way to come over help. If you go through a problem and you can't find help you ,always have a phone there with you. Even though phones are used almost every day as long as you're safe it would come into use if you get into trouble. Make sure you do not be like this phone while you're in the middle of driving. 
The news always updated when people do something stupid around that involves their phones. The safest way is the best way to stay safe."
1,BC75783F96E3,4,phones driving,"This essay will explain if drivers should or should not be able to use electronic devices while operating a vehicle. Using a phone while driving can cause very bad wrecks, because they are putting themselves and other people at risk. People should not be able to use cell phones while operating a vehicle is because bad wrecks, putting others' lives in danger, and may cause death. First of all, wrecks can be caused by looking down at your phone while driving. Most importantly, you should always keep your eyes directly on the road when behind the wheel of a car. On that note, you will have an outrageously car payment just because you were looking down at your phone. Moreover, if you the reason the accident happened the person that was operating the other car can sue you for lots of money, and you will have to pay it. Therefore, if you don't pay whatever the person charges you consequences will be served and that's behind bars. Another reason why we should not be able to use cell phones while operating an automobile is that you are putting other people's lives in danger. Thus, looking down at your cell phone you can get someone else hurt for the uncommunicative act you committed. As well as unconsciousness, injuries, or being in a hospital. The main key to not texting and driving while behind the wheel of a car. Mainly, it can cause tragic, terrifying, and horrific things to you and to others and that's death. That's the most important thing to not using electronic devices while operating a vehicle. With the intension, of you not keeping your eyes and staying focus on the road you can easily turn off the road or hit another victim. Possibly another humankind could go away of all flesh because of your stupidity. Must be remembered, always stay focus on the road so you can get to your destination safely. So, you can't cause accidents, put other people lives in danger, and possibly death. 
Important to realize, do not use your phone while operating a vehicle. At least, wait till you make a safe stop or until you arrive at your destination."
2,74C8BC7417DE,2,phones driving,"Driving while the use of cellular devices Today, most of the society is thoughtless. Especially new drivers, all driver for that matter do not understand the dangers of looking at a cell phone while behind the wheel. The automobile crashes due to distracted driving are increasing, teens have been the largest age group that was reported to being distracted by cellular usage. I do not agree with people using their cell phones and driving, humans should be smarter to know that if something is going on while they are driving that is so important then pulling off to the side of the road. The society doesn't understand that teens are the main cause for accidents I mean, in two thousand and five there was three hundred and one thousand injuries that were caused in distracted driving related accidents, that doesn't necessarily mean that the driver was texting or using a cellular device. New drivers do not understand that deaths are usually results of an accident. one out of every four car accidents is caused by texting and driving."
3,A8445CABFECE,3,phones driving,"Phones & Driving Drivers should not be able to use phones while operating a vehicle. Drivers who used their phone while operating a vehicle are most likely to get into an accident that could be fatal. According to an article by the Edgar Snyder Firm, 21% of teens that were part of a fatal car accident was due to phones. According to the same article, 35% know the risk but continue using their phones while on the road. This shows that its beyond dangerous and irresponsible of drivers not to be fully aware of their surroundings while driving. Drivers should be able to concentrate without any distractions, because it could be fatal. According to another article, ""Distracted Driving"" by the NHTSA, there has already been about 3,000 phone related car accident deaths since 2017. The article states that teen get too distracted with their phones, which causes their accident. Accidents that can be easily avoided by focusing on the road and not a phone. Drivers should not be able to use their phones at all while driving. In conclusion, drivers should not able to work a vehicle while using their cell phone. Drivers who uses their phones while operating a vehicle and are likely to have an accident then those who don't."
4,6B4F7A0165B9,4,phones driving,"Cell Phone Operation While Driving The ability to stay connected to people we know despite distance was originally brought to fruition by the use of letters. This system was found to be rather slow and new pathways were searched for until the invention of the telegram; the people thought it was an invention of the millennia, but after time it too was thought of as slow until the invention of the telephone. Today, a telephone is in the hand or pocket of a majority of the seven billion people on planet earth. However, this device is taken to areas that it is irresponsible and dangerous. Within a vehicle capable of traveling upwards of one hundred miles per hour any possible distraction can become fatal spontaneously. The most common of these distractions is a cell phone, with its capabilities to connect us to anyone also in ownership of one, it is easy to pick it up whenever it sounds. In that split second of reaching over for a phone, eyes no longer on the road, it is impossible to no an exact location of anything, making an extremely dangerous action. For the myriad of possibilities that lead to serious injury cell phones should stay in the current state they are in regards of the law, but taken as a more serious offense. Conversely people may believe that laws in present need to change, becoming less restrictive. People have the right to communicate with whom they wish, when or wherever they may choose to do so. The problem becomes apparent that this is a selfish process of thought; people aren't thinking of those they share the road with. Laws currently in place are not to punish people making poor choices, they are an attempt to keep people safe. The creation of telecommunication devices was to keep connected to others without regard to the obsession that would encompass the human mind that was bound to follow. The safety of people is top priority without exemption."


In [11]:
# Validate before persisting so a frame with missing values never reaches disk.
assert df.notna().all(axis=None), "preprocessed frame contains missing values"
output_path = Path(f"output/{corpus_key}_{version}.parquet")
# Create the output directory on first run instead of failing on the write.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path, index=False)

In [12]:
# Stop the timer started in the setup cell and report the elapsed time.
tim.stop()
elapsed = tim.elapsed
print(f"Total time taken {elapsed!s}")

Total time taken 0:00:23.875381
