In [1]:
import os
import json
import gc
import faiss
from sentence_transformers import SentenceTransformer
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scipy
import scml
from scml import pandasx as pdx
from daigt.preprocess import en as pen
# Wall-clock timer for the whole notebook run.
tim = scml.Timer()
tim.start()

# Turn off the tokenizers library's own parallelism for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentiles reported by the describe() calls further down.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# pandas behaviour/display options, applied in their original order.
# NOTE(review): "use_inf_as_na" appears to be deprecated in recent pandas releases —
# confirm against the pinned pandas version before upgrading.
for _option, _value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(_option, _value)

# Register the progress_apply helpers used by later cells, then fix the random seeds.
tqdm.pandas()
scml.seed_everything()

# Print the int16 range; the upper bound is reused as max_len in the next cell.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
# Character-length bounds used later to filter essays on `text_bow_len`:
# 773 is the 1% percentile reported by describe() below, 32_767 is the int16 maximum printed above.
min_len, max_len = 773, 32_767  # P01 cutoff

In [3]:
#tokenizer = AutoTokenizer.from_pretrained("huggingface/microsoft/deberta-v3-base", is_fast=True)
#print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

# Combine data sources

In [4]:
# Training essays (columns: id, prompt_id, text, generated).
edf = pd.read_csv("input/train_essays.csv", low_memory=False)
edf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1378 non-null   object
 1   prompt_id  1378 non-null   int64 
 2   text       1378 non-null   object
 3   generated  1378 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 43.2+ KB


In [5]:
# Prompt metadata, indexed by prompt_id so each essay row can look up its prompt directly.
pdf = pd.read_csv("input/train_prompts.csv", low_memory=False).set_index("prompt_id")
pdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   prompt_name   2 non-null      object
 1   instructions  2 non-null      object
 2   source_text   2 non-null      object
dtypes: object(3)
memory usage: 64.0+ bytes


In [6]:
# Additional essay set (columns: essay_id, text, label, source, prompt, fold);
# `prompt` is missing for most rows, see the info() output.
df = pd.read_csv("input/train_drcat_02.csv", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39785 entries, 0 to 39784
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   essay_id  39785 non-null  object
 1   text      39785 non-null  object
 2   label     39785 non-null  int64 
 3   source    39785 non-null  object
 4   prompt    9490 non-null   object
 5   fold      39785 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.8+ MB


In [7]:
# Drop `fold`, align the column names with train_essays (id / generated) and
# replace missing prompts with the placeholder "na".
cols = ["essay_id", "text", "label", "source", "prompt"]
df = (
    df.loc[:, cols]
    .rename(columns={"essay_id": "id", "label": "generated"})
    .assign(prompt=lambda frame: frame["prompt"].fillna("na"))
)

In [8]:
def get_prompt(row)->str:
    """Return the prompt text for one essay row as "<prompt_name> <instructions>".

    The row's ``prompt_id`` is looked up in the notebook-level ``pdf`` frame,
    which is indexed by ``prompt_id``.
    """
    prompt_record = pdf.loc[row["prompt_id"]]
    return f'{prompt_record["prompt_name"]} {prompt_record["instructions"]}'
    

# Give train_essays the same prompt/source columns as the other set, then stack
# both sets into one frame with a fixed column order.
edf["prompt"] = edf.progress_apply(get_prompt, axis=1)
edf["source"] = "train_essays"
cols = ["id", "generated", "source", "prompt", "text"]
df = pd.concat([edf, df], ignore_index=True)[cols].copy()

# The label only takes small integer values, so store it as int8.
cols = ["generated"]
df = df.astype({"generated": np.int8})
df.info()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:00<00:00, 74833.31it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41163 entries, 0 to 41162
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         41163 non-null  object
 1   generated  41163 non-null  int8  
 2   source     41163 non-null  object
 3   prompt     41163 non-null  object
 4   text       41163 non-null  object
dtypes: int8(1), object(4)
memory usage: 1.3+ MB





In [9]:
# Remove rows whose raw `text` is an exact duplicate (the first occurrence is kept)
# and report how many rows were removed.
more = len(df)
df = df.drop_duplicates(["text"], ignore_index=True)
print(f"{more - len(df)} rows dropped: duplicates")

1429 rows dropped: duplicates


# Preprocess Text

In [10]:
# Text preprocessors from daigt.preprocess.en, both used as callables on a string.
# NOTE(review): judging from the sample rows at the end of the notebook, `bsc` collapses
# whitespace/newlines while `bow` also lowercases and strips punctuation — confirm in daigt.
bsc = pen.BasicPreprocessor()
bow = pen.BowPreprocessor()

def preprocess_bsc(col) -> Callable:
    """Build a row-wise function that runs the ``bsc`` preprocessor on column ``col``.

    The returned callable takes one row (anything indexable by ``col``) and
    returns ``bsc(row[col])``; it is meant for ``DataFrame.apply(..., axis=1)``.
    """
    def apply_to_row(row) -> str:
        raw_text = row[col]
        return bsc(raw_text)

    return apply_to_row

def preprocess_bow(col) -> Callable:
    """Build a row-wise function that runs the ``bow`` preprocessor on column ``col``.

    The returned callable takes one row (anything indexable by ``col``) and
    returns ``bow(row[col], drop_stopword=False)``, i.e. stop words are kept.
    """
    def apply_to_row(row) -> str:
        raw_text = row[col]
        return bow(raw_text, drop_stopword=False)

    return apply_to_row

In [11]:
# For each source column create three derived columns:
#   <col>_bsc      output of the basic preprocessor
#   <col>_bow      output of the bag-of-words preprocessor (stop words kept)
#   <col>_bow_len  character length of <col>_bow
cols = ["text", "prompt"]
for col in cols:
    print(col)
    # Apply the preprocessors to the column itself instead of row-wise over the whole
    # frame (axis=1): same values, without building a Series object for every row.
    new_col = f"{col}_bsc"
    df[new_col] = df[col].progress_apply(bsc)
    new_col = f"{col}_bow"
    df[new_col] = df[col].progress_apply(lambda text: bow(text, drop_stopword=False))
    col1, col2 = new_col, f"{col}_bow_len"
    # NOTE(review): narrowing to int16 is only safe while every length is <= 32767; longer
    # strings may wrap silently — confirm, since the max_len filter runs after this cast.
    df[col2] = df[col1].str.len().astype(np.int16)
    # Token-count lengths (via the HF tokenizer) are currently disabled; only the
    # character length of the bow text is recorded.

text


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39734/39734 [00:10<00:00, 3708.73it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39734/39734 [16:56<00:00, 39.09it/s]


prompt


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39734/39734 [00:01<00:00, 27437.01it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39734/39734 [00:43<00:00, 915.18it/s]


# Filter Invalid Data

In [12]:
# Second de-duplication pass, this time on the preprocessed `text_bow`, so essays that
# become identical after preprocessing collapse to one row (the first occurrence is kept).
more = len(df)
df = df.drop_duplicates(["text_bow"], ignore_index=True)
print(f"{more - len(df)} rows dropped: duplicates")

219 rows dropped: duplicates


In [13]:
# Numeric summary before length filtering; the 1% row of text_bow_len is the source of min_len.
df.describe(percentiles=percentiles)

Unnamed: 0,generated,text_bow_len,prompt_bow_len
count,39515.0,39515.0,39515.0
mean,0.252891,2195.112818,87.917854
std,0.434675,1006.48068,161.0615
min,0.0,234.0,2.0
1%,0.0,773.0,2.0
5%,0.0,951.0,2.0
10%,0.0,1100.0,2.0
20%,0.0,1359.0,2.0
30%,0.0,1582.0,2.0
40%,0.0,1792.0,2.0


In [14]:
# Keep only essays whose preprocessed text length lies within [min_len, max_len].
# The length is measured on the string itself rather than read from the int16
# `text_bow_len` column, so a value that wrapped during the int16 cast cannot slip through.
more = len(df)
text_len = df["text_bow"].str.len()
df = df[(text_len >= min_len) & (text_len <= max_len)].copy()
# The message now names the real reason; it previously said "duplicates" (copy-paste slip).
print(f"{more - len(df)} rows dropped: text_bow length outside [{min_len}, {max_len}]")

395 rows dropped: duplicates


# Train/Validation Split: similarity to the `white` FAISS index

In [15]:
%%time
# Prefer Apple's MPS backend (the device this notebook originally hard-coded), then CUDA,
# and fall back to the CPU so the cell also runs on machines without an Apple GPU.
# Embeddings may differ in the last decimals between devices.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = SentenceTransformer("huggingface/sentence-transformers/all-mpnet-base-v2", device=device)
# Cap on the input length in tokens.
# NOTE(review): presumably longer inputs are truncated by the model wrapper, so long essays
# would be embedded from their beginning only — confirm against sentence-transformers.
model.max_seq_length = 384
# Pre-built FAISS index that every essay embedding is searched against.
index = faiss.read_index("input/white.index")
print(f"ntotal={index.ntotal}, is_trained={index.is_trained}")

ntotal=10015, is_trained=True
CPU times: user 88.9 ms, sys: 410 ms, total: 499 ms
Wall time: 243 ms


In [16]:
%%time
# Embed every preprocessed essay, then L2-normalise the matrix in place
# so that each embedding row has unit length.
em = model.encode(
    sentences=df["text_bow"].tolist(),
    batch_size=512,
    show_progress_bar=True,
    convert_to_numpy=True,
)
faiss.normalize_L2(em)
print(f"em={em.shape}")

Batches:   0%|          | 0/153 [00:00<?, ?it/s]

em=(39120, 768)
CPU times: user 32.7 s, sys: 8.47 s, total: 41.2 s
Wall time: 11min 35s


In [17]:
%%time
# Look up only the single nearest neighbour in the index for each essay embedding.
# `distances` and `ids` hold one row per embedding and k columns each.
k=1
distances, ids = index.search(em, k)  

CPU times: user 555 ms, sys: 26.4 ms, total: 582 ms
Wall time: 229 ms


In [18]:
# Store the score of each essay's nearest neighbour in the index as `white_sim`.
# Taking column 0 explicitly (instead of flattening the whole matrix) keeps the column
# aligned with df even if k is raised above 1, and assigning the array directly avoids
# a round-trip through a Python list of floats. Assignment is positional, so the
# non-contiguous index left by the earlier filtering does not matter.
col = "white_sim"
df[col] = distances[:, 0].astype(np.float32)
df[col].describe(percentiles=percentiles)

count    39120.000000
mean         0.842840
std          0.169336
min          0.169103
1%           0.402919
5%           0.482188
10%          0.544123
20%          0.679715
30%          0.838542
40%          0.879516
50%          0.901562
60%          0.918892
70%          0.940775
80%          1.000000
90%          1.000000
95%          1.000000
99%          1.000001
max          1.000001
Name: white_sim, dtype: float64

# Review Data

In [19]:
# Replace the source-specific `id` with a dense integer key: after resetting the index,
# essay_id is the row position (0..n-1) stored as int32 and placed as the first column.
df = df.reset_index(drop=True).drop(columns=["id"])
df.insert(0, "essay_id", np.arange(len(df), dtype=np.int32))
cols = list(df.columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39120 entries, 0 to 39119
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   essay_id        39120 non-null  int32  
 1   generated       39120 non-null  int8   
 2   source          39120 non-null  object 
 3   prompt          39120 non-null  object 
 4   text            39120 non-null  object 
 5   text_bsc        39120 non-null  object 
 6   text_bow        39120 non-null  object 
 7   text_bow_len    39120 non-null  int16  
 8   prompt_bsc      39120 non-null  object 
 9   prompt_bow      39120 non-null  object 
 10  prompt_bow_len  39120 non-null  int16  
 11  white_sim       39120 non-null  float32
dtypes: float32(1), int16(2), int32(1), int8(1), object(7)
memory usage: 2.6+ MB


In [20]:
# Numeric summary of the final frame (after length filtering and the white_sim column).
df.describe(percentiles=percentiles)

Unnamed: 0,essay_id,generated,text_bow_len,prompt_bow_len,white_sim
count,39120.0,39120.0,39120.0,39120.0,39120.0
mean,19559.5,0.247648,2210.626457,86.428119,0.84284
std,11293.115602,0.431652,999.526152,160.514247,0.169336
min,0.0,0.0,773.0,2.0,0.169103
1%,391.19,0.0,836.0,2.0,0.402919
5%,1955.95,0.0,979.0,2.0,0.482188
10%,3911.9,0.0,1124.0,2.0,0.544123
20%,7823.8,0.0,1379.0,2.0,0.679715
30%,11735.7,0.0,1596.0,2.0,0.838542
40%,15647.6,0.0,1804.0,2.0,0.879516


In [21]:
# Class balance: count and share of rows per `generated` label.
cols=["generated"]
pdx.value_counts(df[cols])

Unnamed: 0_level_0,count,percent
generated,Unnamed: 1_level_1,Unnamed: 2_level_1
0,29432,0.752352
1,9688,0.247648


In [22]:
# The 50 most frequent preprocessed prompts; "na" marks rows that had no prompt.
cols=["prompt_bow"]
pdx.value_counts(df[cols]).head(50)

Unnamed: 0_level_0,count,percent
prompt_bow,Unnamed: 1_level_1,Unnamed: 2_level_1
na,28717,0.734075
car free cities write an explanatory essay to inform fellow citizens about the advantages of limiting car usage your essay must be based on ideas and information that can be found in the passage set manage your time carefully so that you can read the passages plan your response write your response and revise and edit your response be sure to use evidence from multiple sources and avoid overly relying on one source your response should be in the form of a multiparagraph essay write your essay in the space provided,708,0.018098
does the electoral college work write a letter to your state senator in which you argue in favor of keeping the electoral college or changing to election by popular vote for the president of the united states use the information from the texts in your essay manage your time carefully so that you can read the passages plan your response write your response and revise and edit your response be sure to include a claim address counterclaims use evidence from multiple sources and avoid overly relying on one source your response should be in the form of a multiparagraph essay write your response in the space provided,670,0.017127
the legalization of marijuana have been a controversial topic in recent years write an essay arguing for or against the legalization of marijuana providing reasons and examples to support your position,77,0.001968
some schools offer distance learning as an option for students to attend classes from home by way of online or video conferencing do you think students would benefit from being able to attend classes from home take a position on this issue support your response with reasons and examples,77,0.001968
the role of libraries in the digital age have been a topic of discussion write an essay arguing for or against the continued importance of libraries providing reasons and examples to support your position,75,0.001917
write an explanatory essay to inform fellow citizens about the advantages of limiting car usage your essay must be based on ideas and information that can be found in the passage set manage your time carefully so that you can read the passages plan your response write your response and revise and edit your response be sure to use evidence from multiple sources and avoid overly relying on one source your response should be in the form of a multiparagraph essay write your essay in the space provided,73,0.001866
the role of zoos in conservation and education have been a topic of debate argue for or against the continued operation of zoos providing reasons and examples to justify your position,72,0.00184
the use of single use plastic products have become a controversial topic due to its impacts on the environment write an essay arguing for or against the ban of single use plastics providing reasons and examples to support your position,71,0.001815
in the article making mona lisa smile the author describes how a new technology called the facial action coding system enables computers to identify human emotions using details from the article write an essay arguing whether the use of this technology to read the emotional expressions of students in a classroom is valuable,69,0.001764


In [23]:
# Count and share of rows per data source.
cols=["source"]
pdx.value_counts(df[cols])

Unnamed: 0_level_0,count,percent
source,Unnamed: 1_level_1,Unnamed: 2_level_1
persuade_corpus,25797,0.659433
llammistral7binstruct,2420,0.061861
llama2_chat,2418,0.06181
original_moth,2260,0.057771
chat_gpt_moth,2124,0.054294
train_essays,1378,0.035225
llama_70b_v1,1172,0.029959
falcon_180b_v1,1051,0.026866
radek_500,500,0.012781


In [29]:
# Minimum, median and maximum white_sim for each data source.
df.groupby(["source"])["white_sim"].agg(["min", "median", "max"])

Unnamed: 0_level_0,min,median,max
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chat_gpt_moth,0.240497,0.573841,0.95658
falcon_180b_v1,0.169103,0.6341,0.963799
llama2_chat,0.297591,0.602314,0.92631
llama_70b_v1,0.350025,0.647478,0.964943
llammistral7binstruct,0.182281,0.599823,0.937744
original_moth,0.257184,0.588107,0.960358
persuade_corpus,0.359044,0.924625,1.000001
radek_500,0.785699,0.904607,0.965763
train_essays,0.697057,0.993068,1.000001


In [35]:
# Number of essays per (source, generated) pair, showing which sources supply which label.
df.groupby(["source", "generated"])["essay_id"].count()

source                 generated
chat_gpt_moth          1             2124
falcon_180b_v1         1             1051
llama2_chat            1             2418
llama_70b_v1           1             1172
llammistral7binstruct  1             2420
original_moth          0             2260
persuade_corpus        0            25797
radek_500              1              500
train_essays           0             1375
                       1                3
Name: essay_id, dtype: int64

In [31]:
# Eyeball five random rows; sample(5) already returns exactly five rows.
df.sample(n=5)

Unnamed: 0,essay_id,generated,source,prompt,text,text_bsc,text_bow,text_bow_len,prompt_bsc,prompt_bow,prompt_bow_len,white_sim
7919,7919,0,persuade_corpus,na,"Dear Principle,\n\nCommunity service should be recommended for all students. It helps them learn about the community that they live in and to help it run smoothly. Like picking up litter would help the earth by saving it from all the waste that humans just throw out. It'll also help animals from dieing because sometimes the stuff humans just throw out can hurt animals and sometimes kill them.\n\nOther kinds of community service is good to. Like giving food to the people that starve and don't have any money to feed there family or give toys to needy children who don't have anything. Doing stuff like that is good to do because it helps other people around them and it also makes the person that does it feel good about themselves.\n\nI highly recommend that all students to contribute in community service whether its picking up litter, doing food drives, donations, tutoring or anything that can help out the community. I assure you its not a waste of time.\n\nThank you,\n\nSTUDENT_NAME","Dear Principle, Community service should be recommended for all students. It helps them learn about the community that they live in and to help it run smoothly. Like picking up litter would help the earth by saving it from all the waste that humans just throw out. It'll also help animals from dieing because sometimes the stuff humans just throw out can hurt animals and sometimes kill them. Other kinds of community service is good to. Like giving food to the people that starve and don't have any money to feed there family or give toys to needy children who don't have anything. Doing stuff like that is good to do because it helps other people around them and it also makes the person that does it feel good about themselves. I highly recommend that all students to contribute in community service whether its picking up litter, doing food drives, donations, tutoring or anything that can help out the community. I assure you its not a waste of time. 
Thank you, STUDENT_NAME",dear principle community service should be recommended for all students it helps them learn about the community that they live in and to help it run smoothly like picking up litter would help the earth by saving it from all the waste that humans just throw out it will also help animals from dieing because sometimes the stuff humans just throw out can hurt animals and sometimes kill them other kinds of community service is good to like giving food to the people that starve and do not have any money to feed there family or give toys to needy children who do not have anything doing stuff like that is good to do because it helps other people around them and it also makes the person that do it feel good about themselves i highly recommend that all students to contribute in community service whether its picking up litter doing food drives donations tutoring or anything that can help out the community i assure you its not a waste of time thank you student name,967,na,na,2,0.91698
30266,30266,0,persuade_corpus,na,"I agree with the principal, because doing an extracurricular actives can benefit kids in many ways. One way doing an extracurricular activity could benefit kids is by playing a sport it can teach them many things. Another way doing an extracurricular activity could benefit kids is join a student council is can teach kids how to use their voice. Last way is join any club.\n\nA way join a sport could benefit kids is it can show kids what its like to be on a team. That can help them to get a job or if they already have one it can help them during their job. Another way playing a sport can benefit kids it can teach them how to work on a team and how to work well with others.\n\nAnother way how extracurricular act ivies could benefit kids is serving on student council it can help kids speak up or fight for what's or to make things equal. That can benefit kids in many ways, such as if while working a job you think something is not be treated right or if someone is not being treated equally you can speak up for what's right.\n\nLast way joing an extracurricular activity could benfit kids is by joing a a club. Joing a club could help them interact with more people. Learning how to interact with people can help kids later on life, during a job, in school, or even in public.\n\nIn concluison joining an extracurricualr activity could help kids in many more ways then the ones sated above. Playing a sport could benfit kids by teaching them how to work togeter. Serving on a student conciual could help kids use their voice and fight for whats right. Last way is by joing a club it could help kids interact with people. You should join an extracurricular to see what could happen.","I agree with the principal, because doing an extracurricular actives can benefit kids in many ways. One way doing an extracurricular activity could benefit kids is by playing a sport it can teach them many things. 
Another way doing an extracurricular activity could benefit kids is join a student council is can teach kids how to use their voice. Last way is join any club. A way join a sport could benefit kids is it can show kids what its like to be on a team. That can help them to get a job or if they already have one it can help them during their job. Another way playing a sport can benefit kids it can teach them how to work on a team and how to work well with others. Another way how extracurricular act ivies could benefit kids is serving on student council it can help kids speak up or fight for what's or to make things equal. That can benefit kids in many ways, such as if while working a job you think something is not be treated right or if someone is not being treated equally you can speak up for what's right. Last way joing an extracurricular activity could benfit kids is by joing a a club. Joing a club could help them interact with more people. Learning how to interact with people can help kids later on life, during a job, in school, or even in public. In concluison joining an extracurricualr activity could help kids in many more ways then the ones sated above. Playing a sport could benfit kids by teaching them how to work togeter. Serving on a student conciual could help kids use their voice and fight for whats right. Last way is by joing a club it could help kids interact with people. 
You should join an extracurricular to see what could happen.",i agree with the principal because doing an extracurricular actives can benefit kids in many ways one way doing an extracurricular activity could benefit kids is by playing a sport it can teach them many things another way doing an extracurricular activity could benefit kids is join a student council is can teach kids how to use their voice last way is join any club a way join a sport could benefit kids is it can show kids what its like to be on a team that can help them to get a job or if they already have one it can help them during their job another way playing a sport can benefit kids it can teach them how to work on a team and how to work well with others another way how extracurricular act ivies could benefit kids is serving on student council it can help kids speak up or fight for what is or to make things equal that can benefit kids in many ways such as if while working a job you think something is not be treated right or if someone is not being treated equally you can speak up for what is right last way joing an extracurricular activity could benfit kids is by joing a a club joing a club could help them interact with more people learning how to interact with people can help kids later on life during a job in school or even in public in concluison joining an extracurricualr activity could help kids in many more ways then the ones sated above playing a sport could benfit kids by teaching them how to work togeter serving on a student conciual could help kids use their voice and fight for what s right last way is by joing a club it could help kids interact with people you should join an extracurricular to see what could happen,1659,na,na,2,0.988591
35064,35064,1,falcon_180b_v1,"The use of single-use plastic products has become a controversial topic due to its impacts on the environment. Write an essay arguing for or against the ban of single-use plastics, providing reasons and examples to support your position.","I believe that the ban of single-use plastics is necessary to reduce the negative impact it has on our environment. Single-use plastics are a major source of pollution and harm to wildlife. These products, such as straws, plastic bags, and water bottles, are used once and then disposed of, leading to a large amount of waste in our oceans and landfills.\n\nOne of the main reasons why single-use plastics should be banned is due to their harmful effects on marine life. Many sea creatures mistake plastic for food or become entangled in it, leading to injury or death. According to the Ocean Conservancy, plastic straws are one of the top ten items found in beach cleanups, and plastic bags are the fifth most common. This shows that these products are not only a problem for marine life but also for our oceans and beaches.\n\nAdditionally, the production and disposal of single-use plastics contribute to climate change. The manufacturing of these products requires the use of non-renewable resources such as oil, and the disposal in landfills produces greenhouse gases. This not only harms the environment but also has negative impacts on human health.\n\nWhile some argue that the convenience of single-use plastics is necessary, alternatives such as reusable straws and bags are readily available and just as convenient. It is our responsibility to prioritize the health of the environment over our own personal convenience.\n\nIn conclusion, the ban of single-use plastics is necessary to protect our environment and wildlife. We cannot continue to ignore the negative impacts of our actions, and banning these products is a small step towards a more sustainable future. 
It is up to us as individuals and as a society to make the necessary changes to reduce our reliance on single-use plastics and protect our planet.","I believe that the ban of single-use plastics is necessary to reduce the negative impact it has on our environment. Single-use plastics are a major source of pollution and harm to wildlife. These products, such as straws, plastic bags, and water bottles, are used once and then disposed of, leading to a large amount of waste in our oceans and landfills. One of the main reasons why single-use plastics should be banned is due to their harmful effects on marine life. Many sea creatures mistake plastic for food or become entangled in it, leading to injury or death. According to the Ocean Conservancy, plastic straws are one of the top ten items found in beach cleanups, and plastic bags are the fifth most common. This shows that these products are not only a problem for marine life but also for our oceans and beaches. Additionally, the production and disposal of single-use plastics contribute to climate change. The manufacturing of these products requires the use of non-renewable resources such as oil, and the disposal in landfills produces greenhouse gases. This not only harms the environment but also has negative impacts on human health. While some argue that the convenience of single-use plastics is necessary, alternatives such as reusable straws and bags are readily available and just as convenient. It is our responsibility to prioritize the health of the environment over our own personal convenience. In conclusion, the ban of single-use plastics is necessary to protect our environment and wildlife. We cannot continue to ignore the negative impacts of our actions, and banning these products is a small step towards a more sustainable future. 
It is up to us as individuals and as a society to make the necessary changes to reduce our reliance on single-use plastics and protect our planet.",i believe that the ban of single use plastics is necessary to reduce the negative impact it have on our environment single use plastics are a major source of pollution and harm to wildlife these products such as straws plastic bags and water bottles are used once and then disposed of leading to a large amount of waste in our oceans and landfills one of the main reasons why single use plastics should be banned is due to their harmful effects on marine life many sea creatures mistake plastic for food or become entangled in it leading to injury or death according to the ocean conservancy plastic straws are one of the top ten items found in beach cleanups and plastic bags are the fifth most common this shows that these products are not only a problem for marine life but also for our oceans and beaches additionally the production and disposal of single use plastics contribute to climate change the manufacturing of these products requires the use of non renewable resources such as oil and the disposal in landfills produces greenhouse gases this not only harms the environment but also have negative impacts on human health while some argue that the convenience of single use plastics is necessary alternatives such as reusable straws and bags are readily available and just as convenient it is our responsibility to prioritize the health of the environment over our own personal convenience in conclusion the ban of single use plastics is necessary to protect our environment and wildlife we can not continue to ignore the negative impacts of our actions and banning these products is a small step towards a more sustainable future it is up to us as individuals and as a society to make the necessary changes to reduce our reliance on single use plastics and protect our planet,1787,"The use of single-use plastic products has become a 
controversial topic due to its impacts on the environment. Write an essay arguing for or against the ban of single-use plastics, providing reasons and examples to support your position.",the use of single use plastic products have become a controversial topic due to its impacts on the environment write an essay arguing for or against the ban of single use plastics providing reasons and examples to support your position,235,0.520439
6549,6549,1,llama2_chat,Task: Analyze why it is important to have an effective strategy before working hard to ensure success. Explore the importance of planning and considering different opinions rather than simply working hard and never giving up.,"Hey, so I gotta write this essay about why it's important to have a strategy before working hard to achieve success. At first, I was like, ""Ugh, do I really gotta?"" But then I was like, ""Wait, this is actually kinda cool!"" So here I am, writing this essay about why strategizing is the bomb.com.\n\nFirst off, let's talk about why working hard is important. I mean, it's like, duh, right? You gotta put in the effort if you want to achieve your goals. But here's the thing: working hard without a strategy is like running a marathon without a map. You might be putting in a lot of effort, but you're not gonna get very far if you don't know where you're going. See what I mean?\n\nThat's why it's important to have a strategy. It's like a roadmap for your success. It helps you figure out what you want to achieve, how you're gonna achieve it, and how you're gonna measure your progress. Without a strategy, you're just kinda winging it, and that's not always gonna lead to success.\n\nBut here's the thing: having a strategy doesn't mean you're gonna succeed automatically. It just means you've got a better chance of success. And that's where considering different opinions comes in. See, when you're making a strategy, you gotta think about what other people might think. You gotta be like, ""Okay, what if this happens? What if that happens?"" And then you can adjust your strategy accordingly. It's like having a backup plan, you know?\n\nSo, in conclusion, having a strategy before working hard is super important. It helps you figure out where you're going, how you're gonna get there, and how you're gonna measure your progress. And considering different opinions helps you make a better strategy that's more likely to lead to success. 
So, yeah, strategizing is pretty cool. Who knew? 😅","Hey, so I gotta write this essay about why it's important to have a strategy before working hard to achieve success. At first, I was like, ""Ugh, do I really gotta?"" But then I was like, ""Wait, this is actually kinda cool!"" So here I am, writing this essay about why strategizing is the bomb.com. First off, let's talk about why working hard is important. I mean, it's like, duh, right? You gotta put in the effort if you want to achieve your goals. But here's the thing: working hard without a strategy is like running a marathon without a map. You might be putting in a lot of effort, but you're not gonna get very far if you don't know where you're going. See what I mean? That's why it's important to have a strategy. It's like a roadmap for your success. It helps you figure out what you want to achieve, how you're gonna achieve it, and how you're gonna measure your progress. Without a strategy, you're just kinda winging it, and that's not always gonna lead to success. But here's the thing: having a strategy doesn't mean you're gonna succeed automatically. It just means you've got a better chance of success. And that's where considering different opinions comes in. See, when you're making a strategy, you gotta think about what other people might think. You gotta be like, ""Okay, what if this happens? What if that happens?"" And then you can adjust your strategy accordingly. It's like having a backup plan, you know? So, in conclusion, having a strategy before working hard is super important. It helps you figure out where you're going, how you're gonna get there, and how you're gonna measure your progress. And considering different opinions helps you make a better strategy that's more likely to lead to success. So, yeah, strategizing is pretty cool. 
Who knew?",hey so i got to write this essay about why it is important to have a strategy before working hard to achieve success at first i was like ugh do i really got to but then i was like wait this is actually kind of cool so here i am writing this essay about why strategizing is the bomb com first off let us talk about why working hard is important i mean it is like duh right you got to put in the effort if you want to achieve your goals but here is the thing working hard without a strategy is like running a marathon without a map you might be putting in a lot of effort but you are not going to get very far if you do not know where you are going see what i mean that is why it is important to have a strategy it is like a roadmap for your success it helps you figure out what you want to achieve how you are going to achieve it and how you are going to measure your progress without a strategy you are just kind of winging it and that is not always going to lead to success but here is the thing having a strategy do not mean you are going to succeed automatically it just means you have got a better chance of success and that is where considering different opinions comes in see when you are making a strategy you got to think about what other people might think you got to be like okay what if this happens what if that happens and then you can adjust your strategy accordingly it is like having a backup plan you know so in conclusion having a strategy before working hard is super important it helps you figure out where you are going how you are going to get there and how you are going to measure your progress and considering different opinions helps you make a better strategy that is more likely to lead to success so yeah strategizing is pretty cool who knew,1770,Task: Analyze why it is important to have an effective strategy before working hard to ensure success. 
Explore the importance of planning and considering different opinions rather than simply working hard and never giving up.,task analyze why it is important to have an effective strategy before working hard to ensure success explore the importance of planning and considering different opinions rather than simply working hard and never giving up,222,0.543792
5755,5755,0,persuade_corpus,na,"In the article ""Making Mona Lisa Smile,"" the author is decribing how a new technology: the Facial Action Coding System, is enabling computers to identify human emotions. The use of this technology: to read the emotional expressions of students, is not valuable. Even though human have simaliar muscles to show the same emotion each one does vary a great significance.\n\nIn paragraph 6 of the article Dr. Huang made a bold prediction ""A classroom computer could reconize when a student is becoming confused or bored,"" This means to him that a students emotions can be completly read by a computer scan. This doesn't seem logical in any means at all. Have you ever watched your classmates facial expression change from confused to them understanding completely in the blink of an eye? Im sure you have. If a computer reads your confused facial emotion in the middle of a lesson right before you understand it, it will change making you have to reset your brain to relearn what you were the the bridge of mastering.\n\nIn paragraph 5 a sentence contradicted the article ""Of course, most of us would have trouble actually describing each facial trait,"" if this is true then why would an artifical intelegence be able to read emotions better than a human who experiences the similar emotions? A real life example is when someone is really good at looking like they don't care. This is a hard face to read because some people just naturally have it at all times. With a piece of simple technology how would the scan be able to look past the same face with the same emotion at all times? It most likely couldnt and would be uneffectful in the end.\n\nIn paragraph 6 another outside of the box thought was put into words from Dr. Huang ""Most human communication is nonverbal, including emotional communication...So computers need to understand that, too,"" The problem with this is that computers don't understand things, they are programmed to do what they are told. 
Only alive things ""understand"" which means to learn. If we put a program in our computer to read emotions then the computer will most likely only read that exact facial expression.\n\nThe use of technology to read emotional facial expressions of students in a classroom is not valuable because it is to far fetched and unpractical. This is because we have teachers, emotions change constantly, emotions are sometimes hard to read, and computers can't ""understand"" how a human is feeling a all times. Maybe one day a computer will be able to read facial expression, but it will be a waste of time and money.","In the article ""Making Mona Lisa Smile,"" the author is decribing how a new technology: the Facial Action Coding System, is enabling computers to identify human emotions. The use of this technology: to read the emotional expressions of students, is not valuable. Even though human have simaliar muscles to show the same emotion each one does vary a great significance. In paragraph 6 of the article Dr. Huang made a bold prediction ""A classroom computer could reconize when a student is becoming confused or bored,"" This means to him that a students emotions can be completly read by a computer scan. This doesn't seem logical in any means at all. Have you ever watched your classmates facial expression change from confused to them understanding completely in the blink of an eye? Im sure you have. If a computer reads your confused facial emotion in the middle of a lesson right before you understand it, it will change making you have to reset your brain to relearn what you were the the bridge of mastering. In paragraph 5 a sentence contradicted the article ""Of course, most of us would have trouble actually describing each facial trait,"" if this is true then why would an artifical intelegence be able to read emotions better than a human who experiences the similar emotions? A real life example is when someone is really good at looking like they don't care. 
This is a hard face to read because some people just naturally have it at all times. With a piece of simple technology how would the scan be able to look past the same face with the same emotion at all times? It most likely couldnt and would be uneffectful in the end. In paragraph 6 another outside of the box thought was put into words from Dr. Huang ""Most human communication is nonverbal, including emotional communication...So computers need to understand that, too,"" The problem with this is that computers don't understand things, they are programmed to do what they are told. Only alive things ""understand"" which means to learn. If we put a program in our computer to read emotions then the computer will most likely only read that exact facial expression. The use of technology to read emotional facial expressions of students in a classroom is not valuable because it is to far fetched and unpractical. This is because we have teachers, emotions change constantly, emotions are sometimes hard to read, and computers can't ""understand"" how a human is feeling a all times. 
Maybe one day a computer will be able to read facial expression, but it will be a waste of time and money.",in the article making mona lisa smile the author is decribing how a new technology the facial action coding system is enabling computers to identify human emotions the use of this technology to read the emotional expressions of students is not valuable even though human have simaliar muscles to show the same emotion each one do vary a great significance in paragraph 6 of the article dr huang made a bold prediction a classroom computer could reconize when a student is becoming confused or bored this means to him that a students emotions can be completly read by a computer scan this do not seem logical in any means at all have you ever watched your classmates facial expression change from confused to them understanding completely in the blink of an eye i m sure you have if a computer reads your confused facial emotion in the middle of a lesson right before you understand it it will change making you have to reset your brain to relearn what you were the the bridge of mastering in paragraph 5 a sentence contradicted the article of course most of us would have trouble actually describing each facial trait if this is true then why would an artifical intelegence be able to read emotions better than a human who experiences the similar emotions a real life example is when someone is really good at looking like they do not care this is a hard face to read because some people just naturally have it at all times with a piece of simple technology how would the scan be able to look past the same face with the same emotion at all times it most likely could nt and would be uneffectful in the end in paragraph 6 another outside of the box thought was put into words from dr huang most human communication is nonverbal including emotional communication so computers need to understand that too the problem with this is that computers do not understand things they are programmed to 
do what they are told only alive things understand which means to learn if we put a program in our computer to read emotions then the computer will most likely only read that exact facial expression the use of technology to read emotional facial expressions of students in a classroom is not valuable because it is to far fetched and unpractical this is because we have teachers emotions change constantly emotions are sometimes hard to read and computers can not understand how a human is feeling a all times maybe one day a computer will be able to read facial expression but it will be a waste of time and money,2505,na,na,2,0.903675


In [32]:
%%time
df.to_parquet(f"output/preprocess.parquet", index=False)
assert df.notna().all(axis=None)

CPU times: user 524 ms, sys: 53.3 ms, total: 577 ms
Wall time: 581 ms


In [33]:
# Stop the timer that was started in the first cell and report tim.elapsed.
tim.stop()
print(f"Total time taken {tim.elapsed!s}")

Total time taken 0:38:52.335014
