In [1]:
import gc
import os
import pathlib
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable, Any
import scml
from scml import pandasx as pdx
# Notebook-wide wall-clock timer; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()

# Set before any tokenizer/model is created in this notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentile grid passed to `describe()` further down.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# pandas behaviour/display options, applied in the listed order.
# NOTE(review): `use_inf_as_na` is deprecated in recent pandas releases —
# confirm against the pandas version this notebook is pinned to.
for _option, _value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(_option, _value)

# Register tqdm's pandas integration, then seed via the project helper.
tqdm.pandas()
scml.seed_everything()

In [2]:
# Essay-prompt texts that seed the embedding corpus. A later cell appends essay
# texts to this same list in place (`sentences += ...`), so re-running that cell
# without re-running this one makes the list grow again.
sentences = [
    "Today the majority of humans own and operate cell phones on a daily basis. In essay form, explain if drivers should or should not be able to use cell phones in any capacity while operating a vehicle.",
    "Write an explanatory essay to inform fellow citizens about the advantages of limiting car usage. Your essay must be based on ideas and information that can be found in the passage set. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your essay in the space provided.",
    "Some schools require students to complete summer projects to assure they continue learning during their break. Should these summer projects be teacher-designed or student-designed? Take a position on this question. Support your response with reasons and specific examples.",
    "You have just read the article, 'A Cowboy Who Rode the Waves.' Luke's participation in the Seagoing Cowboys program allowed him to experience adventures and visit many unique places. Using information from the article, write an argument from Luke's point of view convincing others to participate in the Seagoing Cowboys program. Be sure to include: reasons to join the program; details from the article to support Luke's claims; an introduction, a body, and a conclusion to your essay.",
    "Your principal has decided that all students must participate in at least one extracurricular activity. For example, students could participate in sports, work on the yearbook, or serve on the student council. Do you agree or disagree with this decision? Use specific details and examples to convince others to support your position.",
    "In 'The Challenge of Exploring Venus,' the author suggests studying Venus is a worthy pursuit despite the dangers it presents. Using details from the article, write an essay evaluating how well the author supports this idea. Be sure to include: a claim that evaluates how well the author supports the idea that studying Venus is a worthy pursuit despite the dangers; an explanation of the evidence from the article that supports your claim; an introduction, a body, and a conclusion to your essay.",
    "In the article 'Making Mona Lisa Smile,' the author describes how a new technology called the Facial Action Coding System enables computers to identify human emotions. Using details from the article, write an essay arguing whether the use of this technology to read the emotional expressions of students in a classroom is valuable.",
    "You have read the article 'Unmasking the Face on Mars.' Imagine you are a scientist at NASA discussing the Face with someone who thinks it was created by aliens. Using information in the article, write an argumentative essay to convince someone that the Face is just a natural landform.Be sure to include: claims to support your argument that the Face is a natural landform; evidence from the article to support your claims; an introduction, a body, and a conclusion to your argumentative essay.",
    "Some of your friends perform community service. For example, some tutor elementary school children and others clean up litter. They think helping the community is very important. But other friends of yours think community service takes too much time away from what they need or want to do. Your principal is deciding whether to require all students to perform community service. Write a letter to your principal in which you take a position on whether students should be required to perform community service. Support your position with examples.",
    "Your principal is considering changing school policy so that students may not participate in sports or other activities unless they have at least a grade B average. Many students have a grade C average. She would like to hear the students' views on this possible policy change. Write a letter to your principal arguing for or against requiring at least a grade B average to participate in sports or other activities. Be sure to support your arguments with specific reasons.",
    "In the article 'Driverless Cars are Coming,' the author presents both positive and negative aspects of driverless cars. Using details from the article, create an argument for or against the development of these cars. Be sure to include: your position on driverless cars; appropriate details from the article that support your position; an introduction, a body, and a conclusion to your argumentative essay.",
    "Write a letter to your state senator in which you argue in favor of keeping the Electoral College or changing to election by popular vote for the president of the United States. Use the information from the texts in your essay. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to include a claim; address counterclaims; use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your response in the space provided.",
    "Your principal is reconsidering the school's cell phone policy. She is considering two possible policies: Policy 1: Allow students to bring phones to school and use them during lunch periods and other free times, as long as the phones are turned off during class time. Policy 2: Do not allow students to have phones at school at all. Write a letter to your principal convincing her which policy you believe is better. Support your position with specific reasons.",
    "Some schools offer distance learning as an option for students to attend classes from home by way of online or video conferencing. Do you think students would benefit from being able to attend classes from home? Take a position on this issue. Support your response with reasons and examples.",
    "When people ask for advice, they sometimes talk to more than one person. Explain why seeking multiple opinions can help someone make a better choice. Use specific details and examples in your response.",
]

In [3]:
# Load the preprocessed corpus and print its column schema.
preprocess_path = "input/preprocess.parquet"
df = pd.read_parquet(preprocess_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221612 entries, 0 to 221611
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   row_id          221612 non-null  int32 
 1   essay_id        221612 non-null  object
 2   generated       221612 non-null  int8  
 3   source          221612 non-null  object
 4   prompt          221612 non-null  object
 5   text            221612 non-null  object
 6   text_bsc        221612 non-null  object
 7   text_bow        221612 non-null  object
 8   text_bow_len    221612 non-null  int16 
 9   prompt_bsc      221612 non-null  object
 10  prompt_bow      221612 non-null  object
 11  prompt_bow_len  221612 non-null  int16 
dtypes: int16(2), int32(1), int8(1), object(8)
memory usage: 15.4+ MB


In [4]:
# Tabulate rows per `source` using the project helper; the rendered table
# shows a `count` and a `percent` column for each source value.
pdx.value_counts(df["source"])

Unnamed: 0_level_0,count,percent
source,Unnamed: 1_level_1,Unnamed: 2_level_1
qwedsacf/ivypanda-essays,124725,0.562808
persuade_corpus,25793,0.116388
nid989/EssayFroum-Dataset,23914,0.107909
dim/essayforum_raw_writing_10k,18314,0.08264
sakibsh-llm-human-1,2459,0.011096
mistral7binstruct_v1,2420,0.01092
llama2_chat,2418,0.010911
mistral7binstruct_v2,2409,0.01087
original_moth,2264,0.010216
chat_gpt_moth,2124,0.009584


In [5]:
# Sources allowed into the "white" reference set.
white = ["persuade_corpus", "train_essays"]
# Minimum `text_bow_len` for inclusion. The original note reads "P20" —
# presumably the 20th percentile of that column; TODO confirm.
min_len = 1394

# Keep rows with generated == 0, a white-listed source, and enough length.
not_generated = df["generated"].eq(0)
from_white_source = df["source"].isin(white)
long_enough = df["text_bow_len"].ge(min_len)
tmp = df.loc[not_generated & from_white_source & long_enough]
total = tmp.shape[0]

# Extends the existing prompt list in place, so this cell is not idempotent:
# running it twice appends the same texts twice.
sentences.extend(tmp["text_bow"].tolist())
print(f"{len(sentences):,} sentences")

21,213 sentences


# Build Index

In [6]:
%%time
model = SentenceTransformer("huggingface/sentence-transformers/all-mpnet-base-v2", device=torch.device("mps"))
model.max_seq_length = 384
em = model.encode(sentences=sentences, batch_size=512, show_progress_bar=True, convert_to_numpy=True)
print(f"em={em.shape}")

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

em=(21213, 768)
CPU times: user 18 s, sys: 3.32 s, total: 21.3 s
Wall time: 6min 19s


In [7]:
# L2-normalise the embedding rows in place (no return value is used), so the
# inner-product index built in the next cell scores by cosine similarity.
faiss.normalize_L2(em)

In [8]:
%%time
d = em.shape[1]
index = faiss.IndexFlatIP(d)  # InnerProduct
index.verbose = True
index.train(em)
index.add(em)

CPU times: user 2.81 ms, sys: 7.56 ms, total: 10.4 ms
Wall time: 9.79 ms


In [9]:
%%time
faiss.write_index(index, "output/white.index")

CPU times: user 257 µs, sys: 6.91 ms, total: 7.17 ms
Wall time: 9.02 ms


# Sanity Check

In [10]:
%%time
index = faiss.read_index("output/white.index")
print(f"ntotal={index.ntotal}, is_trained={index.is_trained}")

ntotal=21213, is_trained=True
CPU times: user 1.46 ms, sys: 10.4 ms, total: 11.8 ms
Wall time: 10.9 ms


In [11]:
# Self-retrieval check: query the index with the first 20 embedding rows and
# print the top-1 ids and scores.
# Assumes `em` still holds the vectors that were added to the index.
k = 1
probe = em[:20]
distances, ids = index.search(probe, k)
report = f"I={repr(ids)}\nD={repr(distances)}"
print(report)

I=array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12],
       [13],
       [14],
       [15],
       [16],
       [17],
       [18],
       [19]])
D=array([[1.0000002 ],
       [0.99999994],
       [1.0000006 ],
       [0.99999994],
       [1.0000002 ],
       [1.0000001 ],
       [1.0000004 ],
       [1.0000005 ],
       [1.0000002 ],
       [1.0000001 ],
       [1.        ],
       [1.0000007 ],
       [1.        ],
       [1.        ],
       [1.0000002 ],
       [1.0000005 ],
       [1.        ],
       [1.        ],
       [0.9999999 ],
       [0.9999997 ]], dtype=float32)


In [12]:
# Short keyword probes; each is scored against its single nearest neighbour in
# the index, and the (score, query) pairs are printed in ascending order.
queries = [
    "phone",
    "mars",
    "venus",
    "school projects",
    "school sports",
    "school activities",
    "cowboy",
    "facial emotion",
    "planets",
    "community service",
    "cars",
    "automobile",
    "tesla",
    "school policy",
    "politician",
    "government",
    "activist",
    "asking for advice",
    "distance learning",
    "learning outside the classroom",
    "baking class",
]
q = model.encode(
    sentences=queries,
    batch_size=512,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"q={q.shape}")
# Unit-normalise in place, matching how the indexed vectors were prepared.
faiss.normalize_L2(q)
k = 1
# Flatten both result arrays to 1-D (one entry per query when k == 1).
distances, ids = (result.flatten() for result in index.search(q, k))
# Tuples sort by score first, then by query text on ties.
scores = sorted(zip(distances, queries))
print(scores)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

q=(21, 768)
[(0.30992886, 'baking class'), (0.32608566, 'politician'), (0.3462901, 'activist'), (0.3492005, 'government'), (0.37505153, 'school projects'), (0.46297583, 'cowboy'), (0.48327643, 'cars'), (0.50360274, 'automobile'), (0.5079346, 'planets'), (0.5115985, 'tesla'), (0.52337724, 'mars'), (0.53983575, 'phone'), (0.55978537, 'community service'), (0.5970917, 'school sports'), (0.611063, 'learning outside the classroom'), (0.66486305, 'distance learning'), (0.6751362, 'school policy'), (0.6792052, 'venus'), (0.6867367, 'school activities'), (0.69434464, 'facial emotion'), (0.7366042, 'asking for advice')]


# Train/Validation Split: Similarity to the White Index

In [13]:
%%time
em = model.encode(sentences=df["text_bow"].tolist(), batch_size=512, show_progress_bar=True, convert_to_numpy=True)
faiss.normalize_L2(em)
print(f"em={em.shape}")

Batches:   0%|          | 0/433 [00:00<?, ?it/s]

em=(221612, 768)
CPU times: user 5min 59s, sys: 2min 4s, total: 8min 3s
Wall time: 1h 11min 51s


In [14]:
%%time
k=1
distances, ids = index.search(em, k)  
col = "white_sim"
df[col] = distances.flatten().tolist()
df[col] = df[col].astype(np.float32)
df[col].describe(percentiles=percentiles)

CPU times: user 6.62 s, sys: 268 ms, total: 6.89 s
Wall time: 2.69 s


count    221612.000000
mean          0.523406
std           0.209795
min           0.105735
1%            0.232241
5%            0.292006
10%           0.322312
20%           0.361979
30%           0.394710
40%           0.427943
50%           0.464264
60%           0.504922
70%           0.551792
80%           0.627514
90%           0.939899
95%           1.000000
99%           1.000000
max           1.000001
Name: white_sim, dtype: float64

# Review Data

In [15]:
# Re-print the schema; the rendered output now lists the added `white_sim` column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221612 entries, 0 to 221611
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   row_id          221612 non-null  int32  
 1   essay_id        221612 non-null  object 
 2   generated       221612 non-null  int8   
 3   source          221612 non-null  object 
 4   prompt          221612 non-null  object 
 5   text            221612 non-null  object 
 6   text_bsc        221612 non-null  object 
 7   text_bow        221612 non-null  object 
 8   text_bow_len    221612 non-null  int16  
 9   prompt_bsc      221612 non-null  object 
 10  prompt_bow      221612 non-null  object 
 11  prompt_bow_len  221612 non-null  int16  
 12  white_sim       221612 non-null  float32
dtypes: float32(1), int16(2), int32(1), int8(1), object(8)
memory usage: 16.3+ MB


In [16]:
# Min / median / max nearest-neighbour similarity for each data source.
sim_by_source = df.groupby(["source"])["white_sim"]
sim_by_source.agg(["min", "median", "max"])

Unnamed: 0_level_0,min,median,max
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ChristophSchuhmann/essays-with-instructions,0.171455,0.39558,0.80927
argu-gpt-3.5-turbo,0.339927,0.548457,0.876333
argu-gpt2-xl,0.280246,0.528273,0.825548
argu-text-babbage-001,0.2819,0.529825,0.825246
argu-text-curie-001,0.239317,0.537127,0.847615
argu-text-davinci-001,0.251494,0.524359,0.865054
argu-text-davinci-002,0.27519,0.529928,0.897706
argu-text-davinci-003,0.314692,0.53824,0.864422
chat_gpt_moth,0.242149,0.602758,0.956646
darragh_claude_v6,0.752669,0.913358,0.97417


In [17]:
# Min / median / max nearest-neighbour similarity for each `generated` value.
sim_by_generated = df.groupby(["generated"])["white_sim"]
sim_by_generated.agg(["min", "median", "max"])

Unnamed: 0_level_0,min,median,max
generated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.105735,0.451824,1.000001
1,0.184968,0.625994,1.000001


In [18]:
%%time
df.to_parquet(f"output/white.parquet", index=False)
assert df.notna().all(axis=None)

CPU times: user 8.35 s, sys: 1.64 s, total: 9.99 s
Wall time: 12.5 s


In [19]:
# Stop the notebook-wide timer started in the first cell and print the elapsed time.
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 1:18:28.928694
