In [1]:
import gc
import os
import pathlib
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable, Any
import scml
from scml import pandasx as pdx
tim = scml.Timer()
tim.start()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()

In [2]:
device=torch.device('cpu')
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda', 0)

In [3]:
sentences = [
    "Today the majority of humans own and operate cell phones on a daily basis. In essay form, explain if drivers should or should not be able to use cell phones in any capacity while operating a vehicle.",
    "Write an explanatory essay to inform fellow citizens about the advantages of limiting car usage. Your essay must be based on ideas and information that can be found in the passage set. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your essay in the space provided.",
    "Some schools require students to complete summer projects to assure they continue learning during their break. Should these summer projects be teacher-designed or student-designed? Take a position on this question. Support your response with reasons and specific examples.",
    "You have just read the article, 'A Cowboy Who Rode the Waves.' Luke's participation in the Seagoing Cowboys program allowed him to experience adventures and visit many unique places. Using information from the article, write an argument from Luke's point of view convincing others to participate in the Seagoing Cowboys program. Be sure to include: reasons to join the program; details from the article to support Luke's claims; an introduction, a body, and a conclusion to your essay.",
    "Your principal has decided that all students must participate in at least one extracurricular activity. For example, students could participate in sports, work on the yearbook, or serve on the student council. Do you agree or disagree with this decision? Use specific details and examples to convince others to support your position.",
    "In 'The Challenge of Exploring Venus,' the author suggests studying Venus is a worthy pursuit despite the dangers it presents. Using details from the article, write an essay evaluating how well the author supports this idea. Be sure to include: a claim that evaluates how well the author supports the idea that studying Venus is a worthy pursuit despite the dangers; an explanation of the evidence from the article that supports your claim; an introduction, a body, and a conclusion to your essay.",
    "In the article 'Making Mona Lisa Smile,' the author describes how a new technology called the Facial Action Coding System enables computers to identify human emotions. Using details from the article, write an essay arguing whether the use of this technology to read the emotional expressions of students in a classroom is valuable.",
    "You have read the article 'Unmasking the Face on Mars.' Imagine you are a scientist at NASA discussing the Face with someone who thinks it was created by aliens. Using information in the article, write an argumentative essay to convince someone that the Face is just a natural landform.Be sure to include: claims to support your argument that the Face is a natural landform; evidence from the article to support your claims; an introduction, a body, and a conclusion to your argumentative essay.",
    "Some of your friends perform community service. For example, some tutor elementary school children and others clean up litter. They think helping the community is very important. But other friends of yours think community service takes too much time away from what they need or want to do. Your principal is deciding whether to require all students to perform community service. Write a letter to your principal in which you take a position on whether students should be required to perform community service. Support your position with examples.",
    "Your principal is considering changing school policy so that students may not participate in sports or other activities unless they have at least a grade B average. Many students have a grade C average. She would like to hear the students' views on this possible policy change. Write a letter to your principal arguing for or against requiring at least a grade B average to participate in sports or other activities. Be sure to support your arguments with specific reasons.",
    "In the article 'Driverless Cars are Coming,' the author presents both positive and negative aspects of driverless cars. Using details from the article, create an argument for or against the development of these cars. Be sure to include: your position on driverless cars; appropriate details from the article that support your position; an introduction, a body, and a conclusion to your argumentative essay.",
    "Write a letter to your state senator in which you argue in favor of keeping the Electoral College or changing to election by popular vote for the president of the United States. Use the information from the texts in your essay. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to include a claim; address counterclaims; use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your response in the space provided.",
    "Your principal is reconsidering the school's cell phone policy. She is considering two possible policies: Policy 1: Allow students to bring phones to school and use them during lunch periods and other free times, as long as the phones are turned off during class time. Policy 2: Do not allow students to have phones at school at all. Write a letter to your principal convincing her which policy you believe is better. Support your position with specific reasons.",
    "Some schools offer distance learning as an option for students to attend classes from home by way of online or video conferencing. Do you think students would benefit from being able to attend classes from home? Take a position on this issue. Support your response with reasons and examples.",
    "When people ask for advice, they sometimes talk to more than one person. Explain why seeking multiple opinions can help someone make a better choice. Use specific details and examples in your response.",
]
sentences = [x.lower() for x in sentences]

In [4]:
df = pd.read_parquet("input/preprocess.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221612 entries, 0 to 221611
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   row_id          221612 non-null  int32 
 1   essay_id        221612 non-null  object
 2   generated       221612 non-null  int8  
 3   source          221612 non-null  object
 4   prompt          221612 non-null  object
 5   text            221612 non-null  object
 6   text_bsc        221612 non-null  object
 7   text_bow        221612 non-null  object
 8   text_bow_len    221612 non-null  int16 
 9   prompt_bsc      221612 non-null  object
 10  prompt_bow      221612 non-null  object
 11  prompt_bow_len  221612 non-null  int16 
dtypes: int16(2), int32(1), int8(1), object(8)
memory usage: 15.4+ MB


In [5]:
min_len = 4
col = "prompt_bow"
sentences = set(sentences)
sentences |= set(df[df[col].str.len()>=min_len][col])
sentences = list(sentences)
print(f"{len(sentences)} sentences\n{sentences[:20]}")

5162 sentences
['convert the following summary back into the original text the essay discusses the legality of abortion in the united states and how it is protected by the constitution the writer argues that women have a right to abortion because it is a personal decision and not a crime', 'there is little justification for society to make extraordinaiy efforts specially at a great cost in money and jobs to save endangered animal or plant species do you agree or disagree use specific reasons and examples to support your answer write an essay of roughly 500 words', 'write a essay that could have provided the following summary the purpose of this report is to analyze the case of the low fares airline ryanair and find out the answer to six specific questions from the case the report focuses on the key success factors the prospect of ryanair in the long haul routes the strategic leadership of michael o leary the environmental situation over recent years sustainability of ryanair s strategy

In [6]:
%%time
model = SentenceTransformer("huggingface/sentence-transformers/all-mpnet-base-v2", device=device)
model.max_seq_length = 384
em = model.encode(sentences=sentences, batch_size=512, show_progress_bar=True, convert_to_numpy=True)
print(f"em={em.shape}")

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

em=(5162, 768)
CPU times: user 1.69 s, sys: 1.33 s, total: 3.02 s
Wall time: 15.8 s


In [7]:
faiss.normalize_L2(em)

In [8]:
%%time
d = em.shape[1]
index = faiss.IndexFlatIP(d)  # InnerProduct
index.verbose = True
index.train(em)
index.add(em)

CPU times: user 672 µs, sys: 1.18 ms, total: 1.86 ms
Wall time: 1.74 ms


In [9]:
%%time
faiss.write_index(index, "output/prompts.index")

CPU times: user 136 µs, sys: 1.36 ms, total: 1.5 ms
Wall time: 2.65 ms


In [10]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:00:18.242698
