In [1]:
import os
import json
import gc
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scml
from scml import pandasx as pdx
from mylib.ner import NerDataset
# Start a notebook-wide timer; the final cell stops it and prints the elapsed time.
tim = scml.Timer()
tim.start()

# Tokenizer parallelism switch, set to "false" before any later cell runs.
# NOTE(review): presumably read by the HuggingFace `tokenizers` library — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Quantile cut points; not referenced by the cells visible here.
# NOTE(review): presumably meant for `describe(percentiles=...)` — confirm.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Raise pandas display limits so wide frames and long text cells are not truncated.
# Fully-qualified option names are used on purpose: abbreviated names rely on
# pandas' pattern matching and can become ambiguous when new options are added.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()

# NOTE(review): called without arguments, so whatever default seed `scml` uses
# applies — confirm which RNGs it seeds.
scml.seed_everything()

# Print the representable int16 range for reference.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
# Read the pii_gemini_v1 CSV into memory. `low_memory=False` makes pandas parse
# the file in one pass instead of in chunks.
df = pd.read_csv(
    filepath_or_buffer="input/newtonbaba/pii_gemini_v1.csv",
    low_memory=False,
)

# Print the column names, non-null counts, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3779 entries, 0 to 3778
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      3779 non-null   int64 
 1   Essay           3779 non-null   object
 2   EMAIL           3779 non-null   object
 3   USERNAME        3779 non-null   object
 4   ID_NUM          3779 non-null   object
 5   PHONE_NUM       3779 non-null   object
 6   URL_PERSONAL    3779 non-null   object
 7   STREET_ADDRESS  3779 non-null   object
dtypes: int64(1), object(7)
memory usage: 236.3+ KB


In [4]:
# Show 10 randomly drawn rows for a quick visual check. `sample(10)` already
# returns exactly 10 rows, so the former trailing `.head(10)` was a no-op and
# has been removed.
# NOTE(review): no `random_state` is passed, so the rows shown depend on the
# global RNG state at the time the cell runs — confirm that is intended.
df.sample(10)

Unnamed: 0.1,Unnamed: 0,Essay,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS
968,1146,"In the realm of academia, as a dedicated scholar seeking to unveil the intricacies of modern society, I, Brian Douglas, embark on a literary journey that delves into the kaleidoscope of experiences that shape the human condition. My pursuit of knowledge has been an ongoing endeavor, intertwined with my personal and educational experiences.\n\nAs I navigate the corridors of higher learning, I carry with me the insights I have gleaned from my interactions with the world around me. My email address, myersmanuel, serves as a gateway to my digital footprint, where I share my thoughts and engage in intellectual discourse. The unique identifier 5M 46794 assigned to me by the institution connects me to a network of scholars and resources that empower my academic growth.\n\nBeyond the confines of the classroom, I maintain open lines of communication through my phone numbers, 603.910.6095 and +1-801-791-6017x346. These digits provide a lifeline to my support system, allowing me to stay connected with family, friends, and mentors who inspire me on my intellectual journey.\n\nVenturing into the digital realm, I share glimpses of my personal life on Facebook, where I find solace in connecting with like-minded individuals under the handle @dylan50. Through this online platform, I engage in thought-provoking discussions and share my perspectives on social issues, fostering a sense of community and shared purpose.\n\nAs I continue my academic explorations, the address 722 Cheryl Green, Barbaraport, OK 08532 serves as my sanctuary, providing a tranquil haven where I can delve into the depths of my research. It is within the confines of my home that I find solace, inspiration, and the unwavering support of those who believe in my dreams.\n\nIn conclusion, my identity as a scholar is inextricably linked to the personal experiences that have shaped me. 
The information provided—my name, email, ID number, phone numbers, social media handles, and street address—forms a mosaic that illuminates the multifaceted nature of my being. As I continue to explore the complexities of the human condition, I am grateful for the opportunities that have been bestowed upon me and the unwavering support of those who stand by my side.",['sbowman@yahoo.com'],['myersmanuel'],['5M 46794'],"['603.910.6095', '+1-801-791-6017x346']",['https://facebook.com/dylan50'],"['722 Cheryl Green\nBarbaraport, OK 08532']"
2085,2518,"The advent of technology has revolutionized the way we live, communicate, and learn. For students like myself, Sean Johnson, accessing educational resources has become more convenient than ever before. With just a few clicks, I can delve into online lectures, engage in virtual discussions, and collaborate with classmates from the comfort of my home at 285 Gay Hollow, Jeffreyville, AL 66953.\n\nThe internet has empowered me to transcend geographical barriers and connect with experts and peers from around the globe. Through platforms like GitHub (https://github.com/nicolas61), I can share my projects, seek feedback, and contribute to open-source initiatives. This global connectivity has broadened my perspective and fostered a sense of community among individuals with shared interests.\n\nMoreover, the accessibility of online resources has democratized education, making it more inclusive and equitable. Students with diverse backgrounds and circumstances can now access the same high-quality educational content and support services. Whether I am studying for an exam at my other residence at 3087 Lisa Meadow, Johnsonbury, MP 34409, or seeking guidance from online tutors, technology has leveled the playing field for all learners.\n\nThe convenience and flexibility of online learning have also enhanced my work-life balance. As a part-time student, I can schedule my study sessions around my work commitments. This flexibility allows me to pursue my educational goals without sacrificing my professional aspirations. Additionally, the ability to access course materials anytime, anywhere has facilitated my learning process and enabled me to make the most of my limited study time.\n\nIt is important to note that while technology has undoubtedly enhanced the student experience, it also comes with certain challenges. Issues with internet connectivity or access to devices can hinder our ability to participate effectively in online courses. 
Additionally, the lack of face-to-face interactions can make it difficult to build rapport with classmates and instructors.\n\nTo mitigate these challenges, reliable internet access should be ensured for all students. Furthermore, educational institutions should explore innovative ways to foster a sense of community and collaboration among online learners. By addressing these issues, we can create a truly inclusive and effective virtual learning environment.\n\nIn conclusion, the advent of technology has transformed the learning landscape, providing students with unprecedented access to educational resources and opportunities. While challenges remain, the benefits of online learning far outweigh the drawbacks. With continued advancements in technology and a commitment to equity and inclusion, we can harness the power of the internet to empower students like myself and create a more just and accessible education system for all.",['scott02@gmail.com'],['michaelcollins'],['B94 0KQ'],['960.400.9524x48294'],['https://github.com/nicolas61'],"['285 Gay Hollow\nJeffreyville, AL 66953', '3087 Lisa Meadow\nJohnsonbury, MP 34409']"
2100,2535,"In the realm of education, the pursuit of knowledge and personal growth intertwines with the unique experiences of each individual. As a testament to this journey, I, [NAME_STUDENT], embark on a reflective exploration of my own educational path, guided by the thread of personal information woven into the fabric of my academic life.\n\nMy academic identity finds its roots in a series of identifiers that both connect and distinguish me. My student ID, CXDC08363636161801, serves as a unique code that marks my presence within the institution. The phone number +1-433-857-3360x37222 is a lifeline that bridges the gap between my academic and personal spheres, allowing for seamless communication. My username, zellis, becomes an avatar in the digital landscape, representing my online presence and connecting me with a vast network of educators and peers.\n\nMy physical address, 90469 Margaret Inlet Suite 045, East Benjaminmouth, VT 31205, serves as the anchor of my academic and personal life. It is a haven where I find solitude for study and inspiration for intellectual exploration. Within these walls, the boundaries between learning and living blur, creating a space where knowledge and personal growth are inseparable.\n\nThese identifiers, while distinct in nature, form a constellation that illuminates my educational journey. They are not mere labels but rather fragments of a narrative that is both personal and profound. Each piece of information contributes to the tapestry of my academic experience, weaving together the threads of my identity, my aspirations, and my connection to the broader educational community.\n\nThrough this exploration, I have come to appreciate the intricate interplay between my personal attributes and my academic pursuits. The pursuit of knowledge is not a solitary endeavor but one that is deeply intertwined with the fabric of our lives. 
As we navigate the complexities of the educational landscape, it is essential to embrace the personal aspects that shape our journey. By doing so, we not only enhance our academic experience but also lay the foundation for a more fulfilling and meaningful life beyond the classroom.",['fernandezamber@yahoo.com'],[],['CXDC08363636161801'],['+1-433-857-3360x37222'],['https://linkedin.com/in/zellis'],"['90469 Margaret Inlet Suite 045\nEast Benjaminmouth, VT 31205']"
407,488,"**The Importance of Education: A Personal Reflection**\n\nDr. Stacy Tapia MD, a highly esteemed physician, has long recognized the transformative power of education. Her journey through academia has been marked by a deep commitment to learning and a belief in its ability to empower individuals and shape societies.\n\nAs a child growing up in Armstrongville, MT, Dr. Tapia was instilled with a love of knowledge and a thirst for inquiry. The cozy confines of her home at 145 Kelly Junction served as a hub for intellectual exploration, where she spent countless hours poring over books and engaging in stimulating conversations.\n\nHer passion for learning propelled her to pursue higher education, leading her to embark on a rigorous academic path. With unwavering determination, she navigated the complexities of her undergraduate and medical studies, all while maintaining an exceptional GPA. The challenges she encountered along the way only served to strengthen her resolve, as she believed that education was not merely an accumulation of facts but a profound journey of personal growth and transformation.\n\nThroughout her journey, Dr. Tapia has maintained an active online presence, utilizing social media platforms such as Instagram and LinkedIn to connect with fellow professionals and share her insights on healthcare. Her engagement with online communities has allowed her to expand her knowledge base, engage in thought-provoking discussions, and foster a sense of collaboration among healthcare professionals.\n\nHer dedication to education extends beyond her own personal pursuits. As a compassionate physician, Dr. Tapia understands the vital role that access to quality education plays in improving the well-being of individuals and communities. 
She frequently volunteers at local health clinics, providing medical care to underserved populations and advocating for their right to accessible and affordable healthcare.\n\nThrough her unwavering commitment to education, Dr. Tapia has not only realized her own potential but has also dedicated her life to empowering others. She serves as an inspiration to countless individuals, demonstrating that through perseverance, passion, and a deep belief in the power of education, we can unlock our potential and make a meaningful contribution to society.","['amandagarner@gmail.com', 'ghorton@gmail.com']",[],"['B06952210', '138-45-4182']",['905.944.3883x5649'],"['https://instagram.com/joshuahanson', 'https://linkedin.com/in/madisonlane']","['145 Kelly Junction\nArmstrongville, MT 95505', 'USCGC Hayes\nFPO AE 43879']"
3528,4215,"**Adapting to the Challenges of Being a Non-Traditional Student**\n\nAs I navigate the complexities of adulthood, I find myself embracing a unique journey as a non-traditional student. Juggling the responsibilities of a budding academic career with personal commitments has presented me with an array of challenges that have shaped my growth.\n\nMy path has been one of perseverance and adaptation. As Dawn Lindsey, I have had to balance the demands of work and family while pursuing my education. The days are long and the nights are short, but I am determined to make the most of this opportunity. I am grateful for the flexibility offered by online learning, which allows me to complete coursework at my own pace.\n\nAs christopher34 on social media platforms, I have found support and inspiration from a community of fellow non-traditional students. The encouragement and camaraderie I have experienced online has been invaluable. Sharing strategies for time management, navigating financial aid, and overcoming academic hurdles has strengthened my resolve to succeed.\n\nThe pursuit of knowledge has always been a passion of mine. At age 37, I chose to leave my comfortable career and embark on this educational adventure. Driven by a desire to expand my horizons and make a meaningful contribution to my community, I enrolled at the University of Phoenix with the student ID number E81273038.\n\nMy commitment to my studies is unwavering. I dedicate countless hours to attending virtual classes, completing assignments, and participating in online discussions. Despite the obstacles I face, I am determined to make the most of this experience. The sacrifices I am making now will undoubtedly bear fruit in the years to come.\n\nAs a non-traditional student, I have learned the importance of time management and self-discipline. My days are meticulously scheduled, allowing me to juggle my various responsibilities effectively. 
I have also developed strong organizational and communication skills, necessary for success in both my academic and professional endeavors.\n\nThe road ahead may present further challenges, but I am confident that the lessons I have learned and the support I have gained will see me through. With every chapter I complete, I move closer to my goal of completing my education. I am proud of the path I have chosen, and I am determined to make the most of this unique opportunity.",['anthony44@yahoo.com'],['christopher34'],['E81273038'],['+1-378-684-6847'],['https://twitter.com/heather73'],"['8793 Cherry Vista\nNew Isaacfort, HI 02285', '9984 Christine Groves Apt. 830\nSwansonside, MN 31467']"
1481,1794,"In the bustling metropolis of academia, amidst the myriad students navigating the intricacies of higher education, stands Jennifer Mason, a young woman whose unwavering determination and pursuit of knowledge have led her to this esteemed institution. Known by her online moniker of ""santiagomatthew,"" Jennifer bears the unique identifier of OOGK24556971271465, a secret code that grants her access to the vast repositories of knowledge that await her within these hallowed halls.\n\nThrough her deft use of the communication channels at her disposal, Jennifer maintains a vibrant online presence. Her YouTube channel, ""https://youtube.com/c/ahenderson,"" serves as a testament to her insatiable curiosity and her desire to share her insights with the broader community. With every post and comment, she weaves her words into a tapestry of intellectual discourse, engaging with fellow students and scholars alike.\n\nBeyond the digital realm, Jennifer's physical presence is nestled within the cozy confines of Unit 8026 Box 8411, located within the confines of DPO AA 33049. This humble abode provides a sanctuary for her to retreat into the depths of study, where the rustling of pages and the gentle tapping of keys accompany her journey towards enlightenment.\n\nAs Jennifer navigates the complexities of academic life, she remains steadfast in her commitment to excellence. The phone number 393-304-3177 serves as a lifeline, connecting her to a network of mentors, classmates, and resources that support her every step of the way. It is through these connections that she discovers opportunities for collaboration, engages in thought-provoking discussions, and gains invaluable feedback on her intellectual endeavors.\n\nDriven by a deep-seated passion for learning, Jennifer Mason embarks upon this academic voyage with an unwavering spirit. 
As she delves into the mysteries of knowledge, her thirst for enlightenment will guide her path, propelling her towards a future filled with boundless possibilities. In the annals of academia, her name will forever be etched as a testament to her determination, curiosity, and the transformative power of education.",['dmccarthy@yahoo.com'],['santiagomatthew'],['OOGK24556971271465'],['393-304-3177'],['https://youtube.com/c/ahenderson'],['Unit 8026 Box 8411\nDPO AA 33049']
1538,1858,"In the bustling halls of academia, amidst a tapestry of diverse perspectives, there emerges the distinctive narrative of Emily, a student whose journey is etched upon the annals of educational excellence. With a student ID number of 1XPI149, Emily navigates the corridors of knowledge, her path illuminated by an unyielding thirst for intellectual enrichment.\n\nBeyond the confines of the classroom, Emily's digital footprint extends across multiple platforms. As username ""keithangela"" on one platform, ""scott57"" on another, and ""amy78"" on yet another, she engages in vibrant online communities, exchanging ideas and fostering connections. Her personal website, https://instagram.com/lori45, showcases her passions and interests, inviting others to delve into her world.\n\nThrough the symphony of communication channels, Emily's voice resonates both near and far. Her phone number, (432)536-4103, echoes with the sound of thoughtful inquiries and spirited discussions. The international line, +1-488-948-2837x3003, connects her to a global network of scholars and mentors, expanding her horizons beyond geographical boundaries.\n\nEmily's physical presence finds its abode at 88859 Jessica Garden in Brownfurt, VI 56005. Amidst the tranquility of her home, she finds solace in the pursuit of knowledge, surrounded by books and the gentle hum of inspiration. Her previous addresses, 67952 Quinn Lights Suite 713 in Reyesborough, MP 14684; 796 Cassandra Camp Suite 721 in Carrview, WY 06828; and 9236 Steven Run in New Howard, NY 14093, have all played a part in shaping her intellectual journey.\n\nEmily's story is a testament to the transformative power of education. It is a narrative that unfolds in the classrooms, the virtual realm, and the spaces she inhabits. 
Through her unwavering pursuit of knowledge and her active engagement in diverse communities, Emily embodies the essence of a true scholar, leaving an enduring legacy upon the tapestry of academia.","['thomasmark@hotmail.com', 'lbutler@hotmail.com']","['keithangela', 'scott57', 'amy78']","['1XPI149', 'A28975427', 'GB09OTZX39629068520899']","['(432)536-4103', '+1-488-948-2837x3003']",['https://instagram.com/lori45'],"['88859 Jessica Garden\nBrownfurt, VI 56005', '67952 Quinn Lights Suite 713\nReyesborough, MP 14684', '796 Cassandra Camp Suite 721\nCarrview, WY 06828', '9236 Steven Run\nNew Howard, NY 14093']"
3114,3727,"In the realm of higher education, students like Gregory Mayer, with the username andrew90, embark on a quest for knowledge and personal growth. Each individual possesses a unique tapestry of experiences and perspectives, and Gregory's journey is no exception.\n\nAs fate would have it, Gregory's path has intertwined with a myriad of addresses, including 4110 Juan Cliff Suite 847 in Briannahaven, IN 37622; 0743 Rowland Flats Suite 013 in South Reginald, NM 77475; 84469 Curtis Rue Apt. 736 in Lake Amy, MP 44194; and finally, 833 Dana Walk Suite 271 in New Matthewtown, NM 36447. These diverse locations have undoubtedly shaped his worldview and enriched his understanding of the world.\n\nCommunication plays a vital role in the student experience, and Gregory is no stranger to the digital realm. His presence on LinkedIn, accessible through the URL https://linkedin.com/in/oliversarah, provides a glimpse into his professional aspirations and network. Moreover, his phone number, 363-954-1689x74241, serves as a conduit for both personal and academic connections.\n\nWithin the academic landscape, Gregory's ID number, IQ2 3404, serves as a unique identifier, reflecting his status as a valued member of the student body. This numerical designation grants him access to a plethora of resources, from course materials to campus facilities.\n\nAs Gregory traverses the corridors of academic life, he carries with him a wealth of knowledge and a deep-seated desire to make a meaningful contribution to society. His experiences, both within and beyond the classroom, have molded him into an exceptional student, poised to leave an enduring mark on the world. 
And as he continues his educational journey, Gregory's legacy will be forever intertwined with the tapestry of higher education.",[],['andrew90'],['IQ2 3404'],['363-954-1689x74241'],['https://linkedin.com/in/oliversarah'],"['4110 Juan Cliff Suite 847\nBriannahaven, IN 37622', '0743 Rowland Flats Suite 013\nSouth Reginald, NM 77475', '84469 Curtis Rue Apt. 736\nLake Amy, MP 44194', '833 Dana Walk Suite 271\nNew Matthewtown, NM 36447']"
720,859,"In the realm of academia, each student embarks on a unique journey of intellectual exploration and personal growth. Among them is Daniel Johnson, a dedicated scholar with the username ""richard35,"" who strives for excellence in both his academic pursuits and personal endeavors. Hailing from the bustling streets of West Destinyfort, Oklahoma, at 36619 Tyler Junctions, Daniel's unwavering determination is evident in every facet of his life.\n\nAs he navigates the intricate corridors of knowledge, Daniel's unwavering commitment to academic excellence shines through. His student ID number, GB64YXZM96781136524587, serves as a testament to his dedicated presence within the halls of learning. Through countless hours spent poring over textbooks and engaging in thought-provoking discussions, he has cultivated a deep understanding of various subjects, expanding his intellectual horizons with each passing day.\n\nDaniel's passion for learning extends beyond the confines of the classroom. He actively seeks opportunities for personal growth and enrichment. On the social media platform Instagram, he connects with like-minded individuals under the handle ""higginshector."" Through this online community, he shares his perspectives, engages in meaningful conversations, and broadens his understanding of the world around him.\n\nBeyond the digital realm, Daniel maintains a strong connection to his community. His phone number, 384.591.4303x62671, serves as a lifeline for friends, family, and fellow students. He values the power of human connection and readily extends a helping hand to those in need. His warm and approachable nature fosters a sense of camaraderie and support within his surroundings.\n\nAs Daniel embarks on the next chapter of his academic journey, he carries with him the unwavering support of those who believe in his potential. He is a testament to the transformative power of education and a role model for students aspiring to achieve their dreams. 
Through his dedication, determination, and unwavering spirit, Daniel Johnson is destined to make a meaningful impact on the world.",[],['richard35'],['GB64YXZM96781136524587'],['384.591.4303x62671'],['https://instagram.com/higginshector'],"['36619 Tyler Junctions\nWest Destinyfort, OK 08398']"
1956,2364,"In the realm of education, the pursuit of knowledge transcends the confines of the classroom and extends to the vast expanse of the digital sphere. As a student navigating this ever-evolving landscape, Robert Mendoza, known by the username robert51, has harnessed the power of technology to enhance his academic journey.\n\nWith the unique identifier IYNX63557758666378, Robert has meticulously curated an online presence that showcases his intellectual curiosity and thirst for knowledge. His personal website, accessible through the URL https://facebook.com/thomasgutierrez, serves as a repository of his insightful essays and thought-provoking reflections.\n\nThrough his unwavering dedication, Robert has earned the distinction of ID number 431495620, a testament to his academic achievements. His unwavering pursuit of excellence is evident in his meticulously crafted essays, which demonstrate his keen analytical skills and command of language.\n\nBeyond the virtual realm, Robert's passion for learning extends to tangible interactions. His physical address, USNS Huff\nFPO AE 95731, provides a glimpse into his personal life and underscores his commitment to education in all its forms.\n\nFor Robert, the realm of technology is not merely a tool for academic exploration; it is a bridge that connects him with a global community of learners. Through his active participation in online forums and virtual study groups, he engages in thought-provoking discussions and collaborates with peers from diverse backgrounds.\n\nThe phone number 228-342-0366 serves as a direct line of communication, fostering connections between Robert and his fellow students, educators, and mentors. 
Whether it's exchanging ideas, seeking guidance, or simply sharing a moment of camaraderie, Robert embraces the power of human interaction to enrich his educational experience.\n\nAs Robert continues to navigate the complexities of the modern educational landscape, he stands as a beacon of academic excellence, fueled by an unwavering passion for learning and a commitment to harnessing the transformative power of technology.","['williamssteven@gmail.com', 'ibarraallison@hotmail.com']",['robert51'],"['IYNX63557758666378', '431495620']",['228-342-0366'],['https://facebook.com/thomasgutierrez'],['USNS Huff\nFPO AE 95731']


In [12]:
# Stop the timer started in the first cell and report the total elapsed time.
tim.stop()
# `!s` applies str() to the elapsed value before it is interpolated.
print(f"Total time taken {tim.elapsed!s}")

Total time taken 0:00:00.926140
