In [1]:
from datasets import load_dataset, load_metric

In [2]:
squad_datasets = load_dataset("squad")

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
squad_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [4]:
# this is what a single data-point needs to look like
squad_datasets["train"][0]

{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f41900661182',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'title': 'University_of_Notre_Dame'}

In [5]:
import pandas as pd
import re
import string
import sys
import six

In [6]:
emoji_pattern = re.compile(
  "["
  u"\U0001F600-\U0001F64F"  # emoticons
  u"\U0001F300-\U0001F5FF"  # symbols & pictographs
  u"\U0001F680-\U0001F6FF"  # transport & map symbols
  u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
  u"\U00002702-\U000027B0"
  u"\U000024C2-\U0001F251"
  "]+", 
  flags=re.UNICODE
)

def text_preproc(x):
  x = x.lower()
  x = ' '.join(x.split(' '))
  x = x.encode('ascii', 'ignore').decode()
  x = re.sub(r'https?://\S+|www\.\S+', ' ', x, count=sys.maxsize) # urls
  x = re.sub(r'<.*?>', ' ', x, count=sys.maxsize) # html tags
  x = re.sub(r'@\S+', ' ', x, count=sys.maxsize)
  x = re.sub(r'#\S+', ' ', x, count=sys.maxsize)
  x = re.sub(r'\'\w+', '', x, count=sys.maxsize)
  x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x, count=sys.maxsize)
  x = re.sub(r'\w*\d+\w*', '', x, count=sys.maxsize)
  x = re.sub(r'\s{2,}', ' ', x, count=sys.maxsize)
  x = re.sub(r':?\\+\S', '', x, count=sys.maxsize)
  x = emoji_pattern.sub(r'', x, count=sys.maxsize)
  return x

In [7]:
monster_raw_df = pd.read_csv("monster_com-job_sample.csv")

In [8]:
print(monster_raw_df.columns)
monster_raw_df[["job_description", "job_title"]].head()

Index(['country', 'country_code', 'date_added', 'has_expired', 'job_board',
       'job_description', 'job_title', 'job_type', 'location', 'organization',
       'page_url', 'salary', 'sector', 'uniq_id'],
      dtype='object')


Unnamed: 0,job_description,job_title
0,TeamSoft is seeing an IT Support Specialist to...,IT Support Technician Job in Madison
1,The Wisconsin State Journal is seeking a flexi...,Business Reporter/Editor Job in Madison
2,Report this job About the Job DePuy Synthes Co...,Johnson & Johnson Family of Companies Job Appl...
3,Why Join Altec? If you’re considering a career...,Engineer - Quality Job in Dixon
4,Position ID# 76162 # Positions 1 State CT C...,Shift Supervisor - Part-Time Job in Camphill


In [9]:
dice_raw_df = pd.read_csv("/content/Dice_US_jobs.csv", encoding = 'ISO-8859-1')

In [10]:
print(dice_raw_df.columns)
dice_raw_df[["job_description", "job_title", "sector"]].head()

Index(['country_code', 'date_added', 'job_board', 'job_description',
       'job_title', 'job_type', 'location', 'organization', 'page_url',
       'phone_number', 'salary', 'sector'],
      dtype='object')


Unnamed: 0,job_description,job_title,sector
0,"Minimum Required Skills:EDI, TrustedLink, AS2,...",EDI Analyst,"EDI, TrustedLink, AS2, VAN - EDI, TrustedLink,..."
1,"InformaticaåÊ/ ETL DeveloperSt, Petersburg, FL...",Informatica ETL Developer,ETL Informatica B2B Data Exchange Netezza Orac...
2,pmayekar@kanandcorp.com 512-697-8897Sunnyvale ...,Angular developer,Angular
3,This nationally recognized Microsoft Gold Part...,"Microsoft Dynamics AX, Project Manager","Microsoft Dynamics AX, Project Manager - Toron..."
4,"Minimum Required Skills:C#, ASP.NET, SQL, Java...",Software Developer,"C#, ASP.NET, SQL, JavaScript, MVC - C#, ASP.NE..."


In [11]:
!wget -q https://github.com/WING-NUS/JD2Skills-BERT-XMLC/raw/main/data/mycareersfuture.tar.gz
!tar vxf ./mycareersfuture.tar.gz

mycareersfuture.json


In [12]:
import json
with open("mycareersfuture.json") as f:
    skill_jd_raw = json.load(f)

In [14]:
from random import sample
sample(skill_jd_raw["jobs"], 1)

[{'company_info': 'RHB Securities Singapore Pte. Ltd. is a prominent stockbroking house with over 20 years of solid track record in Singapore. We are well placed to serve our customers’ brokerage and other financial needs both locally and regionally.\nAs part of the RHB Banking group, Malaysia’s fourth largest financial services group which is set to become one of ASEAN’s Leading Multinational Financial Services Group by 2020, we provide a comprehensive network which enables us to command a leading presence among retail and institutional investors.\n',
  'company_name': 'RHB SECURITIES SINGAPORE PTE. LTD.',
  'employment_type': 'Permanent',
  'expiry_date': '06 Jul 2019',
  'job_category': ['Banking and Finance'],
  'job_id': 'JOB-2019-0046780',
  'job_requirements': 'requirementsjob requirements degree holder least 2 years relevant working experience securities industry preferred graduate finance related welcome apply possess good command english written spoken team player able work i

In [15]:
!wget -q https://github.com/chunchentu/amazon_job_skill/raw/master/amazon_jobs_skill_preprocessed.csv

In [16]:
amazon_raw_df = pd.read_csv("/content/amazon_jobs_skill_preprocessed.csv")

In [17]:
print(amazon_raw_df.columns)
amazon_raw_df.iloc[[0]].to_dict()

Index(['Index', 'Title', 'Country', 'State', 'City', 'Posting_year',
       'Posting_month', 'Posting_day', 'DESCRIPTION', 'Posting_date',
       'BASIC QUALIFICATIONS', 'PREFERRED QUALIFICATIONS', 'mdy'],
      dtype='object')


{'BASIC QUALIFICATIONS': {0: "· Proven track record of hiring and managing high performing engineering teams· Breadth and depth of experience building and managing software systems and teams· · Excellent written and verbal communication skills with the ability to present complex technical information in a clear and concise manner to executives and non-technical leaders· · Willingness to roll up your sleeves and do what's necessary to drive your projects to completion· · 6+ years of experience in Software development and 4+ years as a Development Manager· · Strong technical credentials, with expertise in OOP, Java or C++· · Experience with Linux/Unix systems· · Experience with SQL and No SQL data stores· · Bachelor's Degree in Computer Science or related degree"},
 'City': {0: ' Seattle'},
 'Country': {0: 'US'},
 'DESCRIPTION': {0: 'You are an experienced hands-on manager with a background in developing and delivering software that simplifies solutions for a broad set of related problem

In [18]:
!wget -q https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2021.zip

In [19]:
!unzip stack-overflow-developer-survey-2021.zip

Archive:  stack-overflow-developer-survey-2021.zip
  inflating: README_2021.txt         
  inflating: so_survey_2021.pdf      
replace survey_results_public.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace survey_results_schema.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [20]:
stackoverflow_raw_df = pd.read_csv("/content/survey_results_public.csv")
stackoverflow_raw_df.sample(1).to_dict()

{'Accessibility': {2076: 'None of the above'},
 'Age': {2076: '35-44 years old'},
 'Age1stCode': {2076: '18 - 24 years'},
 'CompFreq': {2076: 'Yearly'},
 'CompTotal': {2076: 70000.0},
 'ConvertedCompYearly': {2076: 52942.0},
 'Country': {2076: 'Canada'},
 'Currency': {2076: 'CAD\tCanadian dollar'},
 'DatabaseHaveWorkedWith': {2076: 'MySQL'},
 'DatabaseWantToWorkWith': {2076: 'MySQL'},
 'DevType': {2076: 'Developer, front-end'},
 'EdLevel': {2076: 'Bachelor’s degree (B.A., B.S., B.Eng., etc.)'},
 'Employment': {2076: 'Employed full-time'},
 'Ethnicity': {2076: 'White or of European descent'},
 'Gender': {2076: 'Man'},
 'LanguageHaveWorkedWith': {2076: 'HTML/CSS;JavaScript;PHP'},
 'LanguageWantToWorkWith': {2076: 'HTML/CSS;JavaScript;Node.js;PHP;TypeScript'},
 'LearnCode': {2076: 'School'},
 'MainBranch': {2076: 'I am a developer by profession'},
 'MentalHealth': {2076: 'None of the above'},
 'MiscTechHaveWorkedWith': {2076: nan},
 'MiscTechWantToWorkWith': {2076: nan},
 'NEWCollabToolsH

In [21]:
!wget -q https://github.com/et4m1r/job_skills_data/raw/main/data.json -O job_skills.json
import json
with open("job_skills.json") as f:
    skills_list_raw = json.load(f)

In [22]:
print(skills_list_raw.keys())
skills_list_raw['Engineering & Science'][:5]

dict_keys(['Websites, IT & Software', 'Writing & Content', 'Design, Media & Architecture', 'Data Entry & Admin', 'Engineering & Science', 'Sales & Marketing', 'Business, Accounting, Human Resources & Legal', 'Product Sourcing & Manufacturing', 'Mobile Phones & Computing', 'Translation & Languages', 'Local Jobs & Services', 'Freight, Shipping & Transportation', 'Telecommunications', 'Education', 'Others'])


[{'name': 'Engineering'},
 {'name': 'AutoCAD'},
 {'name': 'Electrical Engineering'},
 {'name': 'Electronics'},
 {'name': 'Machine Learning'}]

In [23]:
!wget -q https://github.com/pragatikumar/Google-Job-Skills-Analysis/raw/master/job_skills.csv -O google_job_skills.csv
google_job_raw_df = pd.read_csv("/content/google_job_skills.csv")
google_job_raw_df.sample(1).to_dict()

{'Category': {297: 'Program Management'},
 'Company': {297: 'Google'},
 'Location': {297: 'Singapore'},
 'Minimum Qualifications': {297: "Bachelor's degree or equivalent practical experience.\n5 years of experience in program management, partner development and/or channel sales roles in the business technology market."},
 'Preferred Qualifications': {297: "Master's degree in a technical discipline (e.g. Computer Science/Software Engineering) or MBA.\n10 years of partner programs experience at an Enterprise Software (or Cloud) company, with experience in competitive partner programs.\nRecent experience in transition channel models to such as Cloud, SaaS or Services and Advisory Programs.\nPassionate about the partners, possess the drive to achieve quick results, and have the capacity to assume increasing responsibility in a highly successful, fast-paced global organization.\nAbility to collaborate and build relationships with individuals of varying levels of experience and department fu

In [24]:
from random import sample
import re
import itertools

question_to_skills = dict() # a potential answer for a given question.
skill_to_questions = dict()

all_skills = []
salary_snippets = []
seniority_levels = []
experience_requirements = []
cities = []
education_levels = []
programming_languages = []
databses = []
industries = []

# extract dice snippets
all_skills += dice_raw_df.sector.dropna().to_list()

# extract snippets from mycareersfuture
for job_details in skill_jd_raw["jobs"]:
  all_skills += job_details["skills_required"]
  seniority_levels += [job_details["seniority"]]
  experience_requirements += [job_details["min_experience"]]
  salary_snippets += [job_details["salary"]]
  industries += job_details["job_category"]

experience_requirements += [f"{i} years of experience" for i in range(1, 45)]
experience_requirements += [f"{i} years" for i in range(1, 45)]
experience_requirements += [f"{i}yrs." for i in range(1, 45)]
experience_requirements += [f"{i}yr minumum" for i in range(1, 45)]
experience_requirements += [f"{i} years minumum" for i in range(1, 45)]
experience_requirements += [f"{i} yrs. required" for i in range(1, 45)]

# extract snippets from amazon job data
cities += amazon_raw_df.City.dropna().to_list()

# extract stackoverflow snippets
education_levels += stackoverflow_raw_df.EdLevel.dropna().to_list()
programming_languages += stackoverflow_raw_df.LanguageHaveWorkedWith.dropna().to_list()
all_skills += stackoverflow_raw_df.LanguageHaveWorkedWith.dropna().to_list()
programming_languages += stackoverflow_raw_df.LanguageWantToWorkWith.dropna().to_list()
all_skills += stackoverflow_raw_df.LanguageWantToWorkWith.dropna().to_list()
databses += stackoverflow_raw_df.DatabaseHaveWorkedWith.dropna().to_list()
all_skills += stackoverflow_raw_df.DatabaseHaveWorkedWith.dropna().to_list()
all_skills += stackoverflow_raw_df.ToolsTechHaveWorkedWith.dropna().to_list()
all_skills += stackoverflow_raw_df.ToolsTechWantToWorkWith.dropna().to_list()

industries += list(skills_list_raw.keys())
for industry_skills in skills_list_raw.values():
  all_skills += [s["name"] for s in industry_skills]

# google job snippets
industries += google_job_raw_df["Category"].dropna().to_list()
cities += google_job_raw_df["Location"].dropna().to_list()

snippets = {}
namespace = locals()
for key, arr in [
  ("all_skills", all_skills),
  ("seniority_levels", seniority_levels),
  ("experience_requirements", experience_requirements),
  ("cities", cities),
  ("education_levels", education_levels),
  ("programming_languages", programming_languages),
  ("databses", databses),
  ("industries", industries),
]:
  arr = list(itertools.chain.from_iterable([re.split(r'[,/;&]', s) for s in arr]))
  arr = [s.strip().lower() for s in arr]
  arr = list(set(arr))
  print(sample(arr, 5))
  snippets[key] = arr

snippets["salary_snippets"] = list(set([s.strip().lower() for s in salary_snippets]))

snippets["databses"][4]

['lesson plans', 'xilinx', 'c# or c++', 'ran call testing', 'mobile ui']
['senior executive', 'not_available', 'fresh', 'senior management', 'junior executive']
['24 years', '35 years', '15 years minumum', '10 years  exp', '21 years']
['japan', 'københavn', 'westborough', 'columbus', 'boulder']
['etc.)', 'german realschule or gymnasium', 'master’s degree (m.a.', 'a.s.', 'b.s.']
['julia', 'apl', 'html', 'c#', 'c']
['ibm db2', 'microsoft sql server', 'cassandra', 'postgresql', 'oracle']
['sales', 'laboratory', 'business', 'personal care', 'customer support']


'redis'

In [25]:
jds = []

jds += dice_raw_df["job_description"].dropna().to_list()
jds += monster_raw_df["job_description"].dropna().to_list()

jds += [j["job_requirements"] for j in skill_jd_raw["jobs"]]
jds += [j["requirements_and_role"] for j in skill_jd_raw["jobs"]]

jds += amazon_raw_df.DESCRIPTION.dropna().to_list()
jds += amazon_raw_df['PREFERRED QUALIFICATIONS'].dropna().to_list()
jds += amazon_raw_df['BASIC QUALIFICATIONS'].dropna().to_list()

jds += google_job_raw_df['Minimum Qualifications'].dropna().to_list()
jds += google_job_raw_df['Responsibilities'].dropna().to_list()
jds += google_job_raw_df['Preferred Qualifications'].dropna().to_list()

jds = [text_preproc(j) for j in jds]
jds = [j for j in jds if j]
jds = list(set(jds))

In [26]:
# https://discuss.huggingface.co/t/question-answering-bot-fine-tuning-with-custom-dataset/4412/5

In [32]:
import json
from datasets import load_dataset
from uuid import uuid4

questions = {
    "What programming language do I need to know?": snippets["programming_languages"],
    "What programming languages are used?": snippets["programming_languages"],
    "What languages are used?": snippets["programming_languages"],
    "What is the backend stack?": snippets["programming_languages"],

    "How much is the pay?": snippets["salary_snippets"],
    "what is the salary?": snippets["salary_snippets"],
    "what is the pay?": snippets["salary_snippets"],

    "What is the database?": snippets["databses"],
    "How is the data stored?": snippets["databses"],
    "What db is used?": snippets["databses"],

    "How senior is this role?": snippets["seniority_levels"],
    "is this a senior level job?": snippets["seniority_levels"],
    "what level is the job?": snippets["seniority_levels"],

    "what are the requirements?": snippets["all_skills"],
    "what skills do i need?": snippets["all_skills"],
    "where is the job?": snippets["cities"],
    "how many years of exp do i need?": snippets["experience_requirements"],
    "how much experience do i need?": snippets["experience_requirements"],
    "what level is the job?": snippets["seniority_levels"],
}

from typing import List
from random import sample

def get_datapoints(question: str):
  possible_ans = questions[question]
  title = f"{hash(question)}"

  with open(f"jd-qa-lg-{title}.json", "w+") as out_file:
    for jd in jds:
      context = jd.lower().strip()
      if not context:
        continue
      
      for answer in possible_ans:
        answer = answer.lower().strip()
        if answer in ["c++", '', None]:
          continue
        starts_ends = None
        try:
          starts_ends = [
            (ele.start(), ele.end() - 1) 
            for ele in re.finditer(rf'\b({answer})\b', context)
          ]
        except:
          pass
        if not starts_ends:
          continue
        answers = {}
        idx = f"{uuid4()}"
        answers["text"] = [answer for _ in starts_ends]
        answers["answer_start"] = [start for start, _ in starts_ends]
        _new = {
            "id": idx,
            "title": title,
            "context": context,
            "question": question,
            "answers": answers,
        }
        out_file.write("%s\n" % (json.dumps(_new)))
  return []

import multiprocessing
pool = multiprocessing.Pool(processes=16)
all_training_data = pool.map(get_datapoints, questions.keys())

Process ForkPoolWorker-35:
Process ForkPoolWorker-44:
Process ForkPoolWorker-42:
Process ForkPoolWorker-43:
Process ForkPoolWorker-34:
Process ForkPoolWorker-45:
Process ForkPoolWorker-48:
Process ForkPoolWorker-36:
Process ForkPoolWorker-33:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-40:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Process ForkPoolWorker-41:
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/

KeyboardInterrupt: ignored

  File "/usr/lib/python3.7/multiprocessing/queues.py", line 351, in get
    with self._rlock:
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 351, in get
    with self._rlock:
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.7/multiprocessing/synchronize.py", line 95,

In [None]:
# flat_training_payloads = list(itertools.chain.from_iterable(all_training_data))
# sample(flat_training_payloads, 2)

In [None]:
# output_filename = "jd-qa-train-lg.json"

# # with open(output_filename, "w+") as f:
# #   f.write(json.dumps(datapoints))
# with open(output_filename, "w+") as f:
#   for dp in flat_training_payloads:
#     f.write("%s\n" % (json.dumps(dp)))

In [None]:
!export fn="jd-qa-train-med.json" && du -h $fn && head -n2 $fn && tail -n2 $fn && wc -l $fn

In [None]:
# ctx ='Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'
# ans = 'Saint Bernadette Soubirous'

# [
#   (ele.string, ele.start(), ele.end() - 1) 
#   for ele in re.finditer(rf'\b{ans}\b', ctx)
# ]
# , 'answer_start': [515]}}

In [29]:
ds = load_dataset("json", data_files="jd-qa-lg-*.json")

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Using custom data configuration default-a936938fc87df245


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-a936938fc87df245/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-a936938fc87df245/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [31]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 8276
    })
})

In [30]:
ds["train"][0]

{'answers': {'answer_start': [1422], 'text': ['powerpoint']},
 'context': 'role qa etl tester with hedislocation nc raleighduration contract monthsposition qty gc citizenkey skillset etl ibm datastage oracle unix shell scripts and perl scriptsjob description what software tools skills are needed to perform these daily responsibilities years of testing quality assurance experience on etl data warehousing projects of which at least years as a senior test engineer on etl projects years of relational database experience covering writing queries manipulating test data through sql statements experience using testing quality assurance tools such as alm experience using agile scrum tools such as ca agile central aka rally experience using job scheduling software such as tws experience with automating test cases and automation tools such as fitnesse selenium quicktestpro qtp proven experience in testing software following agile scrum methodology experience working in an onsite offshore model wi

In [None]:
ds["train"].train_test_split()["train"][0]

In [None]:
squad_datasets["train"][0]

In [None]:
!head -c 500 /content/dev-v2.0.json