In [1]:
import numpy as np
import pandas as pd
import string
from datasets import load_dataset
import nltk

In [2]:
def preprocess_dataset(dataset):
    """Convert a dataset split to pandas and drop questions with only numeric options.

    Parameters
    ----------
    dataset : object exposing ``to_pandas()``
        The resulting frame must contain an ``answer`` column holding the
        letters 'A'-'D' and an ``options`` column holding an iterable of
        strings per row.

    Returns
    -------
    pandas.DataFrame
        The converted frame with ``answer`` mapped to the indices 0-3 (any
        other value becomes NaN, as ``Series.map`` does for unmapped keys),
        restricted to rows where at least one option is not purely numeric
        after stripping surrounding whitespace and removing punctuation.

    Side effects
    ------------
    Prints how many rows were dropped.
    """
    # Convert from pyarrow to pandas df
    df = dataset.to_pandas()

    # Map letter answers to options list indices
    df['answer'] = df['answer'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3})

    # Build the punctuation-deleting table once instead of once per option.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def is_valid_options(options_list):
        # Keep the question as soon as one option is not just a number.
        return any(
            not option.strip().translate(strip_punctuation).isnumeric()
            for option in options_list
        )

    # Map is_valid_options to every question
    is_valid_series = df['options'].map(is_valid_options).astype(bool)

    # Count with a sum: value_counts()[False] raises KeyError when no row is dropped.
    num_dropped = int((~is_valid_series).sum())
    print(f'Dropped {num_dropped} rows with only numeric answers')
    return df[is_valid_series]


In [94]:
# Display string columns in full when a frame is rendered (None removes the width limit).
pd.set_option('display.max_colwidth', None)

def filt_df(df, min_question_words=6, min_questions_per_context=1, min_answer_words=3):
    """Filter a question frame down to longer, free-standing questions.

    The filters run in this order, and the shape of the remaining frame is
    printed after each one:

    1. drop questions containing ``'_'`` (phrase completion);
    2. drop questions containing ``'According to the passage'``;
    3. drop questions with fewer than ``min_question_words`` words;
    4. drop rows whose ``example_id`` has fewer than
       ``min_questions_per_context`` remaining questions;
    5. drop rows whose correct answer text has fewer than
       ``min_answer_words`` words.

    Words are counted by removing commas and splitting on whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``question`` (str), ``example_id``, ``options``
        (indexable sequence of strings) and ``answer`` (integer index into
        ``options``). The frame is not modified.
    min_question_words : int, default 6
        The default reproduces the previous hard-coded ``> 5`` test (the old
        inline comment said "shorter than 5 words", but 5-word questions
        were, and by default still are, excluded).
    min_questions_per_context : int, default 1
        The default reproduces the previous ``Count > 0`` test, which keeps
        every context.
    min_answer_words : int, default 3
        The default reproduces the previous ``>= 3`` test.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with the same columns as ``df``.
    """
    def _word_count(text_series):
        # Commas are deleted (not replaced by a space) before whitespace splitting.
        return text_series.str.replace(',', '', regex=False).str.split().str.len()

    ## Exclude phrase completion questions
    kept = df[~df['question'].str.contains('_', regex=False)]
    print(kept.shape)

    ## Exclude "According to the passage" questions
    kept = kept[~kept['question'].str.contains('According to the passage', regex=False)]
    print(kept.shape)

    ## Exclude questions with fewer than min_question_words words
    kept = kept[_word_count(kept['question']) >= min_question_words]
    print(kept.shape)

    ## Exclude contexts with fewer than min_questions_per_context questions
    questions_per_context = kept['example_id'].value_counts()
    big_enough = questions_per_context[questions_per_context >= min_questions_per_context].index
    kept = kept[kept['example_id'].isin(big_enough)]
    print(kept.shape)

    ## Exclude questions whose correct answer has fewer than min_answer_words words
    # Built as a standalone Series: assigning helper columns onto the filtered
    # slice triggers SettingWithCopyWarning, and DataFrame.apply(axis=1) returns
    # a DataFrame (not a Series) when no rows are left.
    answer_text = pd.Series(
        [options[answer] for options, answer in zip(kept['options'], kept['answer'])],
        index=kept.index,
        dtype=object,
    )
    kept = kept[_word_count(answer_text) >= min_answer_words]
    print(kept.shape)

    return kept

In [95]:
#pd.set_option('display.max_colwidth', None)

# Load the 'all' configuration of the 'race' dataset and convert its 'train' split to a
# pandas DataFrame. The same load is repeated in a later cell.
race = load_dataset('race', 'all')
race_train = race['train'].to_pandas()
#race_train

Reusing dataset race (/Users/Hasan/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [96]:
#valid_df = filt_df(preprocess_dataset(race['validation']))
# Build the training frame: preprocess_dataset maps answers to indices and drops
# numeric-only questions, then filt_df applies the question/answer filters.
train_df = filt_df(preprocess_dataset(race['train']))
# Displays the frame; with display.max_colwidth set to None every article prints in full.
train_df

Dropped 1304 rows with only numeric answers
(41197, 5)
(40312, 5)
(38187, 5)
(38187, 5)
(31941, 5)


Unnamed: 0,example_id,article,answer,question,options
3,high19088.txt,"Last week I talked with some of my students about what they wanted to do after they graduated, and what kind of job prospects they thought they had.\nGiven that I teach students who are training to be doctors, I was surprised do find that most thought that they would not be able to get the jobs they wanted without ""outside help"". ""What kind of help is that?"" I asked, expecting them to tell me that they would need a or family friend to help them out.\n""Surgery ,"" one replied.\nI was pretty alarmed by that response. It seems that the graduates of today are increasingly willing to go under the knife to get ahead of others when it comes to getting a job .\nOne girl told me that she was considering surgery to increase her height. ""They break your legs, put in special extending screws, and slowly expand the gap between the two ends of the bone as it re-grows, you can get at least 5 cm taller!""\nAt that point, I was shocked. I am short, I can't deny that, but I don't think I would put myself through months of agony just to be a few centimetres taller. I don't even bother to wear shoes with thick soles, as I'm not trying to hide the fact that I am just not tall!\nIt seems to me that there is a trend towards wanting ""perfection"" , and that is an ideal that just does not exist in reality.\nNo one is born perfect, yet magazines, TV shows and movies present images of thin, tall, beautiful people as being the norm. Advertisements for slimming aids, beauty treatments and cosmetic surgery clinics fill the pages of newspapers, further creating an idea that ""perfection"" is a requirement, and that it must be purchased, no matter what the cost. 
In my opinion, skills, rather than appearance, should determine how successful a person is in his/her chosen career.",1,Which' s the best title for the passage?.,"[Young Graduates Have Higher Expectations, Young Graduates Look to Surgery for Better Jobs, Young Graduates' Opinion About Cosmetic Surgery, Young Graduates Face a Different Situation in Job-hunting]"
4,high15596.txt,"YUZHOU, HENAN -An accident in a central China coal mine killed 21 miners Saturday and left another 16 trapped underground , the government said.\nThe death\nrose to 26 Sunday morning as rescuers were battling to reach the 11 miners who were still trapped underground, rescue headquarters said.\nRescuers were battling to reach the 11 miners still trapped underground, but chances for them to survive were very slim, said Du Bo, deputy chief of the rescue headquarters.\n""Based upon past experience, the remaining 11 miners could be buried in coal dust, so the survival chances are frail,"" Du said.\nMore than 2,500 tons of coal dust smothered the pit after the gas leak , which hampered the rescue, said Du.\nThe gas outburst happened at 6:03 a.m. Saturday when 276 miners were working underground in the mine in Yuzhou City. A total of 239 workers escaped but 21 were found dead and 16 trapped.\nAn initial investigation showed that 173,500 cubic meters of gas leaked out in the accident. Liu Wenbin, a deputy chief engineer of the company that owns the mine, was in the pit when the accident happened. He organized the escape.\n""At around 6 a.m., I felt there was something wrong with the airflow in the shaft, and one of the team captains told me he also felt it and had already reported the problem,"" said Liu\nThe mine is owned by Pingyu Coal & Electric Co. Ltd., a company jointly established by four investors, including Zhong Ping Energy Chemical Group and China Power Investment Corp.",1,What could be the best title for this passage?,"[Death Toll Rises in an Accident in China, A Coal Mine Accident in Central China, An Accident in Central China, Coal Mine Accidents in China]"
6,high15596.txt,"YUZHOU, HENAN -An accident in a central China coal mine killed 21 miners Saturday and left another 16 trapped underground , the government said.\nThe death\nrose to 26 Sunday morning as rescuers were battling to reach the 11 miners who were still trapped underground, rescue headquarters said.\nRescuers were battling to reach the 11 miners still trapped underground, but chances for them to survive were very slim, said Du Bo, deputy chief of the rescue headquarters.\n""Based upon past experience, the remaining 11 miners could be buried in coal dust, so the survival chances are frail,"" Du said.\nMore than 2,500 tons of coal dust smothered the pit after the gas leak , which hampered the rescue, said Du.\nThe gas outburst happened at 6:03 a.m. Saturday when 276 miners were working underground in the mine in Yuzhou City. A total of 239 workers escaped but 21 were found dead and 16 trapped.\nAn initial investigation showed that 173,500 cubic meters of gas leaked out in the accident. Liu Wenbin, a deputy chief engineer of the company that owns the mine, was in the pit when the accident happened. He organized the escape.\n""At around 6 a.m., I felt there was something wrong with the airflow in the shaft, and one of the team captains told me he also felt it and had already reported the problem,"" said Liu\nThe mine is owned by Pingyu Coal & Electric Co. Ltd., a company jointly established by four investors, including Zhong Ping Energy Chemical Group and China Power Investment Corp.",2,"According to the writer, which of the following is not true?","[The mine was owned by more than one company, There was at least one more similar accident happening in Central China before, Before the accident happened there was no sign of something wrong, When the accident happened one of the mine owners was in the pit]"
7,high4558.txt,"Understanding the process of making career choices and managing your career is a basic life skill that everyone should understand.\nYour career decisions have such a profound effect on all aspects of your life. It's important to have the knowledge and resources needed to make smart, informed decisions. Whether you are looking for a new job, aiming to take the next step at your current job or planning your retirement options, you are making career decisions. Using good resources and the guidance of a career counselor can help you to make those decisions well.\nMany people mistakenly believe that choosing a career is a one-time event that happens some time in early adulthood. However, career management is actually a life-long process, and we continue to make consequential career choices over the years. When people want to take action in their career, career management and job search are about so much more than writing a good resume. If you learn about and act on the following areas of career management, you'll be rewarded throughout your career.\nYour interests, abilities, values, personal needs and realities should all be taken into account in any career decision making process. You spend countless hours at work, and it impacts your life in so many ways; it makes sense that you should be fully informed before making such profound decisions.\nDo you know how many different career choices are available to you? Both The Dictionary of Occupational Titles (American) and The National Occupational Classification (Canadian) list well over 20,000 different job titles. So unless you've actively explored a variety of career options, there's a very good chance that there are great possibilities available to you, and you don't even realize they exist.\nMatch your understanding of yourself with your understanding of possible career options. 
Once you have developed a good understanding of yourself, you will be able to combine that self-knowledge with your career and labor market research to determine potential careers that are a great fit for you.\nWhen you've made a well informed decision, then you're ready to make it happen. Making use of good career guidance and resources will help you to acquire the education, skills, and experience needed to get the job and learn and implement effective job search strategies.\nTime spent understanding your needs, researching your career options and developing outstanding job search skills, guided by great career resources, is a powerful investment in your future.",1,What's the main idea of the passage?,"[In the process of making career decisions, people should consider interests, abilities, values, personal needs and realities., All the people should have a good knowledge of how to make career choices and manage their career., Career decisions have a great impact on people's life., There are various possibilities available to you of making career decisions.]"
10,high4558.txt,"Understanding the process of making career choices and managing your career is a basic life skill that everyone should understand.\nYour career decisions have such a profound effect on all aspects of your life. It's important to have the knowledge and resources needed to make smart, informed decisions. Whether you are looking for a new job, aiming to take the next step at your current job or planning your retirement options, you are making career decisions. Using good resources and the guidance of a career counselor can help you to make those decisions well.\nMany people mistakenly believe that choosing a career is a one-time event that happens some time in early adulthood. However, career management is actually a life-long process, and we continue to make consequential career choices over the years. When people want to take action in their career, career management and job search are about so much more than writing a good resume. If you learn about and act on the following areas of career management, you'll be rewarded throughout your career.\nYour interests, abilities, values, personal needs and realities should all be taken into account in any career decision making process. You spend countless hours at work, and it impacts your life in so many ways; it makes sense that you should be fully informed before making such profound decisions.\nDo you know how many different career choices are available to you? Both The Dictionary of Occupational Titles (American) and The National Occupational Classification (Canadian) list well over 20,000 different job titles. So unless you've actively explored a variety of career options, there's a very good chance that there are great possibilities available to you, and you don't even realize they exist.\nMatch your understanding of yourself with your understanding of possible career options. 
Once you have developed a good understanding of yourself, you will be able to combine that self-knowledge with your career and labor market research to determine potential careers that are a great fit for you.\nWhen you've made a well informed decision, then you're ready to make it happen. Making use of good career guidance and resources will help you to acquire the education, skills, and experience needed to get the job and learn and implement effective job search strategies.\nTime spent understanding your needs, researching your career options and developing outstanding job search skills, guided by great career resources, is a powerful investment in your future.",2,Which of the following statements is NOT TRUE according to the passage?,"[Your interests in the occupation you choose are vital., Good career guidance and resources can help you gain the skills and experience., You are to make significant decisions without good resources and the guidance of a career adviser., Planning your retirement options is related to career management.]"
...,...,...,...,...,...
87843,middle7687.txt,"When travelling.you are sure to try some exciting new kinds of food.The Wild Food Festival,in the town of Hokitika,the west of Coast of New Zealand,gives you the chance to try some strange food.It is a celebration of the areas special lifestyle and food.And it celebrates food that most people might not want to eat.It is held in March every year.\nAt the festival you will find huhu grubs and beetles on your plate.The festival also celebrates Maori food. the food of the traditional native people of New Nealand And visitors will eat the wild food with plenty of famous West Coast beer.What's more,there are three stages at the festival,where there is live music and entertainment an day long.\nIf you have the chance to travel to Hokitita during the Wild Food Festival,you should book a hotel before it begins.or you can choose to stay at local schools.A number of local schools become camping grounds over the weekend of the festival.You can also stay in Greymouth,because there are buses from Greymouth to the festival.",0,What will you find on your plate at the festival?,"[Huhu grubs and beetles., West Coast beer., Fish and chicken, Pork and eggs.]"
87844,middle7687.txt,"When travelling.you are sure to try some exciting new kinds of food.The Wild Food Festival,in the town of Hokitika,the west of Coast of New Zealand,gives you the chance to try some strange food.It is a celebration of the areas special lifestyle and food.And it celebrates food that most people might not want to eat.It is held in March every year.\nAt the festival you will find huhu grubs and beetles on your plate.The festival also celebrates Maori food. the food of the traditional native people of New Nealand And visitors will eat the wild food with plenty of famous West Coast beer.What's more,there are three stages at the festival,where there is live music and entertainment an day long.\nIf you have the chance to travel to Hokitita during the Wild Food Festival,you should book a hotel before it begins.or you can choose to stay at local schools.A number of local schools become camping grounds over the weekend of the festival.You can also stay in Greymouth,because there are buses from Greymouth to the festival.",3,Which activity can't you do at the festival?,"[You can eat Maori food., You can enjoy live music and entertainment., You can drink West Coast beer., You can make Maori food by yourself.]"
87845,middle7687.txt,"When travelling.you are sure to try some exciting new kinds of food.The Wild Food Festival,in the town of Hokitika,the west of Coast of New Zealand,gives you the chance to try some strange food.It is a celebration of the areas special lifestyle and food.And it celebrates food that most people might not want to eat.It is held in March every year.\nAt the festival you will find huhu grubs and beetles on your plate.The festival also celebrates Maori food. the food of the traditional native people of New Nealand And visitors will eat the wild food with plenty of famous West Coast beer.What's more,there are three stages at the festival,where there is live music and entertainment an day long.\nIf you have the chance to travel to Hokitita during the Wild Food Festival,you should book a hotel before it begins.or you can choose to stay at local schools.A number of local schools become camping grounds over the weekend of the festival.You can also stay in Greymouth,because there are buses from Greymouth to the festival.",3,Which place can't you stay during the festival?,"[A hotel, Local schools, In Greymouth, An exciting place.]"
87850,middle5460.txt,"In the days following the first anniversary of my dad 's death, I felt sad. I still missed him so much. I would walk over to his old house where my daughter now live and still expect to see him sitting out on his front porch . I would see him in my dreams, looking young and strong again.\nI felt bad too because I regretted not spending more time with him while he was here. I wished I had visited him more, talked with him more, and listened to him more. I wished I had asked the questions that I never did. I wished I had given him more hugs and told him I loved him more often. I wished I had cherished the time I had with him here. I was angry at the destiny because Dad and Mum had both passed away before I was 50, and I was angry with myself because I hadn't made the most of time I had with them.\nToday, I found myself walking under a cloudy sky, staring at the leafless trees. As I walked along, one of the last golden maple leaves fell from its tree and sailed in the wind until it landed in front of me. I picked it up and smiled. At the moment of peace I felt a sound in my heart and mind: ""death is momentary , but love is eternal ."" I knew at that moment that my relationships with Mum and Dad were not over. We would see each other again in a place of love, joy and light. We would spend our days in learning, laughter and love.",1,What is the main idea of this passage?,"[The writer's parents had both died before he was50., Life is limited but love is forever., The writer regretted that he had no more time to stay with his parents., The writer felt sad because he couldn't stay with his dad any more.]"


In [83]:
# NOTE(review): no visible cell creates an `almost` column on train_df, and this cell's
# execution count (83) is lower than those of the cells that build train_df (94-96).
# It presumably relied on since-removed kernel state; confirm the intended column,
# otherwise this line raises AttributeError on a fresh Restart & Run All.
train_df.almost.value_counts()

7      3892
6      3802
8      3675
1      3521
5      3518
3      3417
9      3018
4      2983
2      2724
10     2440
11     1752
12     1215
13      857
14      513
15      333
16      205
17      111
18       72
19       49
20       37
22       13
21       11
23       11
24        7
26        3
25        3
105       1
34        1
29        1
27        1
Name: almost, dtype: int64

In [16]:
# Reload the dataset and keep only training rows whose example_id occurs more than 5 times.
race = load_dataset('race', 'all')
race_train = race['train'].to_pandas()
print(race_train.shape)

# Occurrences per example_id, as a one-column frame named 'Count'.
vc = race_train['example_id'].value_counts().rename('Count').to_frame()
vc = vc.loc[vc['Count'] > 5]

race_train = race_train.loc[race_train['example_id'].isin(vc.index)]
print(race_train.shape)

Reusing dataset race (/Users/Hasan/.cache/huggingface/datasets/race/all/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


(87866, 5)
(217, 5)


In [15]:
# NOTE(review): the output recorded for this cell lists example_ids with Count 1, so it was
# not produced from the `vc` filtered with Count > 5 in the preceding cell (this cell ran as
# In [15], before In [16]). Re-run in order to refresh the output.
vc

Unnamed: 0,Count
middle7941.txt,7
high4205.txt,6
high19876.txt,6
high11655.txt,6
high6122.txt,6
...,...
high17887.txt,1
high38.txt,1
high24041.txt,1
high12657.txt,1
