In [1]:
# Import the required modules.
import pandas as pd

In [2]:
# Load the source CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/medquad.csv')

In [3]:
# Preview the first five rows of df.
df.head(n=5)

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


In [4]:
# Show the shape of the data as (rows, columns).
df.shape

(16412, 4)

In [5]:
# Count how many rows come from each source, most frequent first.
df['source'].value_counts()

source
GHR                  5430
GARD                 5394
NIDDK                1192
NINDS                1088
MPlusHealthTopics     981
NIHSeniorHealth       769
CancerGov             729
NHLBI                 559
CDC                   270
Name: count, dtype: int64

In [6]:
# Remove the space before a question mark in the question column
# (e.g. 'What causes Glaucoma ?' -> 'What causes Glaucoma?').
# regex=False is explicit on purpose: ' ?' must be matched literally. Read as a
# regular expression it means "an optional space", which matches everywhere, and
# the default of `regex` differs between pandas versions.
df['question'] = df['question'].str.replace(' ?', '?', regex=False)

In [7]:
# Replace double question marks with a single one.
# regex=False is explicit on purpose: '??' is not a valid regular expression, so
# this call would fail wherever the pattern is interpreted as a regex (the
# default of `regex` differs between pandas versions).
df['question'] = df['question'].str.replace('??', '?', regex=False)

In [8]:
# Show the shape of df; the string replacements above should leave it unchanged.
df.shape

(16412, 4)

In [9]:
# Count the missing values in each column.
df.isnull().sum()

question       0
answer         5
source         0
focus_area    14
dtype: int64

In [10]:
# Drop every row that has a missing value in any column.
# Reassign the result instead of mutating df in place, so the cell produces a
# new frame from its input rather than relying on a side effect.
df = df.dropna()

In [11]:
# Show the shape of df after dropping the rows with null values.
df.shape

(16393, 4)

In [12]:
# Find the list of the distinct sources in df.
# Sorted so the order is the same on every run (iterating a set of strings can
# give a different order from one Python session to the next); missing values
# are excluded so the list only holds real source names.
source_list = sorted(df['source'].dropna().unique())

In [13]:
# Start with an empty list that will collect the sampled row indices.
empty_list = list()

In [14]:
# Create the dataframe of the selected indices.
# Start from a fresh list inside this cell so that re-running it never appends
# the same indices a second time (which would duplicate rows in df_index).
empty_list = []
for item in source_list:
    # Draw 22 rows from this source; the fixed random_state makes the draw repeatable.
    index_list = df[df.source == item].sample(n=22, random_state=2).index.to_list()
    empty_list.extend(index_list)

# Sort the labels so the selected rows appear in ascending index order.
empty_list.sort()
df_index = df.loc[empty_list]

In [15]:
# Preview the first five sampled rows.
df_index.head(n=5)

Unnamed: 0,question,answer,source,focus_area
137,What is (are) Knee Replacement?,There are many different types and designs of ...,NIHSeniorHealth,Knee Replacement
182,What are the symptoms of Prostate Cancer?,"- a need to urinate frequently, especially at ...",NIHSeniorHealth,Prostate Cancer
213,How to diagnose Osteoporosis?,Who Should Be Tested? The United States Preven...,NIHSeniorHealth,Osteoporosis
231,How to prevent Kidney Disease?,Risk Factors Diabetes and high blood pressure ...,NIHSeniorHealth,Kidney Disease
232,What are the symptoms of Kidney Disease?,Kidney Disease Kidney disease is often called ...,NIHSeniorHealth,Kidney Disease


In [16]:
# Check the shape (rows, columns) of df_index.
df_index.shape

(198, 4)

In [17]:
# Re-index df_index: replace the row labels carried over from df with a fresh
# 0..n-1 index; drop=True discards the old labels instead of keeping them as a column.
df_index = df_index.reset_index(drop=True)

In [18]:
# Add an 'id' column numbering the rows 0, 1, 2, ... in their current order.
id_column = [*range(len(df_index))]
df_index['id'] = id_column

In [19]:
# Preview the first five rows, now including the id column.
df_index.head(n=5)

Unnamed: 0,question,answer,source,focus_area,id
0,What is (are) Knee Replacement?,There are many different types and designs of ...,NIHSeniorHealth,Knee Replacement,0
1,What are the symptoms of Prostate Cancer?,"- a need to urinate frequently, especially at ...",NIHSeniorHealth,Prostate Cancer,1
2,How to diagnose Osteoporosis?,Who Should Be Tested? The United States Preven...,NIHSeniorHealth,Osteoporosis,2
3,How to prevent Kidney Disease?,Risk Factors Diabetes and high blood pressure ...,NIHSeniorHealth,Kidney Disease,3
4,What are the symptoms of Kidney Disease?,Kidney Disease Kidney disease is often called ...,NIHSeniorHealth,Kidney Disease,4


In [20]:
# Build df_data from the id, answer, source and focus_area columns of df_index.
df_data = df_index.loc[:, ['id', 'answer', 'source', 'focus_area']]

In [21]:
# Write df_data to data.csv without the row index.
df_data.to_csv(path_or_buf='../data/data.csv', index=False)