# PISA Analysis: Data Extraction

## Technical Setup

In [2]:
import os 
import pandas as pd
from sas7bdat import SAS7BDAT
# Root folder of the raw PISA 2018 downloads; later cells join it with the
# codebook file name and with the per-questionnaire sub-folders.
path = r"D:\Data Science Folder\PISA Analysis\Data\2018"

# Exporting the data to CSV
Although pandas can parse the SAS files, they are too large to load into memory in one go. I therefore use the `sas7bdat` package to export each file to CSV; afterwards the exported files can be read like any other CSV.

In [9]:
# Export every PISA SAS data file to CSV so it can later be processed
# without loading the whole SAS file into memory.
ReadPath = r"D:\Data Science Folder\PISA Analysis\Data"

# Parallel lists: Folder[k] is the sub-folder that contains FileName[k].
Folder = ["01 - Student Questionnaire",
          "02 - Cognitive Item",
          "03 - Teacher Questionnaire",
          "04 - School Questionnaire",
          "05 - Questionnaire Timing"]
FileName = ["cy07_msu_stu_qqq.sas7bdat",
            "cy07_msu_stu_cog.sas7bdat",
            "cy07_msu_tch_qqq.sas7bdat",
            "cy07_msu_sch_qqq.sas7bdat",
            "cy07_msu_stu_tim.sas7bdat"]

# The CSVs are written to one folder per survey year.
DataYear = 2018
WritePath = r"D:\BigQueryDatabase\PISA"
WritePath = os.path.join(WritePath, str(DataYear))
if not os.path.exists(WritePath):
    os.makedirs(WritePath)
    print('Created directory:', WritePath)
else:
    print('Directory', WritePath, 'already exists')

for folder, sas_name in zip(Folder, FileName):
    readpath = os.path.join(ReadPath, folder, sas_name)

    # Swap only the extension; a plain substring replace would also touch
    # any other occurrence of "sas7bdat" in the file name.
    writepath = os.path.join(WritePath, os.path.splitext(sas_name)[0] + '.csv')

    # The context manager closes the underlying SAS file handle even if the
    # conversion fails half-way through.
    with SAS7BDAT(readpath) as reader:
        reader.convert_file(writepath)

Directory D:\BigQueryDatabase\PISA\2018  already exists


## Describing the data
The data are available on the official PISA website (link here) in two formats: SAS and SPSS. I proceed with the SPSS data since it requires reading only one file.


### The Codebook 
The first important file to download is the codebook. PISA data is collected in questionnaires, each item of which is codified. Both the variable name (column) and the values (entries) may need to be joined with the codebook to get their labels. 

In [44]:
def _tidy_codebook_sheet(sheet_df, sheet_name):
    """Return one row per (variable, value) pair for a single codebook sheet.

    The sheet is expected to have the columns NAME, VARLABEL, VAL and LABEL.
    A variable's NAME/VARLABEL are taken from the row where NAME is filled in
    and carried down to the rows that follow it, up to the next filled NAME.
    Rows that lack either VAL or LABEL are dropped.
    """
    sheet_df = sheet_df.reset_index(drop=True)

    # Carry each variable's name and label down to its value rows. Doing this
    # by row position (instead of counting rows per variable) stays correct
    # for the last variable on the sheet and for variables without values.
    header_rows = sheet_df.loc[sheet_df['NAME'].notna(), ['NAME', 'VARLABEL']]
    carried = header_rows.reindex(sheet_df.index, method='ffill')

    values = sheet_df[['VAL', 'LABEL']].dropna()
    tidy = pd.DataFrame({
        'Value': values['VAL'],
        'Val_Label': values['LABEL'],
        'Variable': carried.loc[values.index, 'NAME'],
        'Var_Label': carried.loc[values.index, 'VARLABEL'],
        'DataFile': sheet_name,
    })
    return tidy.reset_index(drop=True)


def read_codebook(path = r"D:\Data Science Folder\PISA Analysis\Data\2018\PISA2018_CODEBOOK.xlsx"):
    """
    Description: This function reads the codebook for PISA 2018. It may work for older versions of PISA assuming they follow the same format

    Inputs:
        - path: the location where to find the codebook file in excel format.
    Outputs:
        - Codebook: A DataFrame containing codes and labels of the variables and values.
          Columns are Value, Val_Label, Variable, Var_Label and DataFile (the
          name of the sheet the row came from). The row index restarts at 0
          for every sheet.
    """
    # sheet_name=None loads every sheet in a single pass as
    # {sheet name: DataFrame}, so the workbook is opened only once.
    sheets = pd.read_excel(path, sheet_name=None)
    frames = [_tidy_codebook_sheet(sheet_df, sheet_name)
              for sheet_name, sheet_df in sheets.items()]

    if not frames:
        return pd.DataFrame(
            columns=['Value', 'Val_Label', 'Variable', 'Var_Label', 'DataFile'])

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    Codebook = pd.concat(frames)
    return Codebook

def get_varlabels(DataFrame, VarLabels, filename=None):
    """
    Description: This function takes a dataframe and passes it through the codebook to get the variable labels.

    Inputs:
        - DataFrame: A pandas DataFrame containing the data for which the variable labels are needed
        - VarLabels: A pandas DataFrame containing the Codebook read variable labels.
          It must have the columns ITEM, NAME and VARLABEL.
        - filename: Name of the data file the DataFrame was read from. Its
          '.sav' suffix is stripped and the result is matched against the
          ITEM column. When omitted, the module-level variable `filename` is
          used, which is what this function relied on before the parameter
          existed.
    Outputs:
        - A new DataFrame holding only the columns found in the codebook, with
          the columns renamed to their variable labels. The input DataFrame is
          not modified.
    Raises:
        - ValueError: if no filename is passed and no module-level `filename`
          is defined.
    """
    if filename is None:
        # Backward compatibility: fall back to the notebook-level global.
        filename = globals().get('filename')
        if filename is None:
            raise ValueError(
                "get_varlabels needs the data file name: pass filename=...")

    # Getting the variable labels
    item = filename.replace('.sav', '')
    left = VarLabels.loc[VarLabels['ITEM'] == item, ['NAME', 'VARLABEL']]
    # One label per variable; repeated codebook rows would otherwise
    # duplicate the matching data column in the result.
    left = left.drop_duplicates(subset='NAME')
    right = pd.DataFrame({'NAME': DataFrame.columns})
    NewCols = pd.merge(left, right, how='inner', on='NAME')

    # Replacing the variable labels
    labelled = DataFrame.loc[:, NewCols['NAME']].copy()
    labelled.columns = NewCols['VARLABEL']

    return labelled

def ReadSASDataFile(
    filepath = r"D:\Data Science Folder\PISA Analysis\2018_Cognitive_Item_Data_Files\cy07_msu_stu_cog.sas7bdat",
    chunksize = 50000, 
    outpath = 'Exported',
    encoding = "Latin1"):
    """
    Description: Reads a SAS data file in chunks and splits it into one CSV
    file per country, named <SAS file name>_<CNTRYID>.csv.

    Inputs:
        - filepath: location of the SAS file to read.
        - chunksize: number of rows read from the SAS file per chunk.
        - outpath: name of the output folder, created next to the SAS file.
        - encoding: text encoding passed to pandas.read_sas.
    Outputs:
        - None. The CSV files are written to disk. A country file left over
          from an earlier run is overwritten the first time that country is
          seen; rows from later chunks are appended to it.
    """
    import os
    import pandas as pd

    print("Filepath:", filepath)
    chunks = pd.read_sas(filepath,
                         encoding = encoding,
                         chunksize = chunksize)
    basename = os.path.basename(filepath).split('.')[0]
    # Build the folder next to the input file explicitly instead of
    # string-replacing the file name inside the full path.
    outpath = os.path.join(os.path.dirname(filepath), outpath)
    os.makedirs(outpath, exist_ok=True)

    # Files already written during this call: these are appended to, any
    # other file is (re)created together with its header row.
    started = set()
    try:
        for iteration, chunk in enumerate(chunks, start=1):
            print("Working with chunk ", iteration)
            codes = chunk.CNTRYID.astype(int).astype(str)
            # sort=False keeps the countries in order of first appearance.
            by_country = chunk.groupby(codes, sort=False)
            for run, (Country, rows) in enumerate(by_country, start=1):
                print("Working with Country ", run, " out of ", by_country.ngroups)
                filename = os.path.join(outpath, basename + "_" + Country + '.csv')
                first_write = filename not in started
                rows.to_csv(filename,
                            mode = 'w' if first_write else 'a',
                            header = first_write,
                            index = False)
                started.add(filename)
            print("Completed all countries for this chunk. Moving to the next one.")
    finally:
        # Release the handle on the SAS file even if a chunk fails.
        chunks.close()
    print("Completed")

In [None]:
# Parse the 2018 codebook, save the tidy version to disk and preview it.
filename = "PISA2018_CODEBOOK.xlsx"
codebook_source = os.path.join(path, filename)
Codebook = read_codebook(codebook_source)
Codebook.to_excel(r'.\Processed Data\Codebook.xlsx', index=False)
Codebook.head()

In [None]:
# Load the student questionnaire responses from the SPSS file.
file_directory = "01 - Student Questionnaire"
filename = "CY07_MSU_STU_QQQ.sav"
student_sav = os.path.join(path, file_directory, filename)
StuData = pd.read_spss(student_sav)