# PISA Analysis 

## Technical Setup

In [20]:
import os 
import pandas as pd
from sas7bdat import SAS7BDAT
# Root folder of the PISA 2018 download; later cells join sub-folder and file
# names onto this value with os.path.join.
path = r"D:\Data Science Folder\PISA Analysis\Data\2018"

# Exporting the data to CSV
Although pandas can parse the SAS files, they are too large to load into memory in one go. I therefore use the `sas7bdat` package to export each file to CSV; the CSV files can then be read in the usual way.

In [25]:
# Export every PISA 2018 SAS file to a CSV file stored in the same folder.
BasePath = r"D:\Data Science Folder\PISA Analysis\Data\2018"
FilePath = ["01 - Student Questionnaire",
            "02 - Cognitive Item",
            "03 - Teacher Questionnaire",
            "04 - School Questionnaire",
            "05 - Questionnaire Timing"]
FileName = ["cy07_msu_stu_qqq.sas7bdat",
            "cy07_msu_stu_cog.sas7bdat",
            "cy07_msu_tch_qqq.sas7bdat",
            "cy07_msu_sch_qqq.sas7bdat",
            "cy07_msu_stu_tim.sas7bdat"
           ]
# FilePath and FileName are parallel lists: entry i of one belongs to entry i
# of the other. Dedicated loop names are used so that the notebook-level
# `path` (the data root that later cells join file names onto) is not
# overwritten by the export.
for sas_folder, sas_name in zip(FilePath, FileName):
    sas_path = os.path.join(BasePath, sas_folder, sas_name)
    # Only the extension changes, so the CSV lands next to its SAS source.
    csv_path = os.path.splitext(sas_path)[0] + '.csv'
    # The context manager closes the SAS file handle once it is converted.
    with SAS7BDAT(sas_path) as reader:
        reader.convert_file(csv_path)


SAS7BDAT file: cy07_msu_stu_qqq.sas7bdat
SAS7BDAT file: cy07_msu_stu_cog.sas7bdat
SAS7BDAT file: cy07_msu_tch_qqq.sas7bdat
SAS7BDAT file: cy07_msu_sch_qqq.sas7bdat
SAS7BDAT file: cy07_msu_stu_tim.sas7bdat


## Describing the data
The data is available at the official PISA website (link here) in two formats, SAS and SPSS. I proceed with the SPSS data since it requires reading only one file. 


### The Codebook 
The first important file to download is the codebook. PISA data is collected in questionnaires, each item of which is codified. Both the variable name (column) and the values (entries) may need to be joined with the codebook to get their labels. 

In [None]:
def read_codebook(path = r"D:\Data Science Folder\PISA Analysis\Data\2018\PISA2018_CODEBOOK.xlsx"):
    """
    Description: This function reads the codebook for PISA 2018 and flattens it
    into one row per (variable, value) pair. It may work for older versions of
    PISA assuming they follow the same format.

    Every sheet is expected to provide the columns NAME, VARLABEL, VAL and
    LABEL. A row with both NAME and VARLABEL filled in starts a new variable;
    the rows below it, up to the next such row, are treated as further values
    of that variable.

    Inputs:
        - path: the location where to find the codebook file in excel format.
    Outputs:
        - Codebook: A DataFrame containing codes and labels of the variables and
          values, with the columns Value, Val_Label, Variable, Var_Label and
          DataFile (the name of the sheet the row came from). Rows are numbered
          consecutively across all sheets.
    """
    per_sheet = []

    # One workbook handle serves every sheet and is closed when the block ends.
    with pd.ExcelFile(path) as workbook:
        for sheet in workbook.sheet_names:
            df = pd.read_excel(workbook, sheet_name = sheet).reset_index(drop = True)

            # Carry the variable name and label down from each header row to
            # the value rows below it. This replaces the earlier row-gap
            # arithmetic, which produced a non-positive repeat count for the
            # last variable of a sheet.
            is_header = df['NAME'].notna() & df['VARLABEL'].notna()
            variable = df['NAME'].where(is_header).ffill()
            var_label = df['VARLABEL'].where(is_header).ffill()

            # Keep only rows that actually describe a value.
            has_value = df['VAL'].notna() & df['LABEL'].notna()

            per_sheet.append(pd.DataFrame({
                'Value': df.loc[has_value, 'VAL'],
                'Val_Label': df.loc[has_value, 'LABEL'],
                'Variable': variable[has_value],
                'Var_Label': var_label[has_value],
                'DataFile': sheet,
            }))

    if not per_sheet:
        return pd.DataFrame()

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    Codebook = pd.concat(per_sheet, ignore_index = True)
    return Codebook

def get_varlabels(DataFrame, VarLabels, item = None):
    """
    Description: This function takes a dataframe and passes it through the codebook to get the variable labels.
    Columns that have no entry in the codebook for the requested item are dropped.

    Inputs:
        - DataFrame: A pandas DataFrame containing the data for which the variable labels are needed
        - VarLabels: A pandas DataFrame containing the Codebook read variable labels.
          It must provide the columns ITEM, NAME and VARLABEL.
        - item: The ITEM value whose labels should be used. A trailing '.sav' is
          removed, so a data file name can be passed directly. When omitted, the
          notebook-level variable `filename` is used, as in earlier versions.
    Outputs:
        - DataFrame: A new DataFrame holding the matched columns, renamed to their
          variable labels. The input DataFrame is left unchanged.
    """
    if item is None:
        # Backward-compatible fallback: older calls relied on whatever file
        # name the notebook had assigned last.
        item = filename
    item = item.replace('.sav','')

    # Getting the variable labels
    left = VarLabels.loc[VarLabels['ITEM'] == item, ['NAME', 'VARLABEL']]
    right = pd.DataFrame({'NAME':DataFrame.columns})
    NewCols = pd.merge(left, right, how = 'inner', on = 'NAME')

    # Replacing the variable labels
    DataFrame = DataFrame.loc[:, NewCols['NAME']]
    DataFrame.columns = NewCols['VARLABEL']

    return DataFrame

def ReadSASDataFile(
    filepath = r"D:\Data Science Folder\PISA Analysis\2018_Cognitive_Item_Data_Files\cy07_msu_stu_cog.sas7bdat",
    chunksize = 50000,
    outpath = 'Exported',
    encoding = "Latin1"):
    """
    Description: This function reads a SAS data file in chunks and splits it into
    one CSV file per country, using the CNTRYID column of the data.

    The CSV files are named "<input file name>_<country code>.csv" and are written
    to a folder called `outpath` that sits next to the input file. A file left
    over from an earlier run is overwritten the first time its country is met;
    rows from later chunks of the same run are appended to it.

    Inputs:
        - filepath: the location of the SAS file to read.
        - chunksize: the number of rows to read per chunk.
        - outpath: the name of the output folder, created if it is missing.
        - encoding: the text encoding passed on to pandas.read_sas.
    Outputs:
        - None. The result is the set of CSV files written to disk.
    """
    import os
    import pandas as pd

    print("Filepath:", filepath)
    basename = os.path.basename(filepath).split('.')[0]
    outpath = os.path.join(os.path.dirname(filepath), outpath)
    os.makedirs(outpath, exist_ok = True)

    # Files already written during this run; these get appended to, while any
    # other file is (re)created together with its header row.
    started = set()

    # The reader is a context manager, so the SAS file is closed even when a
    # chunk fails half way through.
    with pd.read_sas(filepath,
                     encoding = encoding,
                     chunksize = chunksize) as chunks:
        for iteration, chunk in enumerate(chunks, start = 1):
            print("Working with chunk ", iteration)
            # Convert the country column once per chunk instead of once per country.
            country_of_row = chunk.CNTRYID.astype(int).astype(str)
            CountryCodes = country_of_row.drop_duplicates()
            for run, Country in enumerate(CountryCodes, start = 1):
                print("Working with Country ", run, " out of ", len(CountryCodes))
                filename = os.path.join(outpath, basename + "_" + Country + '.csv')
                first_write = filename not in started
                (chunk[country_of_row == Country]
                 .to_csv(filename,
                         mode = 'w' if first_write else 'a',
                         header = first_write,
                         index = False)
                )
                started.add(filename)
            print("Completed all countries for this chunk. Moving to the next one.")
    print("Completed")

In [None]:
# Flatten the codebook workbook, keep a copy on disk, and preview the result.
filename = "PISA2018_CODEBOOK.xlsx"
Codebook = read_codebook(path=os.path.join(path, filename))
Codebook.to_excel(excel_writer=r'.\Processed Data\Codebook.xlsx', index=False)
Codebook.head(5)

In [None]:
# Split the questionnaire-timing SAS file into per-country CSV files.
ReadSASDataFile(
    r'D:\Data Science Folder\PISA Analysis\Data\2018\05 - Questionnaire Timing\cy07_msu_stu_tim.sas7bdat'
)

In [None]:
# Split the teacher questionnaire SAS file into per-country CSV files,
# decoding its text as UTF-8 rather than the function's default encoding.
ReadSASDataFile(
    encoding='utf-8',
    filepath=r'D:\Data Science Folder\PISA Analysis\Data\2018\03 - Teacher Questionnaire\cy07_msu_tch_qqq.sas7bdat',
)

In [None]:
# Number of codebook rows describing CNTRYID in the teacher questionnaire.
(Codebook
 .loc[(Codebook['Variable'] == "CNTRYID") & (Codebook['DataFile'] == "CY07_MSU_TCH_QQQ")]
 .shape[0]
)

In [None]:
# Number of distinct value labels among those same CNTRYID rows.
len(
    Codebook
    .loc[(Codebook['Variable'] == "CNTRYID") & (Codebook['DataFile'] == "CY07_MSU_TCH_QQQ"),
         'Val_Label']
    .drop_duplicates()
)

### The Questionnaire Data
The PISA data is composed of four questionnaires: 
 1. The Student Questionnaire: Students were asked to evaluate their knowledge and ability. This is just as much a measure of a student relative to their peers as it is of a teacher, a school, and a country relative to others. 
 2. The Student Cognitive Questionnaire: Students were also asked to complete a cognitive test. 
 3. The Teacher Questionnaire: Teachers were asked questions with the intent of describing the immediate environment where students are learning. 
 4. The School Questionnaire: School principals were also asked to describe the state of the school with the goal of measuring their ability to create an environment that fosters learning. 


#### The Student Questionnaire: 
This data file is quite big (1.8 GB). 

In [None]:
# Load the student questionnaire (SPSS format) from its sub-folder of `path`.
file_directory = "01 - Student Questionnaire"
filename = "CY07_MSU_STU_QQQ.sav"
StuData = pd.read_spss(
    path=os.path.join(path, file_directory, filename),
    convert_categoricals=True,
)

#### The Cognitive Student Questionnaire: 
This data file is even bigger (2.8 GB). 

In [None]:
# Load the cognitive item data (SPSS format) without converting its columns
# to categoricals.
file_directory = "02 - Cognitive Item"
filename = "CY07_MSU_STU_COG.sav"
CogData = pd.read_spss(
    path=os.path.join(path, file_directory, filename),
    convert_categoricals=False,
)



#### The Teacher Questionnaire: 
This data file is not very big (57 MB) 

In [None]:
# Load the teacher questionnaire (SPSS format) from its sub-folder of `path`.
file_directory = "03 - Teacher Questionnaire"
filename = "CY07_MSU_TCH_QQQ.sav"
TchData = pd.read_spss(
    path=os.path.join(path, file_directory, filename),
    convert_categoricals=True,
)

#### The School Questionnaire: 

This data file is small (10 MB) 

In [None]:
# Reading the file
# The folder name matches the spelling used in the export cell's FilePath
# list, so the path also resolves on case-sensitive file systems.
file_directory = "04 - School Questionnaire"
filename = "CY07_MSU_SCH_QQQ.sav"
SchData = pd.read_spss(os.path.join(path, file_directory,filename))

# Replace the coded column names with their variable labels.
# NOTE(review): `VarLabels` is not assigned in any cell shown here - confirm
# where it is built (it needs the columns ITEM, NAME and VARLABEL).
SchData = get_varlabels(SchData, VarLabels)


In [None]:
# Show the column labels at positions 100 to 219.
SchData.columns.to_frame().iloc[100:220]

In [None]:
# Distinct school identifiers recorded for the Dominican Republic.
(SchData
 .loc[SchData['Country Identifier'] == 'Dominican Republic', ['Intl. School ID']]
 .drop_duplicates()
)

In [None]:
# Distinct answers to the community-type question for the Dominican Republic.
(SchData
 .loc[SchData['Country Identifier'] == 'Dominican Republic',
      ["Which of the following definitions best describes the community in which your school is located?"]]
 .drop_duplicates()
)


In [None]:
# Number of distinct Dominican schools per community type.
(SchData
 .loc[SchData['Country Identifier'] == 'Dominican Republic']
 .groupby("Which of the following definitions best describes the community in which your school is located?")
 [['Intl. School ID']]
 .nunique()
)

In [None]:
# Number of distinct Dominican schools per answer to the public/private question.
(SchData
 .loc[SchData['Country Identifier'] == 'Dominican Republic']
 .groupby("Is your school a public or a private school?")
 [['Intl. School ID']]
 .nunique()
)

In [None]:
# Average of every "Percentage of total funding..." column over the Dominican
# schools whose public/private answer contains "private".
(SchData
 .loc[(SchData['Country Identifier'] == 'Dominican Republic')
      &
      (SchData['Is your school a public or a private school?'].str.contains('private'))]
 .loc[:, SchData.columns.str.startswith('Percentage of total funding')]
 .mean()
)

In [None]:
# Cross-tabulate, for Dominican schools whose public/private answer contains
# "public", how often each answer was given to every
# "School's use of assessments of students:" question.
(SchData
 [(SchData['Country Identifier'] == 'Dominican Republic')
  &
  (SchData['Is your school a public or a private school?'].str.contains('public'))]
 .iloc[:,SchData.columns.str.startswith("School's use of assessments of students:")]
 # Long format: one row per (question, answer) pair.
 # NOTE(review): the 'VARLABEL' column used below presumably comes from the
 # name get_varlabels leaves on the column index - confirm.
 .melt()
 # The row number becomes an 'index' column that is counted below.
 .reset_index()
 .groupby(['VARLABEL', 'value'])
 .agg('count')
 .reset_index()
 # pivot takes its arguments by keyword only in pandas 2.x.
 .pivot(index='VARLABEL', columns='value', values='index')
)

In [None]:
import pyreadstat as spss

In [None]:
# Read only the first 100 rows of the cognitive item file and preview them.
file_directory = "02 - Cognitive Item"
filename = "CY07_MSU_STU_COG.sav"
# Only the first element of what read_sav returns is kept.
CogData = spss.read_sav(
    os.path.join(path, file_directory, filename),
    row_limit=100,
)[0]
CogData.head(5)

In [None]:
import os

# Scratch check of the file-name logic: take the last path component and keep
# everything before its first dot.
filepath = r"D:\Data Science Folder\PISA Analysis\2018_Cognitive_Item_Data_Files\cy07_msu_stu_cog.sas7bdat"
filename = os.path.basename(filepath).partition('.')[0]

filename

In [None]:
    # Open the SAS file named by `filepath` for reading 50000 rows at a time.
    # NOTE(review): this cell is indented; a plain Python interpreter rejects
    # it with "unexpected indent" - dedent it if the notebook is exported to
    # a script.
    chunks = pd.read_sas(filepath,
                         encoding = 'Latin1',
                         chunksize = 50000)

In [None]:
# Run the per-country export with every default argument, i.e. on the
# cognitive item SAS file named in the function signature.
ReadSASDataFile()