In [None]:
import pandas as pd

Extract

Read in data for each school, specifying column types where necessary.
Rename columns.
Add column for source file.

School A

In [None]:
# Identifiers are read as strings rather than numbers; scores use pandas'
# nullable integer type, which can hold missing values.
school_a_raw = pd.read_csv(
    "raw data/schoolA.csv",
    dtype={
        "StudentID": str,
        "SchoolID": str,
        "Mathematics_Score": "Int64",
        "Reading_Score": "Int64",
    },
)

# Only the columns whose names change are listed. The score and
# met-growth-target columns keep the names they have in the file.
s_a = school_a_raw.rename(
    columns={
        "StudentID": "Student_id",
        "SchoolID": "School_code",
        "Test_Date": "Assessment_date",
    }
).assign(source_file="schoolA.csv")

# Show a few random rows as a quick visual check of the load.
s_a.sample(3)

School B

In [None]:
# Identifiers are read as strings rather than numbers; scores use pandas'
# nullable integer type, which can hold missing values.
school_b_raw = pd.read_csv(
    "raw data/schoolB.csv",
    dtype={
        "Student_ID": str,
        "SchoolID": str,
        "Mathematics_Score": "Int64",
        "Reading_Score": "Int64",
    },
)

# This file spells two headers differently from the other schools
# ("Student_ID", "TestDate"). Only the columns whose names change are listed;
# the score and met-growth-target columns keep the names they have in the file.
s_b = school_b_raw.rename(
    columns={
        "Student_ID": "Student_id",
        "SchoolID": "School_code",
        "TestDate": "Assessment_date",
    }
).assign(source_file="schoolB.csv")

# Show a few random rows as a quick visual check of the load.
s_b.sample(3)

School C

In [None]:
# Identifiers are read as strings rather than numbers; scores use pandas'
# nullable integer type, which can hold missing values.
school_c_raw = pd.read_csv(
    "raw data/schoolC.csv",
    dtype={
        "StudentID": str,
        "SchoolID": str,
        "Mathematics_Score": "Int64",
        "Reading_Score": "Int64",
    },
)

# Only the columns whose names change are listed. The score and
# met-growth-target columns keep the names they have in the file.
s_c = school_c_raw.rename(
    columns={
        "StudentID": "Student_id",
        "SchoolID": "School_code",
        "Test_Date": "Assessment_date",
    }
).assign(source_file="schoolC.csv")

# Show a few random rows as a quick visual check of the load.
s_c.sample(3)

School D

In [None]:
# Identifiers are read as strings rather than numbers; scores use pandas'
# nullable integer type, which can hold missing values.
school_d_raw = pd.read_csv(
    "raw data/schoolD.csv",
    dtype={
        "StudentID": str,
        "SchoolID": str,
        "Mathematics_Score": "Int64",
        "Reading_Score": "Int64",
    },
)

# Only the columns whose names change are listed. The score and
# met-growth-target columns keep the names they have in the file.
s_d = school_d_raw.rename(
    columns={
        "StudentID": "Student_id",
        "SchoolID": "School_code",
        "Test_Date": "Assessment_date",
    }
).assign(source_file="schoolD.csv")

# Show a few random rows as a quick visual check of the load.
s_d.sample(3)

School E

In [None]:
# Despite its .csv extension, this file is parsed as tab-separated text.
# Identifiers are read as strings rather than numbers; scores use pandas'
# nullable integer type, which can hold missing values.
school_e_raw = pd.read_csv(
    "raw data/schoolE.csv",
    sep="\t",
    dtype={
        "StudentID": str,
        "SchoolID": str,
        "Mathematics_Score": "Int64",
        "Reading_Score": "Int64",
    },
)

# Only the columns whose names change are listed. The score and
# met-growth-target columns keep the names they have in the file.
s_e = school_e_raw.rename(
    columns={
        "StudentID": "Student_id",
        "SchoolID": "School_code",
        "Test_Date": "Assessment_date",
    }
).assign(source_file="schoolE.csv")

# Show a few random rows as a quick visual check of the load.
s_e.sample(3)

Concatenate into single table.

In [None]:
# Columns kept in the combined table, in output order.
output_columns = [
    "Student_id",
    "School_code",
    "Assessment_date",
    "Mathematics_Score",
    "Mathematics_Met_Growth_Target",
    "Reading_Score",
    "Reading_Met_Growth_Target",
    "source_file",
]

# ignore_index=True gives the combined table a fresh 0..n-1 row index.
# Without it each school's rows keep their own 0-based labels, so one label
# would refer to several rows at once and label-based lookups become ambiguous.
s_concat = pd.concat([s_a, s_b, s_c, s_d, s_e], ignore_index=True).loc[
    :, output_columns
]

Transform

Left-pad student ids (to 10 characters) and school codes (to 6 characters) with zeros for consistency.
Standardize assessment dates.
Recode values for met growth target to boolean.

In [None]:
# Left-pad the identifiers with zeros so every value has the same width
# (10 characters for students, 6 for schools).
s_concat["Student_id"] = s_concat["Student_id"].str.pad(
    width=10, side="left", fillchar="0"
)
s_concat["School_code"] = s_concat["School_code"].str.pad(
    width=6, side="left", fillchar="0"
)

# format="mixed" lets pandas infer the date format for each value
# individually instead of assuming one format for the whole column.
s_concat["Assessment_date"] = pd.to_datetime(
    s_concat["Assessment_date"], format="mixed"
)

# Starred and unstarred answers are recoded to the same boolean.
condition_metgrowth = {"Yes": True, "Yes*": True, "No": False, "No*": False}
metgrowth_columns = ["Mathematics_Met_Growth_Target", "Reading_Met_Growth_Target"]
for metgrowth_column in metgrowth_columns:
    # .map() yields NaN for any value that is not a key of the mapping,
    # including missing values. Cast to the nullable "boolean" dtype so those
    # rows stay missing (<NA>); a plain ``bool`` cast would silently turn
    # every NaN into True and report a growth target as met.
    # NOTE: the mapping only has string keys, so re-running this cell on
    # already-converted columns would blank them out. Re-run from the
    # concatenation cell instead.
    s_concat[metgrowth_column] = (
        s_concat[metgrowth_column].map(condition_metgrowth).astype("boolean")
    )

Verify data types.

In [None]:
# Display the dtype of every column so the conversions above can be checked.
s_concat.dtypes

Load

Write to csv.

In [None]:
# Save the combined table; the row index is omitted from the file.
s_concat.to_csv(path_or_buf="schoolsAll.csv", index=False)

# Show a few random rows of what was written.
s_concat.sample(3)

Error File

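Create a flag column for each null student id, school code, assessment date, math score, or reading score, plus an overall `is_error` flag that is true when any of them is set.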

In [None]:
# Each flag column (key) reports missing values in one source column (value).
null_checks = {
    "student_isnull": "Student_id",
    "school_isnull": "School_code",
    "date_isnull": "Assessment_date",
    "math_isnull": "Mathematics_Score",
    "reading_isnull": "Reading_Score",
}

# Copy of the combined table with one boolean flag column per check.
err = s_concat.assign(
    **{flag: s_concat[column].isna() for flag, column in null_checks.items()}
)

# A row is an error as soon as any one of its flags is set.
err["is_error"] = err[list(null_checks)].any(axis=1)

Write to csv.

In [None]:
# Save the flagged table; the row index is omitted from the file.
err.to_csv(path_or_buf="error.csv", index=False)

# Show a few random rows of what was written.
err.sample(3)