In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import pprint
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from IPython.display import Markdown, display

# utility function to print markdown string
def printmd(string):
    """Render ``string`` as Markdown in the notebook's output area."""
    rendered = Markdown(string)
    display(rendered)

def read_params(config_path):
    """Load a YAML configuration file and return its parsed content.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Imported here because the notebook never imports yaml at the top; without
    # this the function raises NameError on a fresh kernel (Restart & Run All).
    import yaml

    # YAML files are UTF-8 by convention; do not rely on the platform default.
    with open(config_path, encoding="utf-8") as yaml_file:
        config = yaml.safe_load(yaml_file)
    return config

def main(config_path, datasource):
    """Read the configuration at ``config_path`` and print it.

    Args:
        config_path: Path handed to ``read_params``.
        datasource: Accepted but not used by this function.
    """
    print(read_params(config_path))

In [48]:
# Parameters file, addressed relative to the parent of the working directory.
default_config_path = os.path.join("..", "config", "params.yaml")

default_config_path

'../config/params.yaml'

In [49]:
# Parse the YAML parameters file into `config` for the cells below.
config = read_params(default_config_path)

In [50]:
# Show the training-data path stored under data_preparation.train_data_path.
config['data_preparation']['train_data_path']

'data/raw/train/train.csv'

In [53]:
# Load the training CSV; the configured path is joined onto '..', so it is
# resolved from the parent of the working directory.
df = pd.read_csv(os.path.join('..',config['data_preparation']['train_data_path']))


In [54]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,sid,school_year,school_code,s_male,s_birth_year,s_race,s_frpl,s_ell,s_iep,s_grade_level,...,std_scaled_e,sat_math_score,sat_verbal_score,sat_writing_score,hs_diploma,hs_diploma_year,sch_charter,sch_alternative,sch_vocational,record_type
0,110745,2015,246,0,2002,Black,Free Lunch,0.0,0.0,6,...,0.373991,,,,0,,1,0,0,Fake data
1,109296,2011,173,1,2004,Black,Does Not Qualify,0.0,1.0,1,...,,,,,0,,0,0,0,Fake data
2,129402,2014,269,0,2003,White,Does Not Qualify,0.0,0.0,5,...,3.18794,,,,0,,0,0,0,Fake data
3,236260,2016,331,0,2002,White,Reduced Lunch,0.0,0.0,8,...,1.397873,,,,0,,0,0,0,Fake data
4,225007,2016,167,0,2004,Black,Reduced Lunch,0.0,0.0,6,...,-0.255119,,,,0,,0,0,0,Fake data


In [55]:
# Capture the (rows, columns) shape once and report it as rendered markdown.
dim = df.shape
printmd(f"Dataset Dimension: **{dim[0]}** rows,  **{dim[1]}** columns")

Dataset Dimension: **607616** rows,  **23** columns

In [56]:
# Column dtypes as inferred by read_csv.
df.dtypes

sid                    int64
school_year            int64
school_code            int64
s_male                 int64
s_birth_year           int64
s_race                object
s_frpl                object
s_ell                float64
s_iep                float64
s_grade_level         object
s_days_absent        float64
s_days_suspended     float64
std_scaled_m         float64
std_scaled_e         float64
sat_math_score       float64
sat_verbal_score     float64
sat_writing_score    float64
hs_diploma             int64
hs_diploma_year      float64
sch_charter            int64
sch_alternative        int64
sch_vocational         int64
record_type           object
dtype: object

In [57]:
# Number of missing values per column.
df.isna().sum()

sid                       0
school_year               0
school_code               0
s_male                    0
s_birth_year              0
s_race                    0
s_frpl                13949
s_ell                 13949
s_iep                 13949
s_grade_level         13949
s_days_absent        209567
s_days_suspended      13949
std_scaled_m         213365
std_scaled_e         214791
sat_math_score       523770
sat_verbal_score     523770
sat_writing_score    523776
hs_diploma                0
hs_diploma_year      480518
sch_charter               0
sch_alternative           0
sch_vocational            0
record_type               0
dtype: int64

In [58]:
def find_duplicates(data):
    """Report fully duplicated rows in ``data`` and return them.

    A summary is rendered through ``printmd``. Rows are compared across all
    columns.

    Args:
        data (pd.DataFrame): Frame to inspect.

    Returns:
        pd.DataFrame or None: Every row that has an exact duplicate (all
        copies are included, ``keep=False``), or ``None`` when there are no
        duplicates.
    """
    total_obs = len(data.index)
    # duplicated() flags each repeat after the first occurrence, so the sum is
    # exactly the number of rows drop_duplicates() would remove.
    dups = int(data.duplicated().sum())
    uniq_obs = total_obs - dups
    if dups == 0:
        printmd("**No duplicates Found!**")
        printmd(f'Total Obs:  **{total_obs}**')
        printmd(f'Unique Obs:  **{uniq_obs}**')
        return None
    printmd(f'**Duplicates Found:**  **{dups}**')
    return data.loc[data.duplicated(keep=False)]


# Sanity-check the helper with a synthetic duplicate. The duplicate is injected
# into a copy so the real `df` used by every later cell is not modified.
df_with_dup = df.copy()
df_with_dup.loc[1000] = df_with_dup.loc[1].copy()

a = find_duplicates(df_with_dup)

**Duplicates Found:**  **1**

In [59]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607616 entries, 0 to 607615
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   sid                607616 non-null  int64  
 1   school_year        607616 non-null  int64  
 2   school_code        607616 non-null  int64  
 3   s_male             607616 non-null  int64  
 4   s_birth_year       607616 non-null  int64  
 5   s_race             607616 non-null  object 
 6   s_frpl             593667 non-null  object 
 7   s_ell              593667 non-null  float64
 8   s_iep              593667 non-null  float64
 9   s_grade_level      593667 non-null  object 
 10  s_days_absent      398049 non-null  float64
 11  s_days_suspended   593667 non-null  float64
 12  std_scaled_m       394250 non-null  float64
 13  std_scaled_e       392824 non-null  float64
 14  sat_math_score     83846 non-null   float64
 15  sat_verbal_score   83846 non-null   float64
 16  sa

In [60]:
# Heading first, then the per-column count of distinct values as the cell result.
printmd("**Unique Values for each feature in the data set**")
# nunique() already returns a Series, whose transpose is itself.
df.nunique()

**Unique Values for each feature in the data set**

sid                  192589
school_year               6
school_code             256
s_male                    2
s_birth_year             30
s_race                    7
s_frpl                    3
s_ell                     2
s_iep                     2
s_grade_level            14
s_days_absent           174
s_days_suspended         84
std_scaled_m           8601
std_scaled_e           8529
sat_math_score           61
sat_verbal_score         61
sat_writing_score        61
hs_diploma                2
hs_diploma_year           9
sch_charter               2
sch_alternative           2
sch_vocational            2
record_type               1
dtype: int64

In [61]:
# Summary of the object-dtype columns, transposed to one row per column.
df.describe(include = ['object']).T

Unnamed: 0,count,unique,top,freq
s_race,607616,7,White,310413
s_frpl,593667,3,Does Not Qualify,330309
s_grade_level,593667,14,9,53975
record_type,607616,1,Fake data,607616


In [73]:
# Default describe() summary, transposed to one row per column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sid,607616.0,162467.840083,59219.795774,60001.0,111073.0,162355.5,213869.0,264947.0
school_year,607616.0,2013.520154,1.700986,2011.0,2012.0,2014.0,2015.0,2016.0
school_code,607616.0,274.790065,75.374538,151.0,209.0,274.0,336.0,408.0
s_male,607616.0,0.516357,0.499733,0.0,0.0,1.0,1.0,1.0
s_birth_year,607616.0,2001.169642,4.250984,1978.0,1998.0,2001.0,2004.0,2015.0
s_ell,593667.0,0.053705,0.225435,0.0,0.0,0.0,0.0,1.0
s_iep,593667.0,0.152112,0.35913,0.0,0.0,0.0,0.0,1.0
s_days_absent,398049.0,9.627119,11.064338,0.0,3.0,7.0,12.0,307.0
s_days_suspended,593667.0,0.72004,2.880334,0.0,0.0,0.0,0.0,193.0
std_scaled_m,394250.0,-0.008436,0.992855,-3.937318,-0.695382,-0.070736,0.617424,5.728167


In [72]:
# List the distinct values (NaN included) of every object-dtype column.
categorical = df.select_dtypes('object')
for c in categorical.columns:
    print(f'Categories in {c} variable:     ')
    print(categorical[c].unique())

Categories in s_race variable:     
['Black' 'White' 'Asian' 'Multiple' 'Hispanic' 'Native American' 'Other']
Categories in s_frpl variable:     
['Free Lunch' 'Does Not Qualify' 'Reduced Lunch' nan]
Categories in s_grade_level variable:     
['6' '1' '5' '8' 'K' '9' '7' '2' '11' '10' '3' '12' '4' nan 'Pre-K']
Categories in record_type variable:     
['Fake data']


In [40]:
import os
from pathlib import Path
import streamlit as st
from typing import Dict
from typing import List
from typing import Text

def get_report_name(path: Path) -> Text:
    """Turn a report path into a display name.

    The final suffix is dropped, underscores become spaces, and the result is
    capitalized (first character upper-cased, the rest lower-cased).

    Args:
        path (Path): Location of the report file or folder.

    Returns:
        Text: Display name, e.g. ``data_quality.html`` -> ``Data quality``.
    """

    without_suffix: Path = path.with_suffix("")
    spaced: Text = without_suffix.name.replace("_", " ")
    return spaced.capitalize()


def get_reports_mapping(period_dir: Text) -> Dict[Text, Path]:
    """Map human readable report names to the entries of a directory.

    Entries whose name starts with ``.`` are skipped. The remaining paths are
    sorted, named with ``get_report_name``, and directories get the suffix
    `` (folder)``. If two entries produce the same name, the one that sorts
    later replaces the earlier one in the result.

    Args:
        period_dir (Text): Directory whose entries are listed.

    Returns:
        Dict[Text, Path]: Dictionary with structure:
        {
            <Name>: <path>
        }

    Examples:
        Given a directory ``reports/2011-02-12_2011-02-18`` that contains the
        folders ``data_quality`` and ``model_performance`` and the files
        ``data_drift.html``, ``data_quality.html``,
        ``model_performance.html`` and ``target_drift.html``::

            get_reports_mapping('reports/2011-02-12_2011-02-18')
            {
                'Data drift': Path('reports/2011-02-12_2011-02-18/data_drift.html'),
                'Data quality (folder)': Path('reports/2011-02-12_2011-02-18/data_quality'),
                'Data quality': Path('reports/2011-02-12_2011-02-18/data_quality.html'),
                'Model performance (folder)': Path('reports/2011-02-12_2011-02-18/model_performance'),
                'Model performance': Path('reports/2011-02-12_2011-02-18/model_performance.html'),
                'Target drift': Path('reports/2011-02-12_2011-02-18/target_drift.html')
            }
    """

    base_dir: Path = Path(period_dir)

    # Join with pathlib instead of string formatting; hidden entries are skipped.
    paths: List[Path] = sorted(
        base_dir / filename
        for filename in os.listdir(period_dir)
        if not filename.startswith(".")
    )

    mapping: Dict[Text, Path] = {}
    for path in paths:
        name: Text = get_report_name(path)
        if path.is_dir():
            name += " (folder)"
        mapping[name] = path

    return mapping

In [41]:
# Build the name -> path mapping for '../reports'.
# NOTE(review): this rebinds `a`, which the duplicate-check cell also assigns.
a = get_reports_mapping('../reports')


In [42]:
# Look up one report path by its human readable name.
a['Data quality report']

PosixPath('../reports/data_quality_report.html')

In [None]:
def select_report(report_names: List[Text]) -> Text:
    """Let the user pick one report name from a sidebar selectbox.

    Args:
        report_names (List[Text]): Names offered as selectbox options.

    Raises:
        EntityNotFoundError: If ``report_names`` is empty.

    Returns:
        Text: The value returned by the selectbox.
    """

    has_reports = bool(report_names)
    if not has_reports:
        raise EntityNotFoundError("🔍 Reports not found")

    return st.sidebar.selectbox(label="📈 Select report", options=report_names)