In [51]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import pprint
import matplotlib.pyplot as plt
import yaml
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from IPython.display import Markdown, display

# utility function to print markdown string
def printmd(string):
    display(Markdown(string))

def read_params(config_path):
    with open(config_path) as yaml_file:
        config = yaml.safe_load(yaml_file)
    return config

def main(config_path, datasource):
    config = read_params(config_path)
    print(config)

In [52]:
default_config_path = os.path.join('..',"config", 'params.yaml')

default_config_path

'../config/params.yaml'

In [53]:
config = read_params(default_config_path)

In [54]:
config['data_preparation']['train_data_path']

'data/raw/train/train.csv'

In [55]:
df = pd.read_csv(os.path.join('..',config['data_preparation']['train_data_path']))


In [56]:
df.head()

Unnamed: 0,sid,sid_type,first_coop_code,first_dist_code,first_hs_code,first_dist_name,first_hs_name,first_hs_alt,first_hs_urbanicity,chrt_ninth,...,ihe_retention_lt_4_yr_part_time,ihe_federal_loan_rate,ihe_share_25_older,ihe_med_debt_completers_all,ihe_med_debt_completers_pmts,ihe_ihe_repay_3_yr_all,ihe_rate_4_yr,ihe_rate_lt_4_yr,ihe_med_earn_10_yrs_after,ihe_pct_earn_gt_25k_6_yrs_after
0,83596,Fake record,GRREC,169,1514,Falcon,Falcon,0,Rural: Distant,2009,...,0.4522,0.2661,0.3929,9740.0,108.13397,0.556974,,0.398137,25000.0,0.44169
1,28878,Fake record,OVEC,217,2770,Stadium,Watercress,0,Rural: Fringe,2009,...,,0.4597,0.0824,20500.0,227.59203,0.832144,0.604257,,41500.0,0.681255
2,41365,Fake record,KVEC,589,8428,Barton,Independence,0,Rural: Remote,2009,...,0.4203,0.2813,0.4498,12654.0,140.48534,0.423154,,0.210462,23000.0,0.396902
3,88573,Fake record,CKEC,262,3820,Oceanview,Burton,0,City: Large,2009,...,,0.542,0.1831,23275.0,258.40021,0.777563,0.392686,,35500.0,0.584031
4,67264,Fake record,CKEC,262,3830,Oceanview,Graham,0,City: Large,2009,...,,,,,,,,,,


In [57]:
dim = df.shape
printmd(f"Dataset Dimension: **{df.shape[0]}** rows,  **{df.shape[1]}** columns")

Dataset Dimension: **42353** rows,  **68** columns

In [58]:
df.dtypes

sid                                  int64
sid_type                            object
first_coop_code                     object
first_dist_code                      int64
first_hs_code                        int64
                                    ...   
ihe_ihe_repay_3_yr_all             float64
ihe_rate_4_yr                      float64
ihe_rate_lt_4_yr                   float64
ihe_med_earn_10_yrs_after          float64
ihe_pct_earn_gt_25k_6_yrs_after    float64
Length: 68, dtype: object

In [59]:
df.isna().sum()

sid                                    0
sid_type                               0
first_coop_code                        0
first_dist_code                        0
first_hs_code                          0
                                   ...  
ihe_ihe_repay_3_yr_all             25455
ihe_rate_4_yr                      31503
ihe_rate_lt_4_yr                   36305
ihe_med_earn_10_yrs_after          25455
ihe_pct_earn_gt_25k_6_yrs_after    25455
Length: 68, dtype: int64

In [60]:
df.loc[1000] = df.loc[1].copy()

def find_duplicates(data):
    total_obs = len(data.index)
    uniq_obs = len(data.drop_duplicates())
    dups = total_obs - uniq_obs
    if total_obs == uniq_obs:
        printmd("**No duplicates Found!**")
        printmd(f'Total Obs:  **{total_obs}**')
        printmd(f'Unique Obs:  **{uniq_obs}**')
    else:
        printmd(f'**Duplicates Found:**  **{dups}**')
        dups_list = data.loc[data.duplicated(keep = False)]
        return dups_list
        

a = find_duplicates(df)

**Duplicates Found:**  **1**

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42353 entries, 0 to 42352
Data columns (total 68 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   sid                              42353 non-null  int64  
 1   sid_type                         42353 non-null  object 
 2   first_coop_code                  42353 non-null  object 
 3   first_dist_code                  42353 non-null  int64  
 4   first_hs_code                    42353 non-null  int64  
 5   first_dist_name                  42353 non-null  object 
 6   first_hs_name                    42353 non-null  object 
 7   first_hs_alt                     42353 non-null  int64  
 8   first_hs_urbanicity              41977 non-null  object 
 9   chrt_ninth                       42353 non-null  int64  
 10  male                             42345 non-null  float64
 11  race_ethnicity                   41822 non-null  object 
 12  frpl              

In [62]:
printmd("**Unique Values for each feature in the data set**")
df.nunique().T

**Unique Values for each feature in the data set**

sid                                42352
sid_type                               1
first_coop_code                        9
first_dist_code                      171
first_hs_code                        389
                                   ...  
ihe_ihe_repay_3_yr_all                60
ihe_rate_4_yr                         41
ihe_rate_lt_4_yr                      20
ihe_med_earn_10_yrs_after             54
ihe_pct_earn_gt_25k_6_yrs_after       60
Length: 68, dtype: int64

In [63]:
df.describe(include = ['object']).T

Unnamed: 0,count,unique,top,freq
sid_type,42353,1,Fake record,42353
first_coop_code,42353,9,GRREC,7403
first_dist_name,42353,171,Snowy Hill,6533
first_hs_name,42353,389,Sylvan,560
first_hs_urbanicity,41977,11,Rural: Fringe,9286
race_ethnicity,41822,5,White,33904
ihe_name_yr1,17952,63,Cardinal University,1768
ihe_barrons_rank_2013,10569,6,Competitive,5094
ihe_degrees_awarded_predominant,16899,3,Predominantly bachelor's-degree granting,10668
ihe_degrees_awarded_highest,16899,3,Graduate degree,10408


In [64]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sid,42353.0,55731.661606,32336.762139,5.0,27702.0,55651.0,83620.0,111990.0
first_dist_code,42353.0,379.983874,163.513783,103.0,238.0,388.0,538.0,613.0
first_hs_code,42353.0,5259.449484,2785.588361,1106.0,2850.0,5294.0,7492.0,9902.0
first_hs_alt,42353.0,0.053408,0.224849,0.0,0.0,0.0,0.0,1.0
chrt_ninth,42353.0,2009.0,0.0,2009.0,2009.0,2009.0,2009.0,2009.0
male,42345.0,0.519636,0.49962,0.0,0.0,1.0,1.0,1.0
frpl,41796.0,0.604149,0.489039,0.0,0.0,1.0,1.0,1.0
iep,42353.0,0.112247,0.315674,0.0,0.0,0.0,0.0,1.0
ell,42353.0,0.015701,0.124319,0.0,0.0,0.0,0.0,1.0
gifted,42353.0,0.190022,0.392323,0.0,0.0,0.0,0.0,1.0


In [65]:
categorical = df.select_dtypes('object')
for c in categorical:
    print(f'Categories in {c} variable:     ')
    print(df[c].unique())

Categories in sid_type variable:     
['Fake record']
Categories in first_coop_code variable:     
['GRREC' 'OVEC' 'KVEC' 'CKEC' 'SESC' 'Jeffco' 'WKEC' 'NKCES' 'KEDC']
Categories in first_dist_name variable:     
['Falcon' 'Stadium' 'Barton' 'Oceanview' 'Herald Square' 'Sandpiper'
 'Floyd' 'Williams' 'Horan' 'Orozco' 'Snowy Hill' 'Englewood' 'Marino'
 'Providence' 'New Beacon' 'Jackson' 'Foster' 'Oriole' 'Highland'
 'Halcyon' 'Brookfield' 'Sage' 'Wrigley' 'Garner' 'Astro' 'Plessy' 'Felix'
 'Quigley' 'Marsden' 'Hyde' 'Birdseye' 'Adolphus' 'Sterling' 'Orange'
 'Aristides' 'Majestic' 'Zirvas' 'Stony Brae' 'Isaac Newton' 'Nobscot'
 'Common Way' 'Village' 'Lark Valley' 'Goodacre' 'Baker' 'Labyrinth'
 'McAfee' 'Midlands' 'Countryside' 'Elmwood' 'Wilson' 'Oak Tree' 'Xavier'
 'Athelstane' 'Franklin' 'Houghton' 'Jubilee' 'Sabre' 'Astoria' 'Arbor'
 'Heatherland' 'Reed' 'Charles' 'Coleman' 'Columbia' 'Kingfisher'
 'Thunder Gulch' 'Marshall' 'North Star' 'Clayton' 'Lang' 'Wayland'
 'Hercules' 'Hob

In [40]:
import os
from pathlib import Path
import streamlit as st
from typing import Dict
from typing import List
from typing import Text

def get_report_name(path: Path) -> Text:
    """Convert report path to human readable name.

    Args:
        path (Path): Report path.

    Returns:
        Text: human readable name.
    """

    name: Text = path.with_suffix("").name.replace("_", " ").capitalize()

    return name


def get_reports_mapping(period_dir: Text) -> Dict[Text, Path]:
    """Build dictionary where human readable names corresponds to paths.
    Note: each directory gets suffix ` (folder)`.

    Args:
        paths (List[Path]): List of paths.

    Returns:
        Dict[Text, Path]: Dictionary with structure:
        {
            <Name>: <path>
        }

    Examples:
    >>> paths = [
        'reports/2011-02-12_2011-02-18/data_quality',
        'reports/2011-02-12_2011-02-18/model_performance',
        'reports/2011-02-12_2011-02-18/data_drift.html',
        'reports/2011-02-12_2011-02-18/data_quality.html',
        'reports/2011-02-12_2011-02-18/model_performance.html',
        'reports/2011-02-12_2011-02-18/target_drift.html'
    ]
    >>> report_paths_to_names(paths)
    {
        'Data drift': 'Path(reports/2011-02-12_2011-02-18/data_drifts.html)',
        'Data quality(folder)': 'Path(reports/2011-02-12_2011-02-18/data_quality)',
        'Data quality': 'Path(reports/2011-02-12_2011-02-18/data_quality.html)',
        'Model performance (folder)': 'Path(reports/2011-02-12_2011-02-18/model_performance)',
        'Model performance': 'Path(reports/2011-02-12_2011-02-18/model_performance.html)',
        'Target drift': 'Path(reports/2011-02-12_2011-02-18/target_drift.html)'
    }
    """

    names: List[Text] = []
    paths: List[Path] = []
    
    for filename in os.listdir(period_dir):
        if not filename.startswith("."):
            paths.append(Path(f"{period_dir}/{filename}"))
    paths.sort()

    for path in paths:
        name: Text = get_report_name(path)
        if path.is_dir():
            name += " (folder)"
        names.append(name)

    return dict(zip(names, paths))

In [41]:
a = get_reports_mapping('../reports')


In [42]:
a['Data quality report']

PosixPath('../reports/data_quality_report.html')

In [None]:
def select_report(report_names: List[Text]) -> Text:
    """Select a report name from a selectbox.

    Args:
        report_names (List[Text]): Available report names.

    Raises:
        EntityNotFoundError: If report name list is empty.

    Returns:
        Text: Report name.
    """

    if not report_names:
        raise EntityNotFoundError("🔍 Reports not found")

    selected_report_name: Text = st.sidebar.selectbox(
        label="📈 Select report", options=report_names
    )

    return selected_report_name