In [1]:
import pandas as pd
# Path to the raw loans CSV, relative to the notebook's working directory.
fp = "../../../data/sba_7a_loans.csv"
# NOTE(review): the saved output under this cell looks like the tail of a
# warning raised by this call -- confirm which warning and whether it matters.
df = pd.read_csv(filepath_or_buffer=fp)

  df = pd.read_csv(fp)


In [2]:
# One row per column of the loaded frame, with the dtype pandas inferred for it.
inferred_dtypes = dict(df.dtypes.items())
df_raw_meta = (
    pd.DataFrame.from_dict(inferred_dtypes, orient="index")
    .reset_index()
    .set_axis(["Attribute", "Inferred Type"], axis=1)
)
df_raw_meta

Unnamed: 0,Attribute,Inferred Type
0,AsOfDate,int64
1,Program,object
2,BorrName,object
3,BorrStreet,object
4,BorrCity,object
5,BorrState,object
6,BorrZip,int64
7,BankName,object
8,BankFDICNumber,float64
9,BankNCUANumber,float64


In [3]:
# Columns carried forward; every other column of the loaded frame is dropped.
subset_cols = [
    "BorrName",
    "BankFDICNumber",
    "BankZip",
    "BorrZip",
    "NaicsCode",
    "FranchiseCode",
    "BusinessAge",
    "LoanStatus",
    "SBAGuaranteedApproval",
]
df = df[subset_cols]

In [4]:
# Keep only the rows whose LoanStatus is one of the two tags listed here.
valid_loan_status_tags = ["PIF", "CHGOFF"]
has_valid_status = df["LoanStatus"].isin(valid_loan_status_tags)
df = df[has_valid_status]

In [5]:
# Count nulls once per retained column, then keep only the columns that have any.
null_count_pairs = ((col, df[col].isnull().sum()) for col in subset_cols)
missing_by_col = {col: n_missing for col, n_missing in null_count_pairs if n_missing > 0}
df_missing_vals = (
    pd.DataFrame.from_dict(missing_by_col, orient="index")
    .reset_index()
    .set_axis(["Attribute", "Missing Value Count"], axis=1)
)
df_missing_vals

Unnamed: 0,Attribute,Missing Value Count
0,BankFDICNumber,2122
1,FranchiseCode,19923
2,BusinessAge,53


In [6]:
# Every column listed in df_missing_vals gets its nulls replaced by a sentinel
# string, so numeric columns in that list end up holding a mix of numbers and text.
for incomplete_col in df_missing_vals["Attribute"]:
    df[incomplete_col] = df[incomplete_col].fillna(value="Not Applicable")

In [7]:
# Sanity check: columns that still contain nulls (an empty dict means none do).
remaining_null_pairs = ((col, df[col].isnull().sum()) for col in subset_cols)
{col: n_missing for col, n_missing in remaining_null_pairs if n_missing > 0}

{}

In [8]:
def _fdic_as_int(value):
    """Return the "Not Applicable" sentinel unchanged; cast anything else to int."""
    if value == "Not Applicable":
        return value
    return int(value)


# Convert the non-sentinel FDIC numbers to int before the string cast below
# (presumably to avoid a float-style rendering -- TODO confirm).
df["BankFDICNumber"] = df["BankFDICNumber"].apply(_fdic_as_int)

# Target dtype per column: identifier/code-like columns become strings, the
# approval amount becomes float. BorrName is not listed and keeps its dtype.
dtypes_toset = {
    "BorrZip": "str",
    "BankZip": "str",
    "BankFDICNumber": "str",
    "NaicsCode": "str",
    "FranchiseCode": "str",
    "BusinessAge": "str",
    "LoanStatus": "str",
    "SBAGuaranteedApproval": float,
}
df = df.astype(dtypes_toset)
                

In [9]:
# Number of distinct values in each column whose target dtype is 'str'.
str_typed_cols = [col for col, target in dtypes_toset.items() if target == "str"]
unique_by_col = {col: df[col].nunique() for col in str_typed_cols}
df_catvars = (
    pd.DataFrame.from_dict(unique_by_col, orient="index")
    .reset_index()
    .set_axis(["Attribute", "Unique_Values"], axis=1)
)
df_catvars

Unnamed: 0,Attribute,Unique_Values
0,BorrZip,9057
1,BankZip,1169
2,BankFDICNumber,1116
3,NaicsCode,860
4,FranchiseCode,979
5,BusinessAge,6
6,LoanStatus,2


In [10]:
# Sum of distinct values over every listed attribute except LoanStatus.
not_loan_status = df_catvars["Attribute"] != "LoanStatus"
df_catvars.loc[not_loan_status, "Unique_Values"].sum()

13187

In [11]:
# Row count per LoanStatus value.
df["LoanStatus"].value_counts()

LoanStatus
PIF       21907
CHGOFF     1054
Name: count, dtype: int64

In [12]:
# CHGOFF count divided by PIF count (not by the total row count), to 3 decimals.
status_counts = df.LoanStatus.value_counts()
(status_counts["CHGOFF"] / status_counts["PIF"]).round(3)

0.048

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# - random_state pins the shuffle so the split (and the train/test files written
#   from it) is identical on every Restart & Run All.
# - stratify keeps the CHGOFF/PIF mix the same in both partitions; CHGOFF is
#   the rare class, so an unstratified draw can skew its share in the small test set.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["LoanStatus"],
)

In [14]:
# Output locations for the two partitions.
fptrain = "../../../data/sba_7a_loans_train.parquet"
fptest = "../../../data/sba_7a_loans_test.parquet"

# Write train first, then test, without the pandas index.
for partition, destination in ((train, fptrain), (test, fptest)):
    partition.to_parquet(destination, index=False)

In [15]:
from kmds.ontology.kmds_ontology import *
from kmds.tagging.tag_types import ExploratoryTags

# Workflow object that the exploratory observations are attached to later on.
workflow_name = "sba_7a_loan_chargeoff_modelling"
kaw = KnowledgeExtractionExperimentationWorkflow(workflow_name, namespace=onto)

In [16]:
# Accumulator for the observations, the running sequence number (starts at 1),
# and the first observation object.
exp_obs_list = list()
observation_count: int = 1
e1 = ExploratoryObservation(namespace=onto)

In [17]:
# Observation 1: which raw attributes are carried into modelling.
finding_template = "Only {lst} attributes from the raw data file are used in modeling"
e1.finding = finding_template.format(lst=subset_cols)
e1.finding_sequence = observation_count
e1.exploratory_observation_type = ExploratoryTags.RELEVANCE_OBSERVATION.value
exp_obs_list.append(e1)

In [18]:
# Next observation: the attributes that were found to contain nulls.
observation_count += 1
attrs_with_nulls = df_missing_vals["Attribute"].values.tolist()
e2 = ExploratoryObservation(namespace=onto)
e2.finding = "Attributes {lst} have missing values".format(lst=attrs_with_nulls)
e2.finding_sequence = observation_count
e2.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
exp_obs_list.append(e2)

In [19]:
# Next observation: how the missing values were handled.
observation_count += 1
imputation_note = "Attributes with missing values are replaced with the string Not Applicable"
e3 = ExploratoryObservation(namespace=onto)
e3.finding = imputation_note
e3.finding_sequence = observation_count
e3.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
exp_obs_list.append(e3)

In [20]:
# Next observation: total cardinality of the string-typed attributes.
observation_count += 1
# Derive the figure from df_catvars (LoanStatus excluded) instead of pasting in
# a number copied from an earlier output, so it cannot go stale when the data changes.
cardinality_total = int(
    df_catvars.loc[df_catvars["Attribute"] != "LoanStatus", "Unique_Values"].sum()
)
e4 = ExploratoryObservation(namespace=onto)
e4.finding = (
    "Cardinality of the categorical attribute space is very large, {cardval}, so need to use some kind of "
    "dimensionality reduction. Cannot one hot encode this size"
).format(cardval=cardinality_total)
e4.finding_sequence = observation_count
e4.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
exp_obs_list.append(e4)

In [21]:
# Next observation: class imbalance of LoanStatus.
observation_count += 1
# Compute the figure from the data rather than hard-coding a value copied from
# an earlier output.
# NOTE(review): this is the CHGOFF count relative to the PIF count, not relative
# to all loans -- confirm that is the measure meant by "less than five percent".
status_counts = df["LoanStatus"].value_counts()
chgoff_to_pif = round(float(status_counts["CHGOFF"] / status_counts["PIF"]), 3)
# This rebinds e4; the previous e4 object stays referenced from exp_obs_list.
e4 = ExploratoryObservation(namespace=onto)
e4.finding = "The class attribute is imbalanced, charge offs are less than five percent ({val})".format(val=chgoff_to_pif)
e4.finding_sequence = observation_count
e4.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
exp_obs_list.append(e4)

In [22]:
# Next observation: size of the evaluation hold-out.
observation_count += 1
e5 = ExploratoryObservation(namespace=onto)
# The held-out fraction is 0.2, i.e. 20 percent. The message previously rendered
# the raw fraction as "0.2 percent", which understated it by a factor of 100.
e5.finding = "{val:.0f} percent of the data is kept for model evaluation, the rest is used for model building.".format(val=0.2 * 100)
e5.finding_sequence = observation_count
e5.exploratory_observation_type = ExploratoryTags.RELEVANCE_OBSERVATION.value
exp_obs_list.append(e5)

In [23]:
# Attach every collected observation to the workflow object.
kaw.has_exploratory_observations = exp_obs_list
# NOTE(review): wildcard import placed mid-notebook; which names it adds or
# rebinds is not visible from here, so the statement order is left untouched.
from owlready2 import *
# File the ontology is written to (relative to the working directory).
KNOWLEDGE_BASE = "sba_7a_loan_chargeoff_modelling.xml"
# Save `onto` to that file, requesting the "rdfxml" format.
onto.save(file=KNOWLEDGE_BASE, format="rdfxml")