In [1]:
import pandas as pd
# Paths to the train / test parquet files, relative to this notebook's directory.
fptrain = "../../../data/sba_7a_loans_train.parquet"
fptest = "../../../data/sba_7a_loans_test.parquet"
# Only the training split is read here; fptest is read in a later cell.
df_train = pd.read_parquet(fptrain)

In [2]:
# Display the training frame (the notebook truncates the rendering to the first/last rows).
df_train

Unnamed: 0,BorrName,BankFDICNumber,BankZip,BorrZip,NaicsCode,FranchiseCode,BusinessAge,LoanStatus,SBAGuaranteedApproval
0,Brothers Freight Management L,Not Applicable,87109,14580,484121.0,Not Applicable,Change of Ownership,PIF,3525000.0
1,EASY SPACE STORAGE LLC,58665,28403,65401,531130.0,Not Applicable,Change of Ownership,PIF,654750.0
2,H&W Endeavors Inc.,6560,43215,77493,449121.0,S0659,"Startup, Loan Funds will Open Business",PIF,150000.0
3,Imagine Technology Group LLC,4767,80202,85226,423420.0,Not Applicable,Existing or more than 2 years old,PIF,3052500.0
4,Zorn Fruherziehung LLC,33555,33880,2301,624410.0,Not Applicable,"Startup, Loan Funds will Open Business",PIF,187500.0
...,...,...,...,...,...,...,...,...,...
18363,ABC Wonderful LLC,23086,91801,90630,721110.0,Not Applicable,Existing or more than 2 years old,PIF,1800000.0
18364,Get After It LLC,13421,68506,80525,713940.0,S0024,Existing or more than 2 years old,PIF,57375.0
18365,BARGAINS &amp; BUNDLES LLC,18409,19808,8753,561990.0,Not Applicable,Existing or more than 2 years old,PIF,37500.0
18366,T&Z Logistics Inc,58158,60062,60008,484121.0,Not Applicable,Existing or more than 2 years old,PIF,330300.0


In [3]:
# Binary target: 0 when LoanStatus is 'PIF', 1 for every other status.
y_label_train = df_train["LoanStatus"].ne("PIF").astype("int64")

In [4]:
# Summary statistics of the SBAGuaranteedApproval column.
df_train["SBAGuaranteedApproval"].describe()

count    1.836800e+04
mean     4.099987e+05
std      7.072141e+05
min      8.000000e+02
25%      2.754750e+04
50%      1.275000e+05
75%      4.302225e+05
max      4.500000e+06
Name: SBAGuaranteedApproval, dtype: float64

In [5]:
# Inspect the encoded labels (0 = 'PIF', 1 = any other LoanStatus).
y_label_train

0        0
1        0
2        0
3        0
4        0
        ..
18363    0
18364    0
18365    0
18366    0
18367    0
Name: LoanStatus, Length: 18368, dtype: int64

In [6]:
# Predictor columns = every column of df_train except the two listed below.
# LoanStatus is the source of the label; SBAGuaranteedApproval is also left out.
# list.remove() raises ValueError if either name is missing from the frame.
_excluded_cols = ("LoanStatus", "SBAGuaranteedApproval")
pred_cols = df_train.columns.tolist()
for _col in _excluded_cols:
    pred_cols.remove(_col)

In [7]:
# Show the predictor column names.
pred_cols

['BorrName',
 'BankFDICNumber',
 'BankZip',
 'BorrZip',
 'NaicsCode',
 'FranchiseCode',
 'BusinessAge']

In [8]:
# Raw LoanStatus values, for comparison with the encoded labels.
df_train["LoanStatus"]

0        PIF
1        PIF
2        PIF
3        PIF
4        PIF
        ... 
18363    PIF
18364    PIF
18365    PIF
18366    PIF
18367    PIF
Name: LoanStatus, Length: 18368, dtype: object

In [9]:
# Training predictors: keep only pred_cols and cast every one of them to string,
# since the rows are later fed to a string-token feature hasher.
df_pred_train = df_train[pred_cols].astype("str")

# Test split: load, encode the label the same way as for training
# (0 = 'PIF', 1 = anything else), and build the matching string predictors.
df_test = pd.read_parquet(fptest)
y_label_test = df_test["LoanStatus"].ne("PIF").astype("int64")
df_pred_test = df_test[pred_cols].astype("str")

### Note: see the [scikit-learn `FeatureHasher` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html) for how to set up the input for the feature hasher.

In [10]:
# One list of string tokens per training row, in pred_cols order; this
# list-of-lists is what FeatureHasher(input_type="string") consumes.
# to_numpy().tolist() produces the same nested list as looping over
# iterrows(), without building a Series object for every row.
feat_list_train = df_pred_train[pred_cols].to_numpy().tolist()
    

In [11]:
# One list of string tokens per test row, in pred_cols order; this
# list-of-lists is what FeatureHasher(input_type="string") consumes.
# to_numpy().tolist() produces the same nested list as looping over
# iterrows(), without building a Series object for every row.
feat_list_test = df_pred_test[pred_cols].to_numpy().tolist()
    

In [12]:
# The string-typed predictor frames are no longer needed: their contents now
# live in feat_list_train / feat_list_test.
del df_pred_train, df_pred_test

In [13]:
from sklearn.feature_extraction import FeatureHasher
# Hash each row's list of string tokens into a sparse vector with 1024 columns.
# NOTE(review): the tokens are raw cell values with no column prefix, so the same
# string occurring in different columns (e.g. "Not Applicable" appears in both
# BankFDICNumber and FranchiseCode) maps to the same hashed feature. Consider
# "column=value" tokens if columns should stay distinguishable.
h = FeatureHasher(n_features=1024, input_type="string")
# transform() is used without fit(): per the scikit-learn docs FeatureHasher is
# stateless, so train and test share the same hashing by construction.
train_feature_mat = h.transform(feat_list_train)
test_feature_mat = h.transform(feat_list_test)

In [14]:
# Wrap the hashed sparse matrices as pandas DataFrames backed by sparse columns.
df_feat_train = pd.DataFrame.sparse.from_spmatrix(train_feature_mat)

df_feat_test = pd.DataFrame.sparse.from_spmatrix(test_feature_mat)


In [15]:
# Check the container type returned by the hasher (a SciPy CSR sparse matrix).
type(test_feature_mat)

scipy.sparse._csr.csr_matrix

In [16]:
import scipy as sp
# Import the submodule explicitly so that `sp.sparse` does not depend on some
# other library having already loaded scipy.sparse in this kernel.
import scipy.sparse

# Persist the hashed feature matrices in SciPy's sparse .npz format.
fp_feat_train = "../../../data/sba_7a_loans_train_feat.npz"
fp_feat_test =  "../../../data/sba_7a_loans_test_feat.npz"
sp.sparse.save_npz(fp_feat_train, train_feature_mat)
sp.sparse.save_npz(fp_feat_test, test_feature_mat)


In [17]:
# Write the training labels to parquet as a one-column frame (column name is
# taken from the Series); the row index is not written.
fp_label_train = "../../../data/sba_7a_loans_train_labels.parquet"
df_label_train = y_label_train.to_frame()
df_label_train.to_parquet(fp_label_train, index=False)

In [18]:
# Write the test labels to parquet as a one-column frame (column name is
# taken from the Series); the row index is not written.
fp_label_test = "../../../data/sba_7a_loans_test_labels.parquet"
df_label_test = y_label_test.to_frame()
df_label_test.to_parquet(fp_label_test, index=False)

In [19]:

# NOTE(review): the wildcard imports below hide where load_kb, Workflow and
# DataRepresentationObservation (used in the following cells) are defined —
# presumably one of these modules; consider explicit imports.
from kmds.tagging.tag_types import DataRepresentationTags
from owlready2 import *
from kmds.utils.load_utils import *
from kmds.utils.path_utils import *
# File passed to load_kb() and later overwritten by onto2.save().
KNOWLEDGE_BASE = "sba_7a_loan_chargeoff_modelling.xml"

In [20]:
# Load the knowledge base from the KNOWLEDGE_BASE file.
onto2 = load_kb(KNOWLEDGE_BASE)

# Collect the Workflow instances while onto2 is the active context.
with onto2:
    insts = Workflow.instances()

# Take the first workflow found.
# NOTE(review): presumably the knowledge base already holds exactly one
# workflow; this indexing fails if there are none — confirm.
the_workflow_instance = insts[0]

In [21]:
# Data-representation observations to record in the knowledge base.
# observation_count numbers the findings in the order they are added.
dr_obs_list = []
observation_count = 1

dr1 = DataRepresentationObservation(namespace=onto2)
# Adjacent literals are used instead of a backslash continuation inside the
# string: the continuation glued "notebook" and "for" together, and the
# original text also repeated the word "is".
dr1.finding = (
    "Feature Hashing is used for dimensionality reduction. "
    "See the link in the data representation notebook "
    "for information about setting up the input for feature hashing"
)
dr1.finding_sequence = observation_count
dr1.data_representation_observation_type = DataRepresentationTags.FEATURE_ENGG_OBSERVATION.value
dr_obs_list.append(dr1)

observation_count += 1
dr2 = DataRepresentationObservation(namespace=onto2)
dr2.finding = "The feature dimension used for this exercise is 1024, this is a hyper-parameter."
dr2.finding_sequence = observation_count
dr2.data_representation_observation_type = DataRepresentationTags.FEATURE_ENGG_OBSERVATION.value
dr_obs_list.append(dr2)




In [22]:
# Attach the observations to the workflow, then write the knowledge base back
# to the same file it was loaded from (RDF/XML), overwriting it.
the_workflow_instance.has_data_representation_observations = dr_obs_list
onto2.save(file=KNOWLEDGE_BASE, format="rdfxml")