This notebook cleans up the data in a CSV called people_ppp and appends NAICS code information to it. The results are stored in a CSV called 'all_loans_under_150k_max_info.csv'.


 

In [None]:
#import dependencies
import pandas as pd
import numpy as np
from datetime import date, datetime
import os, sys 
import io
import ee
from functools import reduce


In [None]:
# Have pandas warn whenever an assignment targets a copy rather than the
# original dataframe.
pd.options.mode.chained_assignment = 'warn'

In [None]:
# Widen the display limits so dataframes are shown without being truncated.
# set_option accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.max_rows', 150,
    'display.max_columns', 25,
    'display.width', 768,
)

In [None]:
# !pip install -U -q PyDrive

In [None]:
# Mount Google Drive at /content/drive so the CSV paths used below resolve.
# This cell only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# command shows file path
# ! ls

In [None]:
# n_path: NAICS category lookup (columns shown below: id, cat, description, biz_count).
n_path = "/content/drive/My Drive/data/naics-cat.csv"
# nf_path: 2017 NAICS descriptions (columns shown below: Code, Title, Description).
nf_path = "/content/drive/My Drive/data/2017-NAICS-Descriptions.csv"

In [None]:
# Load the NAICS category lookup with its first column as the index. The 'cat'
# code is kept as text, and the placeholder strings below are read as missing.
ndf = pd.read_csv(
    n_path,
    index_col=0,
    dtype={'cat': 'string'},
    na_values=['<NA>', 'no information'],
)
ndf.sample()

Unnamed: 0_level_0,cat,description,biz_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15,53,Real Estate Rental and Leasing,868526


In [None]:
# Read a single row of the NAICS descriptions file just to list its column names.
cols = pd.read_csv(nf_path, nrows=1).columns.tolist()
print(cols)

['Code', 'Title', 'Description']


In [None]:
# nf_df = pd.read_csv( nf_path, dtype={'Code':'string', 'Title':'string', 'Description':'string'}, na_values = ['<NA>', '.'], )
# nf_df.sample()

In [None]:
# Load the NAICS descriptions; 'Code' is read as pandas 'string' dtype so it is
# treated as text rather than a number.
# NOTE(review): several sampled titles in the outputs end with a stray 'T'
# (e.g. 'DistilleriesT') - confirm whether this is a marker from the source file
# that should be stripped before the titles are merged into the loan data.
nf_df = pd.read_csv( nf_path, dtype={'Code':'string',},)
nf_df.sample()

Unnamed: 0,Code,Title,Description
362,31214,DistilleriesT,See industry description for 312140.


In [None]:
# Give NAICS entries without a description an explicit placeholder text.
nf_df = nf_df.fillna({'Description': 'No information'})
nf_df.sample(5)

Unnamed: 0,Code,Title,Description
578,32721,Glass and Glass Product ManufacturingT,This industry comprises establishments primari...
1842,6212,Offices of DentistsT,No information
13,11116,Rice FarmingT,See industry description for 111160.
621,331318,"Other Aluminum Rolling, Drawing, and Extruding",This U.S. industry comprises establishments pr...
186,221114,Solar Electric Power Generation,This U.S. industry comprises establishments pr...


In [None]:
# Name the key column 'NAICS Code' so it matches the loan data's column name.
nf_df = nf_df.rename(columns={'Code': 'NAICS Code'})
nf_df.sample()

Unnamed: 0,NAICS Code,Title,Description
1265,4821,Rail TransportationT,No information


In [None]:
nf_df.dtypes

NAICS Code     string
Title          object
Description    object
dtype: object

In [None]:
# Print every column of nf_df whose values are not all of the same Python type
# as the value in its first row (mixed types point at parsing problems).
for col in nf_df.columns:
    # Series.map is used instead of DataFrame.applymap, which is deprecated
    # since pandas 2.1; it yields the same per-element types.
    value_types = nf_df[col].map(type)
    # Boolean mask of the rows whose type differs from the first row's type.
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)


In [None]:
# Same value already assigned to n_path near the top of the notebook;
# re-declared here right before the category lookup is loaded into n_cats.
n_path = "/content/drive/My Drive/data/naics-cat.csv"

In [None]:
# Load the NAICS category lookup. 'cat' is read as text because it is the join
# key for the string-typed 'NAICS Category' column of the loan data; reading it
# as text up front means the join does not depend on a later cast cell being run.
n_cats = pd.read_csv(n_path, dtype={'cat': str})
n_cats

Unnamed: 0,id,cat,description,biz_count
0,1,11,Agriculture Forestry Fishing and Hunting,381477
1,2,21,Mining,32069
2,3,22,Utilities,46245
3,4,23,Construction,1490099
4,5,31,Manufacturing food related,215000
5,6,32,Manufacturing soft materials,215000
6,7,33,Manufacturing hard materials,215000
7,8,42,Wholesale Trade,697359
8,9,44,Retail Trade,80000
9,10,45,Retail Trade,80000


In [None]:
# Location of the cleaned PPP loan file on the mounted Drive.
csv_path = "/content/drive/My Drive/clean_ppp.csv"

# Read a single row of the file just to list its column names.
cols = pd.read_csv(csv_path, nrows=1).columns.tolist()
print(cols)


['Loan Amount', 'City', 'State', 'Zip', 'NAICS Code', 'Business Type', 'Race', 'Gender', 'Veteran', 'Jobs Saved', 'Date', 'Bank', 'NAICS Category']


In [None]:
# Read the loan CSV into the main dataframe.
# - index_col=False: do not use any file column as the index.
# - Code-like columns are read as text; 'Jobs Saved' is also read as text here
#   and converted to a number in a later cell.
# - 'Date' is parsed by name rather than by position, so the parse does not
#   silently target another column if the file's column order changes.
df = pd.read_csv(
    csv_path,
    index_col=False,
    dtype={'Jobs Saved': str, 'NAICS Code': str, 'Zip': str, 'NAICS Category': str},
    parse_dates=['Date'],
)

In [None]:
df.shape

(4093555, 13)

In [None]:
# Print every column of df whose values are not all of the same Python type as
# the value in its first row (mixed types point at parsing problems).
for col in df.columns:
    # Series.map is used instead of DataFrame.applymap, which is deprecated
    # since pandas 2.1; it yields the same per-element types.
    value_types = df[col].map(type)
    # Boolean mask of the rows whose type differs from the first row's type.
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

Date


In [None]:
df.sample()


Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category
3663573,80843.0,LOS ANGELES,CA,90004,445291,Corporation,Unanswered,Unanswered,Unanswered,1,2020-05-03,Wells Fargo Bank,44


In [None]:
# Convert 'Jobs Saved' to a numeric column; errors='coerce' turns values that
# cannot be parsed as numbers into NaN instead of raising.
df['Jobs Saved'] = pd.to_numeric(df['Jobs Saved'], errors='coerce')

In [None]:
# Make sure 'Date' holds datetimes. The infer_datetime_format argument is
# deprecated since pandas 2.0 and is not needed for the conversion, so it is omitted.
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# df['Date'].value_counts()

In [None]:
# Re-run the type consistency check after the numeric/date conversions: print
# every column of df whose values are not all of the first row's Python type.
for col in df.columns:
    # Series.map is used instead of DataFrame.applymap, which is deprecated
    # since pandas 2.1; it yields the same per-element types.
    value_types = df[col].map(type)
    # Boolean mask of the rows whose type differs from the first row's type.
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

Date


In [None]:
# df.loc[:,'NAICS Code'] = df.loc[:, 'NAICS Code'].astype(str)

In [None]:
# df.loc[:,'Zip'] = df.loc[:, 'Zip'].astype(str)

In [None]:
# Repeats the numeric conversion of 'Jobs Saved' done in an earlier cell;
# unparseable values become NaN (errors='coerce').
df['Jobs Saved'] = pd.to_numeric(df.loc[:,'Jobs Saved'],errors='coerce')

In [None]:
# df.loc[:,'Jobs Saved'] = df.loc[:, 'Jobs Saved'].str.strip('.0')

In [None]:
# df.loc[:,'Jobs Saved'] = df.loc[:, 'Jobs Saved'].astype(int)

In [None]:
# Force the date conversion: values that are not valid dates become NaT
# (errors='coerce') and are removed later together with the other nulls.
# The column is assigned directly instead of through df.loc[:, 'Date'] = ...,
# because on pandas >= 2.0 the .loc form sets values in place and can leave the
# column with its previous (object) dtype instead of datetime64.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [None]:
df.dtypes

Loan Amount              float64
City                      object
State                     object
Zip                       object
NAICS Code                object
Business Type             object
Race                      object
Gender                    object
Veteran                   object
Jobs Saved               float64
Date              datetime64[ns]
Bank                      object
NAICS Category            object
dtype: object

In [None]:
# check for nulls
print(df.isna().sum())

Loan Amount         0
City                0
State               0
Zip                 0
NAICS Code          0
Business Type       0
Race                0
Gender              0
Veteran             0
Jobs Saved         14
Date              467
Bank                0
NAICS Category      0
dtype: int64


In [None]:
# Drop, in place, every row that has a missing value in any column. Per the
# null counts printed above, only 'Jobs Saved' and 'Date' contain nulls here.
df.dropna(inplace=True)
df.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category
3068672,7740.63,HONOLULU,HI,96814,812112,Corporation,Unanswered,Unanswered,Unanswered,1.0,2020-05-26,Celtic Bank Corporation,81


In [None]:
print(df.isna().sum())

Loan Amount       0
City              0
State             0
Zip               0
NAICS Code        0
Business Type     0
Race              0
Gender            0
Veteran           0
Jobs Saved        0
Date              0
Bank              0
NAICS Category    0
dtype: int64


In [None]:
# Verify that data types are consistent: nothing is printed when every column
# of df holds values of a single Python type.
for col in df.columns:
    # Series.map is used instead of DataFrame.applymap, which is deprecated
    # since pandas 2.1; it yields the same per-element types.
    value_types = df[col].map(type)
    # Boolean mask of the rows whose type differs from the first row's type.
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

In [None]:
# weird = (df.applymap(type) != df.iloc[0].apply(type)).any(axis=1)
# df[weird]

In [None]:
# #check for nulls
# print(df.isna().sum())


In [None]:
#verified
df.shape

(4093088, 13)

In [None]:
# Build the category column from the first two characters of the NAICS code.
# The short category can be matched to named values; the longer code was
# impractical for that.
df['NAICS Category'] = df['NAICS Code'].str.slice(stop=2).astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4093088 entries, 0 to 4093554
Data columns (total 13 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Loan Amount     float64       
 1   City            object        
 2   State           object        
 3   Zip             object        
 4   NAICS Code      object        
 5   Business Type   object        
 6   Race            object        
 7   Gender          object        
 8   Veteran         object        
 9   Jobs Saved      float64       
 10  Date            datetime64[ns]
 11  Bank            object        
 12  NAICS Category  object        
dtypes: datetime64[ns](1), float64(2), object(10)
memory usage: 437.2+ MB


In [None]:
#create new column for NAICS code description
#take the first two letters of the NAICS code column and create category column. 
#category column can be matched to named values. The longer code was impractical
# df['NAICS Code Description'] = df['NAICS Code'].astype(str)
# df.info()

In [None]:
# df.sample()

In [None]:
banks=df.loc[:,'Bank'].value_counts()
banks

Bank of America            286970
JPMorgan Chase Bank        197840
Wells Fargo Bank           164726
Celtic Bank Corporation    138170
Cross River Bank           123385
                            ...  
Muskogee FCU                    2
Antioch Community FCU           2
Access CU                       2
Atascosa Bank                   2
Bank of New Cambria             2
Name: Bank, Length: 4805, dtype: int64

In [None]:
banks.sample()

First Federal Bank Littlefield    40
Name: Bank, dtype: int64

In [None]:
# df['Bank'].where(df['Bank'].apply(lambda x:(x.value_counts()))>=500, "other")

In [None]:
# Count loans per bank, then relabel every bank with fewer than `threshold`
# loans as 'miscellaneous' while keeping the names of the larger banks.
# NOTE(review): `other` is only displayed here; it is not written back to df in
# the cells visible in this notebook - confirm whether that is intended.
threshold = 500
cond = df['Bank'].value_counts()
is_frequent_bank = df['Bank'].isin(cond[cond >= threshold].index)
other = df['Bank'].where(is_frequent_bank, 'miscellaneous').to_numpy()
other

array(['Country Club Bank', 'miscellaneous', 'miscellaneous', ...,
       'Bank of America', 'Bank of America', 'Wells Fargo Bank'],
      dtype=object)

In [None]:
# df['Bank'].value_counts()

In [None]:
# isit = df[df['Bank'] == '05/21/2020 ']
# isit

In [None]:
# df.sample()

In [None]:
df.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category
253598,5100.0,ROCKFORD,IL,61114,23821,Corporation,Unanswered,Unanswered,Unanswered,1.0,2020-04-11,Illinois Bank & Trust,23


In [None]:
#  df= df.groupby('NAICS Category').filter(lambda x: len(x) >= 5)
df.dtypes

Loan Amount              float64
City                      object
State                     object
Zip                       object
NAICS Code                object
Business Type             object
Race                      object
Gender                    object
Veteran                   object
Jobs Saved               float64
Date              datetime64[ns]
Bank                      object
NAICS Category            object
dtype: object

In [None]:
# df.dropna(inplace=True)
df.head(1)

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category
0,149961.0,KANSAS CITY,MO,64108,54199,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,13.0,2020-04-13,Country Club Bank,54


In [None]:
df.shape

(4093088, 13)

In [None]:
# Store 'Jobs Saved' as whole numbers; this relies on the rows with missing
# values having been dropped earlier.
df['Jobs Saved'] = df['Jobs Saved'].astype(int)

In [None]:
nf_df.sample(5)


Unnamed: 0,NAICS Code,Title,Description
708,333132,Oil and Gas Field Machinery and Equipment Manu...,This U.S. industry comprises establishments pr...
1240,453998,All Other Miscellaneous Store Retailers (excep...,This U.S. industry comprises establishments pr...
299,311340,Nonchocolate Confectionery Manufacturing,This industry comprises establishments primari...
1859,621410,Family Planning Centers,This industry comprises establishments with me...
593,3274,Lime and Gypsum Product ManufacturingT,This industry group comprises establishments p...


In [None]:
# Attach the NAICS title and description to every loan. The left join keeps all
# loans; codes without a match in nf_df get NaN in 'Title' and 'Description'.
mdf = pd.merge(left=df, right=nf_df, how='left', on='NAICS Code')
# A left join against a lookup table must not change the number of loans;
# duplicate codes in nf_df would otherwise silently duplicate loan rows.
assert len(mdf) == len(df), "merge with nf_df changed the number of loan rows"
mdf.sample(2)

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,Title,Description
227839,10238.34,PEORIA,IL,61614,81311,Non-Profit Organization,Unanswered,Unanswered,Unanswered,1,2020-04-28,Citizens Equity First CU,81,Religious Organizations,See industry description for 813110.
3468697,10400.0,SEVIERVILLE,TN,37876,445299,Corporation,Unanswered,Unanswered,Unanswered,1,2020-05-12,Citizens National Bank,44,All Other Specialty Food Stores,This U.S. industry comprises establishments pr...


In [None]:
mdf.shape

(4093088, 15)

In [None]:
# join n_path and df make merged df
# df = pd.merge(left=df, right=ndf, left_on='NAICS Category', right_on='cat', how='outer')
# df.sample(3)







In [None]:
# mdf=reduce(lambda x,y: pd.merge(x,y, on='NAICS Code', how='inner'), [df,  nf_df])


In [None]:
# Make the category code text so it can be matched against 'NAICS Category'.
n_cats = n_cats.astype({'cat': str})

In [None]:
n_cats.sample()

Unnamed: 0,id,cat,description,biz_count
11,12,49,Transportation and Warehousing,290000


In [None]:
mdf.sample(2)

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,Title,Description
469918,121600.0,FORT PIERRE,SD,57532,44112,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,7,2020-04-06,BankWest,44,Used Car Dealers,See industry description for 441120.
1854577,48000.0,NEW YORK,NY,10018,541211,Limited Liability Company(LLC),Asian,Male Owned,Non-Veteran,3,2020-04-14,Citibank,54,Offices of Certified Public Accountants,This U.S. industry comprises establishments of...


In [None]:
# mdf.dtypes


In [None]:
# mdf=mdf.drop(['cat',], axis=1).copy()

In [None]:
mdf.dtypes

Loan Amount              float64
City                      object
State                     object
Zip                       object
NAICS Code                object
Business Type             object
Race                      object
Gender                    object
Veteran                   object
Jobs Saved                 int64
Date              datetime64[ns]
Bank                      object
NAICS Category            object
Title                     object
Description               object
dtype: object

In [None]:
mdf.shape


(4093088, 15)

In [None]:
print(mdf.isna().sum())

Loan Amount           0
City                  0
State                 0
Zip                   0
NAICS Code            0
Business Type         0
Race                  0
Gender                0
Veteran               0
Jobs Saved            0
Date                  0
Bank                  0
NAICS Category        0
Title             91272
Description       91272
dtype: int64


In [None]:
# Give loans whose NAICS code had no match an explicit placeholder. Only the two
# text columns added by the merge are filled, so a string can never be written
# into a numeric or date column that happens to contain a missing value.
# NOTE(review): the descriptions file was filled earlier with 'No information'
# (lower-case i), so 'Description' can hold both spellings - confirm which one
# downstream consumers expect.
mdf = mdf.fillna({'Title': 'No Information', 'Description': 'No Information'})

In [None]:
print(mdf.isna().sum())

Loan Amount       0
City              0
State             0
Zip               0
NAICS Code        0
Business Type     0
Race              0
Gender            0
Veteran           0
Jobs Saved        0
Date              0
Bank              0
NAICS Category    0
Title             0
Description       0
dtype: int64


In [None]:
mdf.shape


(4093088, 15)

In [None]:
mdf.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,Title,Description
3919414,14995.0,ANYELOPE,CA,95843,541618,Subchapter S Corporation,Unanswered,Unanswered,Unanswered,1,2020-05-09,Bank of the West,54,Other Management Consulting Services,This U.S. industry comprises establishments pr...


In [None]:
# Attach the category-level lookup columns to every loan. The left join keeps
# all loans; categories without a match in n_cats get NaN in the lookup columns.
m_cats = pd.merge(left=mdf, right=n_cats, how='left', left_on='NAICS Category', right_on='cat')
# A left join against a lookup table must not change the number of loans;
# duplicate 'cat' values in n_cats would otherwise silently duplicate loan rows.
assert len(m_cats) == len(mdf), "merge with n_cats changed the number of loan rows"
m_cats.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,Title,Description,id,cat,description,biz_count
3067099,10000.0,HONOLULU,HI,96813,611519,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,1,2020-05-02,Central Pacific Bank,61,Other Technical and Trade Schools,This U.S. industry comprises establishments pr...,19.0,61,Educational Services,424190.0


In [None]:
m_cats.shape

(4093088, 19)

In [None]:
# Remove the lookup helper columns that are not wanted in the exported file.
columns = ['id', 'biz_count', 'cat']
# errors='ignore' keeps this cell re-runnable: the original in-place drop raised
# a KeyError when the cell was executed a second time.
m_cats = m_cats.drop(columns=columns, errors='ignore')

In [None]:
m_cats.head()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,Title,Description,description
0,149961.0,KANSAS CITY,MO,64108,54199,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,13,2020-04-13,Country Club Bank,54,"All Other Professional, Scientific, and Techni...",See industry description for 541990.,Professional Scientific and Technical Services
1,149927.67,O FALLON,MO,63366,722511,Subchapter S Corporation,Unanswered,Unanswered,Unanswered,1,2020-04-07,First State Bank of St. Charles,72,Full-Service Restaurants,This U.S. industry comprises establishments pr...,Accommodation and Food Services
2,149900.0,RAYTOWN,MO,64133,44112,Corporation,White,Male Owned,Unanswered,14,2020-05-11,Blue Ridge Bank and Trust Co.,44,Used Car Dealers,See industry description for 441120.,Retail Trade
3,149900.0,VALLEY PARK,MO,63088,62441,Corporation,Unanswered,Unanswered,Unanswered,28,2020-04-09,Meramec Valley Bank,62,Child Day Care ServicesT,See industry description for 624410.,Health Care and Social Assistance
4,149897.5,SPRINGFIELD,MO,65802,561422,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,1,2020-06-25,Wood & Huston Bank,56,Telemarketing Bureaus and Other Contact Centers,This U.S. industry comprises establishments pr...,Waste Management and Remediation Services


In [None]:
m_cats.to_csv('all_loans_under_150k_max_info.csv', index=0)
!cp all_loans_under_150k_max_info.csv "drive/My Drive/"

In [None]:
# # backup of clean dataframe, types will not be fully preserved so on import reassign, but easier with unconformity removed
# path="/content/drive/My Drive/p_cat_clean.csv"
# scdf = pd.read_csv(path,  dtype={'NAICS Category': str, 'NAICS Code Description':str,})
# scdf.sample()

In [None]:
# scdf.dtypes

In [None]:
# scdf.shape

In [None]:
#
