Final general cleanup of PPP loans under $150k from all US states and territories

In [None]:
#import dependencies
import pandas as pd
import numpy as np
from datetime import date, datetime
import os, sys 
import io
import ee


In [None]:
# Have pandas warn whenever a chained assignment may be writing to a copy instead of the original dataframe.
pd.options.mode.chained_assignment = 'warn'

In [None]:
# Widen the display limits so dataframes are shown without being truncated.
display_settings = {
    'display.max_rows': 150,
    'display.max_columns': 25,
    'display.width': 1000,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# !pip install -U -q PyDrive

In [None]:
# Mount Google Drive at /content/drive so the input file can be read from it.
# NOTE(review): google.colab is presumably only available inside a Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# command shows file path
# ! ls

In [None]:
from google.colab import files

In [None]:
# Absolute path of the input CSV on the mounted Google Drive.
path = "/content/drive/My Drive/data/quasi2.csv"




In [None]:
# Read the CSV into a dataframe.
# Zip is forced to text: left to type inference, the column comes back with mixed
# types and numeric-looking ZIP codes lose their leading zeros (e.g. "01701" -> 1701).
df = pd.read_csv(path, dtype={'Zip': str})


  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Preview the first two rows of the loaded data.
df.head(2)

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Non Profit,Jobs Saved,Date,Bank
0,149961.0,KANSAS CITY,MO,64108,541990.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,13.0,04/13/2020,Country Club Bank
1,149927.67,O FALLON,MO,63366,722511.0,Subchapter S Corporation,Unanswered,Unanswered,Unanswered,,,04/07/2020,First State Bank of St. Charles


In [None]:
# (rows, columns) of the loaded dataframe
df.shape


(4224119, 13)

In [None]:
# Check the dtype of every column and the overall memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4224119 entries, 0 to 4224118
Data columns (total 13 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Loan Amount    float64
 1   City           object 
 2   State          object 
 3   Zip            object 
 4   NAICS Code     float64
 5   Business Type  object 
 6   Race           object 
 7   Gender         object 
 8   Veteran        object 
 9   Non Profit     object 
 10  Jobs Saved     object 
 11  Date           object 
 12  Bank           object 
dtypes: float64(2), object(11)
memory usage: 419.0+ MB


In [None]:
# Count the missing values in each column.
print(df.isna().sum())

Loan Amount            0
City                 248
State                871
Zip                  208
NAICS Code        126705
Business Type       3330
Race                   3
Gender                 0
Veteran                0
Non Profit       4083421
Jobs Saved        285014
Date                  81
Bank                   0
dtype: int64


check jobs saved column


In [None]:
# PPP loans were only given to applicants with "employees", but a self-employed
# person counts as one employee for the purpose of the loan. Best guess: a loan of
# 25k or less went to a very small business, so a MISSING job count is taken to be 1.
# Only missing values are filled; job counts that were actually reported are kept.
# Not sure what to assume for missing values on larger loans.
needs_default_jobs = (df['Loan Amount'] <= 25000.00) & df['Jobs Saved'].isna()
df.loc[needs_default_jobs, "Jobs Saved"] = "1"
# Re-count the nulls to see how many Jobs Saved values are still missing.
print(df.isna().sum())

Loan Amount            0
City                 248
State                871
Zip                  208
NAICS Code        126705
Business Type       3330
Race                   3
Gender                 0
Veteran                0
Non Profit       4083421
Jobs Saved        113631
Date                  81
Bank                   0
dtype: int64


In [None]:
# Shape check: filling values must not change the number of rows or columns.
df.shape

(4224119, 13)

In [None]:
# Fill the remaining missing Jobs Saved values with the column's most frequent value
# (no rows are dropped here).
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# df['Jobs Saved'] is a chained in-place operation that newer pandas versions warn
# about and that has no effect on df under copy-on-write.
most_common_jobs = df['Jobs Saved'].mode().iloc[0]
df['Jobs Saved'] = df['Jobs Saved'].fillna(most_common_jobs)

double check that jobs saved is not missing values


In [None]:
# Rows whose Jobs Saved value is not an empty string, then the non-null count per column.
has_jobs_value = df['Jobs Saved'] != ''
jcount = df.loc[has_jobs_value]
jcount.count()




Loan Amount      4224119
City             4223871
State            4223248
Zip              4223911
NAICS Code       4097414
Business Type    4220789
Race             4224116
Gender           4224119
Veteran          4224119
Non Profit        140698
Jobs Saved       4224119
Date             4224038
Bank             4224119
dtype: int64

In [None]:
# Missing values per column; Jobs Saved is expected to be 0 at this point.
print(df.isna().sum())

Loan Amount            0
City                 248
State                871
Zip                  208
NAICS Code        126705
Business Type       3330
Race                   3
Gender                 0
Veteran                0
Non Profit       4083421
Jobs Saved             0
Date                  81
Bank                   0
dtype: int64


Check the Non Profit column

In [None]:
# Frequency of each value present in the Non Profit column.
df['Non Profit'].value_counts()

Y              139179
Unanswered       1198
Non-Veteran       310
Veteran            11
Name: Non Profit, dtype: int64

In [None]:
# Select the rows flagged "Y" in Non Profit and display one of them at random.
is_non_profit = df['Non Profit'].eq('Y')
np_df = df.loc[is_non_profit]
np_df.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Non Profit,Jobs Saved,Date,Bank
2763411,38491.0,DUNDALK,MD,21222,813110.0,Non-Profit Organization,Unanswered,Unanswered,Unanswered,Y,13,04/09/2020,Howard Bank


Split the non-profit "Y" rows off into a separate CSV to explore later as a SQL table, then drop the column. Only a small percentage of all rows have a value in this column, so it does not carry enough information to keep in the main dataset.

In [None]:
df.to_csv('non_profit.csv', index=0)
!cp non_profit.csv "drive/My Drive/data"

In [None]:
# Drop the Non Profit column.
# The column is named via the `columns` keyword: passing the axis positionally
# (drop('Non Profit', 1)) is no longer accepted by pandas 2.x.
df = df.drop(columns='Non Profit')


Dropping null values from rows in several columns


In [None]:
# Remove every row that is missing a value in any of the columns listed below.
# Another option would be to fill Jobs Saved with 1, since a missing or zero value likely came from sole proprietors.
required_columns = ['Zip', 'Race', 'State', 'City', 'Date', 'Jobs Saved', 'NAICS Code', 'Business Type']
df = df.dropna(subset=required_columns)
print(df.isna().sum())



Loan Amount      0
City             0
State            0
Zip              0
NAICS Code       0
Business Type    0
Race             0
Gender           0
Veteran          0
Jobs Saved       0
Date             0
Bank             0
dtype: int64


In [None]:
# Fraction of rows lost by dropping nulls (about 3% - not bad).
rows_before, rows_after = 4224119, 4093662
dif = rows_before - rows_after
print(dif / rows_before)

0.030883836369193198


In [None]:
# Shape after dropping rows with nulls: about 3% of the rows were lost.
df.shape

(4093662, 12)

NAICS Codes

In [None]:
# Frequency of each NAICS code (still stored as floats at this point).
df['NAICS Code'].value_counts()

722511.0    136768
531210.0    111306
541110.0    108156
621210.0     89360
621111.0     88102
             ...  
48009.0          1
48034.0          1
48043.0          1
48073.0          1
623.0            1
Name: NAICS Code, Length: 1745, dtype: int64

In [None]:
# NAICS codes are identifiers, not quantities: cast the float column to text.
naics_text = df['NAICS Code'].astype(str)
df['NAICS Code'] = naics_text

In [None]:
# Remove only the literal trailing ".0" left over from the float representation.
# str.rstrip('.0') strips a *set* of characters, so it also removed genuine
# trailing zeros of the code itself (e.g. "541990.0" became "54199").
df['NAICS Code'] = df['NAICS Code'].str.replace(r'\.0$', '', regex=True)

In [None]:
# Build a NAICS Category column from the first two characters of the NAICS code;
# the category can later be matched to named values.
naics_prefix = df['NAICS Code'].str.slice(0, 2)
df['NAICS Category'] = naics_prefix
df['NAICS Category'].dtype

dtype('O')

In [None]:
# Spot-check one random row, including the new NAICS Category column.
df.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category
1542390,6585.0,PRAIRIE DU CHIEN,WI,53821,53119,Sole Proprietorship,Unanswered,Unanswered,Unanswered,1,04/28/2020,Peoples State Bank,53


In [None]:
# Shape check after adding the NAICS Category column.
df.shape

(4093662, 13)

Date column

In [None]:
# Convert Date to datetime; values that cannot be parsed are coerced to NaT rather than raising an error.
df['Date'] = df['Date'].pipe(pd.to_datetime, errors='coerce')

In [None]:
# Shape check after the date conversion.
df.shape

(4093662, 13)

In [None]:
#check for stray values
# df.Date.value_counts(ascending=True)

In [None]:
# Verify the date conversion by inspecting the dtype of the Date column.
df['Date'].dtype

dtype('<M8[ns]')

In [None]:
# Spot-check three random rows.
df.sample(3)

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category
1181348,5000.0,AIKEN,SC,29803,453998,Sole Proprietorship,Unanswered,Male Owned,Non-Veteran,1,2020-06-30,Customers Bank,45
3917145,28550.0,IRVINE,CA,92604,62121,Corporation,Unanswered,Unanswered,Unanswered,5,2020-05-03,Bank of America,62
3412165,27800.0,RALEIGH,NC,27603,56173,Corporation,Unanswered,Unanswered,Unanswered,2,2020-05-01,First-Citizens Bank & Trust Company,56


State column


In [None]:
# Number of distinct values in the State column.
df['State'].nunique()

273

In [None]:
# Frequency of each raw State value.
df['State'].value_counts()

CA                     470710
FL                     341051
TX                     328235
NY                     261809
IL                     167429
                        ...  
 NY 11552"                  1
 LA."                       1
 AND NEW YORK CITY"         1
 LOUIS"                     1
 MA 02360"                  1
Name: State, Length: 273, dtype: int64

In [None]:
# Trim space characters from both ends of each State value.
df['State'] = df['State'].str.strip(' ')

In [None]:
# Frequency of each State value after stripping spaces.
df['State'].value_counts()

CA                     470710
FL                     341051
TX                     328235
NY                     261809
IL                     167429
                        ...  
LA."                        1
AVE"                        1
30669 U.S HWY 19N."         1
NY 10977"                   1
AE                          1
Name: State, Length: 260, dtype: int64

In [None]:
# df['State'] = df.loc[df['State']].str.rstrip('"', )
# #remove the stray "

In [None]:
# Keep only the first two characters of each State value.
# Uses the vectorised .str accessor instead of a Python-level loop over every row;
# a missing value stays missing instead of raising a TypeError.
df['State'] = df['State'].str[:2]


In [None]:
# Keep only the rows whose State value occurs at least 7 times in the data.
state_counts = df['State'].value_counts()
common_states = state_counts[state_counts >= 7].index
df = df[df['State'].isin(common_states)].copy()

In [None]:
# Frequency of each State value that remains after the filter.
df['State'].value_counts()

CA    470781
FL    341087
TX    328286
NY    261850
IL    167444
GA    136190
PA    136094
NJ    116632
OH    112418
NC    103574
MI     97110
MA     92441
VA     91757
CO     88693
MN     83186
WA     82909
TN     80744
MO     78890
WI     71637
AZ     67140
MD     67078
IN     65879
LA     63298
OK     57110
AL     57072
SC     54081
IA     52229
OR     52056
CT     50809
KS     45585
UT     42622
MS     41673
KY     41130
NE     38139
AR     37931
NV     33882
PR     31622
ID     26316
ME     23993
HI     21189
MT     20768
SD     20490
NH     19682
NM     18579
ND     17491
WV     14895
RI     14467
WY     11650
VT     10229
DE      9991
AK      9412
DC      9199
GU      1867
VI      1686
MP       417
AS       204
XX        16
TE        11
ST         7
Name: State, dtype: int64

In [None]:
# Shape check after filtering the State column.
df.shape

(4093618, 13)

In [None]:
# df['State'] = df['State'].str.strip('.', )

In [None]:
# df.shape

In [None]:
# df['States'] = df['States'].str.lstrip(' ', )

In [None]:
# Number of distinct State values remaining.
df['State'].nunique()

59

In [None]:
# Frequency of each two-character NAICS category.
df['NAICS Category'].value_counts()

54    553820
81    491278
62    422097
23    380205
72    308883
44    261611
53    227682
56    208301
48    159261
52    154859
45    137492
42    129498
11    129384
71    108352
33     86468
99     83900
61     67447
51     57559
32     38651
31     35583
21     16388
92     11333
49      9824
55      7236
22      6137
78        28
10        22
20        17
91        14
77        14
85        14
60        13
80        13
94        12
95        11
90        10
68        10
30         9
76         9
75         9
93         8
98         8
70         8
82         8
96         8
28         7
73         7
89         6
14         6
34         5
24         5
13         5
64         5
74         5
26         5
97         5
18         4
37         4
12         4
27         4
79         4
19         4
17         4
88         4
84         3
66         3
58         3
63         2
50         2
86         2
29         2
35         2
57         2
39         1
46         1
36         1
65         1

In [None]:
# Keep only the rows whose NAICS Category occurs at least 5 times in the data.
category_sizes = df.groupby('NAICS Category')['NAICS Category'].transform('size')
df = df[category_sizes >= 5].copy()

In [None]:
# Shape check after filtering the NAICS categories.
df.shape

(4093555, 13)

In [None]:
# cleaned file of loans under $150k preserving roughly 4 million rows
df.to_csv('jobs_saved.csv', index=0)
!cp jobs_saved.csv "drive/My Drive/"

In [None]:
# Column names of the dataframe as a plain list.
col = df.columns.tolist()

In [None]:
#convert the strings to integers in jobs saved
# df['Jobs Saved'] = pd.to_numeric(df['Jobs Saved'], errors="coerce")
# df['Jobs Saved'] = df['Jobs Saved'].astype(int)

In [None]:
# Disabled alternative: fill missing Jobs Saved values with 0.
# df['Jobs Saved'].fillna(0, inplace = True)
df.shape

(4093555, 13)

In [None]:
# Dtype of every column in the cleaned dataframe.
df.dtypes

Loan Amount              float64
City                      object
State                     object
Zip                       object
NAICS Code                object
Business Type             object
Race                      object
Gender                    object
Veteran                   object
Jobs Saved                object
Date              datetime64[ns]
Bank                      object
NAICS Category            object
dtype: object

In [None]:
# Spot-check twelve random rows of the cleaned dataframe.
df.sample(12)

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category
2488883,14933.0,GENEVA,FL,32732,236115,Corporation,Unanswered,Unanswered,Unanswered,1.0,2020-05-07,Wells Fargo Bank,23
152993,42778.0,SKOKIE,IL,60076,53119,Self-Employed Individuals,Unanswered,Unanswered,Unanswered,2.0,2020-06-03,Kabbage,53
154323,41666.0,CHICAGO,IL,60657,54141,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,2.0,2020-04-30,Bank of America,54
1777118,20833.0,FRAMINGHAM,MA,1701,54111,Sole Proprietorship,Unanswered,Unanswered,Unanswered,1.0,2020-06-06,Bank of America,54
348982,13177.5,ARLINGTON,VA,22204,44711,Subchapter S Corporation,Unanswered,Unanswered,Unanswered,1.0,2020-04-27,Burke & Herbert Bank & Trust Company,44
3374990,105600.0,DURHAM,NC,27701,339992,Corporation,Unanswered,Unanswered,Unanswered,21.0,2020-05-03,PNC Bank,33
836750,44592.0,AURORA,CO,80015,23821,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,8.0,2020-05-30,U.S. Bank,23
1110131,20800.0,LOGAN,UT,84341,621111,Subchapter S Corporation,Unanswered,Unanswered,Unanswered,1.0,2020-04-27,Bank of Utah,62
1740683,73860.0,SHARON,MA,2067,23622,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,8.0,2020-04-27,Eastern Bank,23
727378,18400.0,MONTGOMERY,AL,36109,812112,Subchapter S Corporation,Unanswered,Unanswered,Unanswered,1.0,2020-05-05,Regions Bank,81


In [None]:
# Shape check before the final summary.
df.shape

(4093555, 13)

In [None]:
# Column dtypes and memory usage of the cleaned dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4093555 entries, 0 to 4224118
Data columns (total 13 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Loan Amount     float64       
 1   City            object        
 2   State           object        
 3   Zip             object        
 4   NAICS Code      object        
 5   Business Type   object        
 6   Race            object        
 7   Gender          object        
 8   Veteran         object        
 9   Jobs Saved      object        
 10  Date            datetime64[ns]
 11  Bank            object        
 12  NAICS Category  object        
dtypes: datetime64[ns](1), float64(1), object(11)
memory usage: 437.2+ MB


In [None]:
df.to_csv('clean_ppp.csv', index=0)
!cp clean_ppp.csv "drive/My Drive/"


In [None]:
# Final preview of the first rows: the data looks pretty good.
df.head()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category
0,149961.0,KANSAS CITY,MO,64108,54199,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,13,2020-04-13,Country Club Bank,54
1,149927.67,O FALLON,MO,63366,722511,Subchapter S Corporation,Unanswered,Unanswered,Unanswered,1,2020-04-07,First State Bank of St. Charles,72
2,149900.0,RAYTOWN,MO,64133,44112,Corporation,White,Male Owned,Unanswered,14,2020-05-11,Blue Ridge Bank and Trust Co.,44
3,149900.0,VALLEY PARK,MO,63088,62441,Corporation,Unanswered,Unanswered,Unanswered,28,2020-04-09,Meramec Valley Bank,62
4,149897.5,SPRINGFIELD,MO,65802,561422,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,1,2020-06-25,Wood & Huston Bank,56


In [None]:
# Final (rows, columns) of the cleaned dataframe.
df.shape

(4093555, 13)