This notebook cleans up the data in a CSV called "people_ppp.csv" and appends NAICS code information to it. The results are stored in a CSV called "p_cat_clean.csv".

 

In [None]:
#import dependencies
import pandas as pd
import numpy as np
from datetime import date, datetime
import os, sys 
import io
import ee


In [None]:
## configure pandas via set_option to warn when we are working on a copy instead of the original dataframe
pd.set_option('mode.chained_assignment','warn')

In [None]:
# Raise the display limits so DataFrames are shown without being truncated
pd.options.display.max_rows = 150
pd.options.display.max_columns = 25
pd.options.display.width = 768

In [None]:
# !pip install -U -q PyDrive

In [None]:
# Mount Google Drive at /content/drive so the input CSVs can be read.
# This cell only works inside Google Colab (google.colab is Colab-specific).
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# command shows file path
# ! ls

In [None]:
# NAICS category lookup; it is merged on its 'cat' column further down
n_path = "/content/drive/My Drive/data/naics-cat.csv"
# 2017 NAICS descriptions file (columns: Code, Title, Description)
nf_path = "/content/drive/My Drive/data/2017-NAICS-Descriptions.csv"

In [None]:
# Load the NAICS category lookup: the first column becomes the index, 'cat' is
# kept as a string, and the listed placeholder values are read as missing.
ndf = pd.read_csv(
    n_path,
    index_col=0,
    dtype={'cat': 'string'},
    na_values=['<NA>', 'no information'],
)
ndf.sample()

In [None]:
# Peek at the header of the NAICS descriptions file to list its column names
cols = pd.read_csv(nf_path, nrows=1).columns.tolist()
print(cols)

['Code', 'Title', 'Description']


In [None]:
# nf_df = pd.read_csv( nf_path, dtype={'Code':'string', 'Title':'string', 'Description':'string'}, na_values = ['<NA>', '.'], )
# nf_df.sample()

In [None]:
# Load the NAICS descriptions, keeping 'Code' as a string column
nf_df = pd.read_csv(nf_path, dtype=dict(Code='string'))
nf_df.sample()

Unnamed: 0,Code,Title,Description
2102,813319,Other Social Advocacy Organizations,This U.S. industry comprises establishments pr...


In [None]:
# Replace missing descriptions with a placeholder, then look over a larger sample
nf_df = nf_df.fillna({'Description': 'No information'})
nf_df.sample(50)

Unnamed: 0,Code,Title,Description
72,11233,Turkey ProductionT,See industry description for 112330.
1365,4885,Freight Transportation ArrangementT,No information
793,33461,Manufacturing and Reproducing Magnetic and Opt...,This industry comprises establishments primari...
1428,512250,Record Production and Distribution,This industry comprises establishments primari...
87,112519,Other Aquaculture,This U.S. industry comprises establishments pr...
1955,712,"Museums, Historical Sites, and Similar Institu...","Industries in the Museums, Historical Sites, a..."
419,3162,Footwear ManufacturingT,No information
489,3251,Basic Chemical ManufacturingT,This industry group comprises establishments p...
1949,7114,"Agents and Managers for Artists, Athletes, Ent...",No information
594,32741,Lime ManufacturingT,See industry description for 327410.


In [None]:
# Print the name of every nf_df column that holds more than one Python type
# (for example str mixed with float NaN).
for col in nf_df.columns:
    # Series.map replaces DataFrame.applymap (deprecated since pandas 2.1).
    # Counting distinct types also works on an empty frame, where comparing
    # against .iloc[0] would raise an IndexError.
    if nf_df[col].map(type).nunique() > 1:
        print(col)


In [None]:
nf_df.dtypes

Code           string
Title          object
Description    object
dtype: object

In [None]:
# Input CSV (people_ppp.csv) on the mounted Google Drive
path = "/content/drive/My Drive/people_ppp.csv"




In [None]:
from functools import reduce



In [None]:
# Read the input CSV into a DataFrame; the first column becomes the index and
# 'no info' / '.' are treated as missing values.
# 'NAICS Code' and 'Zip' are identifiers, not quantities, so they are read as
# text: numeric inference would drop leading zeros (e.g. ZIP "02134") and would
# render codes as floats ("541110.0") whenever the column has missing values.
df = pd.read_csv(
    path,
    index_col=0,
    dtype={'NAICS Code': str, 'Zip': str},
    na_values=['no info', '.'],
)

In [None]:
df.shape

In [None]:
# Print the name of every df column that holds more than one Python type
# (for example str mixed with float NaN).
for col in df.columns:
    # Series.map replaces DataFrame.applymap (deprecated since pandas 2.1).
    # Counting distinct types also works on an empty frame, where comparing
    # against .iloc[0] would raise an IndexError.
    if df[col].map(type).nunique() > 1:
        print(col)

In [None]:
df.sample()

In [None]:
# Store NAICS codes as text. Assign with df[...] rather than df.loc[:, ...]:
# a full-column .loc assignment tries to write into the existing column in
# place (pandas >= 2.0), which is not a reliable way to change its dtype.
# NOTE(review): astype(str) turns missing values into the string 'nan', so a
# later dropna() will not see them — confirm that is intended.
df['NAICS Code'] = df['NAICS Code'].astype(str)

In [None]:
# Store ZIP codes as text. Assign with df[...] rather than df.loc[:, ...]:
# a full-column .loc assignment tries to write into the existing column in
# place (pandas >= 2.0), which is not a reliable way to change its dtype.
# NOTE(review): astype(str) turns missing values into the string 'nan', so a
# later dropna() will not see them — confirm that is intended.
df['Zip'] = df['Zip'].astype(str)

In [None]:
# Convert 'Jobs Saved' to integers. Assign with df[...] rather than
# df.loc[:, ...]: a full-column .loc assignment writes into the existing column
# in place (pandas >= 2.0), so a float column would silently stay float.
# NOTE(review): astype(int) raises if the column still contains missing values.
df['Jobs Saved'] = df['Jobs Saved'].astype(int)

In [None]:
# Parse 'Date' as datetimes; errors='coerce' turns values that are not valid
# dates into NaT instead of raising.
# Assign with df[...] rather than df.loc[:, ...] so the column really becomes
# datetime64: a full-column .loc assignment writes into the existing object
# column in place (pandas >= 2.0) and leaves its dtype as object.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [None]:
df.dtypes

In [None]:
#check for nulls
# print(df.isna().sum())

In [None]:
# Drop, in place, every row that has a missing value in any column,
# then spot-check one random row of what is left.
df.dropna(inplace=True)
df.sample()

In [None]:
# print(df.isna().sum())

In [None]:

# After the clean-up, print any df column that still holds more than one
# Python type.
for col in df.columns:
    # Series.map replaces DataFrame.applymap (deprecated since pandas 2.1).
    # Counting distinct types also works on an empty frame, where comparing
    # against .iloc[0] would raise an IndexError.
    if df[col].map(type).nunique() > 1:
        print(col)

In [None]:
# weird = (df.applymap(type) != df.iloc[0].apply(type)).any(axis=1)
# df[weird]

In [None]:
# #check for nulls
# print(df.isna().sum())


In [None]:
#verified
df.shape

In [None]:
# Build the category column from the first two characters of the NAICS code.
# The category can be matched to named values; the longer code was impractical.
df['NAICS Category'] = df['NAICS Code'].str.slice(stop=2).astype(str)
df.info()

In [None]:
#create new column for NAICS code description
#take the first two letters of the NAICS code column and create category column. 
#category column can be matched to named values. The longer code was impractical
# df['NAICS Code Description'] = df['NAICS Code'].astype(str)
# df.info()

In [None]:
# df.sample()

In [None]:
# Number of rows per bank, most frequent first
banks = df['Bank'].value_counts()

In [None]:
banks.sample()

In [None]:
# df['Bank'].where(df['Bank'].apply(lambda x:(x.value_counts()))>=500, "other")

In [None]:
# cond = df['Bank'].value_counts()
# threshold = 500
# df['Bank'] = np.where(df['Bank'].isin(cond.index[cond >= threshold ]), df['Bank'], 'miscellaneous')

In [None]:
# df['Bank'].value_counts()

In [None]:
# df['Bank'] = df[df['Bank'] != '05/21/2020 ']

In [None]:
# df.sample()

In [None]:
# Remove leading tab characters from the bank names
df['Bank']=df['Bank'].str.lstrip('\t')
# NOTE(review): an earlier draft tried to filter out a stray '05/21/2020 ' value in 'Bank'; that filter is not applied here.

In [None]:
df.sample()

In [None]:
df.sample(3)

In [None]:
# df= df.groupby('NAICS Category').filter(lambda x: len(x) >= 5)


In [None]:
# df.dropna(inplace=True)
df.head(1)

In [None]:
df.shape

In [None]:
# Attach the category lookup (ndf) to each row of df by matching
# df's 'NAICS Category' to ndf's 'cat'; rows without a match are dropped.
df = df.merge(ndf, left_on='NAICS Category', right_on='cat')
df.sample(12)







In [None]:
nf_df.sample()


In [None]:
# Give the descriptions table the same key column name as df ('NAICS Code')
nf_df = nf_df.rename(columns={'Code': 'NAICS Code'})
nf_df.sample()

Unnamed: 0,NAICS Code,Title,Description
773,3344,Semiconductor and Other Electronic Component M...,No information


In [None]:
# Outer-join df with the NAICS descriptions on the full NAICS code
mdf = pd.merge(df, nf_df, on='NAICS Code', how='outer')


In [None]:
mdf.sample()


Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS_cat,NAICS Category,cat,description,biz_count,Title,Description
104171,54736.05,FOREST HILLS,NY,11375,62132,Subchapter S Corporation,White,Male Owned,Non-Veteran,13.0,2020-04-27,Savoy Bank,62.0,62,62,Health Care and Social Assistance,1745915.0,Offices of OptometristsT,See industry description for 621320.


In [None]:
# Remove the lookup helper columns and the long description text
unwanted_columns = ['cat', 'biz_count', 'NAICS_cat', 'Description']
mdf = mdf.drop(columns=unwanted_columns).copy()

In [None]:
mdf.sample()

In [None]:
# Show both the shape and the dtypes. A bare `mdf.shape` on a non-final line
# of a cell is evaluated but never displayed, so it is printed explicitly.
print(mdf.shape)
mdf.dtypes

In [None]:
print(mdf.isna().sum())

In [None]:
# Keep only the fully populated rows, then confirm no missing values remain
mdf = mdf.dropna()
print(mdf.isna().sum())

Loan Amount       0
City              0
State             0
Zip               0
NAICS Code        0
Business Type     0
Race              0
Gender            0
Veteran           0
Jobs Saved        0
Date              0
Bank              0
NAICS Category    0
description       0
Title             0
dtype: int64


In [None]:
mdf.sample()


Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,description,Title
73511,77760.0,NEWPORT BEACH,CA,92660,56111,Corporation,White,Female Owned,Non-Veteran,3.0,2020-04-10,Golden Valley Bank,56,Waste Management and Remediation Services,Office Administrative ServicesT


In [None]:
mdf.shape

(243964, 15)

In [None]:
mdf.to_csv('p_cat_clean.csv', index=0)
!cp p_cat_clean.csv "drive/My Drive/"

In [None]:
def to_csv(mdf, path):
    """Write ``mdf`` to ``path`` as CSV with its dtypes stored in the first data row.

    The file layout is: header line, one line holding each column's dtype
    name, then the data rows. The index is not written. ``read_csv`` in this
    notebook reads this layout back.

    Only ``path`` is written. The previous version also wrote a hard-coded
    'p_cat_clean.csv' and copied it to Drive on every call, which would
    overwrite the plain backup with this typed format.

    Parameters
    ----------
    mdf : pandas.DataFrame
        Frame to save; it is not modified.
    path : str
        Destination file path.
    """
    # One-row frame of dtype names, using the same columns as the data.
    dtype_row = pd.DataFrame([mdf.dtypes.astype(str)], columns=mdf.columns)
    # Put the dtype row first. Concatenating works for any index type and
    # keeps the data rows in their existing order.
    typed = pd.concat([dtype_row, mdf], ignore_index=True)
    typed.to_csv(path, index=False)

def read_csv(path):
    """Read a CSV whose first data row holds the dtype name of each column.

    This is the layout written by ``to_csv`` in this notebook. Columns whose
    dtype name contains 'date' are parsed as dates; every other column is read
    with the recorded dtype.

    The file at ``path`` is the one that is read. The previous version
    overwrote the argument with a hard-coded Drive path.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The data rows, without the dtype row.
    """
    # Read the dtype row once, as text, and reuse it for both lookups.
    type_row = pd.read_csv(path, nrows=1, dtype=str).iloc[0].to_dict()
    dtypes = {key: value for key, value in type_row.items() if 'date' not in value}
    parse_dates = [key for key, value in type_row.items() if 'date' in value]
    # skiprows=[1] skips the dtype row (file line 1; line 0 is the header).
    return pd.read_csv(path, dtype=dtypes, parse_dates=parse_dates, skiprows=[1])

In [None]:
# Read back the cleaned backup. CSV does not store dtypes, so the identifier
# columns ('NAICS Category', 'NAICS Code', 'Zip') are read as text again and
# 'Date' is re-parsed as a datetime.
# NOTE(review): this rebinds `path`, which earlier pointed at the input file —
# re-running the input load after this cell would read the cleaned file instead.
path="/content/drive/My Drive/p_cat_clean.csv"
scdf = pd.read_csv(
    path,
    dtype={'NAICS Category': str, 'NAICS Code': str, 'Zip': str},
    parse_dates=['Date'],
)
scdf.sample()

In [None]:
scdf.dtypes

In [None]:
scdf.shape

In [None]:
#
