# Cleaning

This notebook creates a five-column dataset (the `LOAN_ID` index plus `DESCRIPTION_TRANSLATED`, `LOAN_USE`, `TAGS`, and `STATUS`) with no null values, ready for NLP pre-processing in the KivaMaxApprover project.

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Locations of the raw input CSV and of the cleaned CSV this notebook writes
filename = './data/kivamix.csv'
op_filename = './data/cleaned_nlp.csv'

In [2]:
# Load the raw loans CSV into a dataframe, then preview its first five rows
kiva_large = pd.read_csv(filepath_or_buffer=filename)

kiva_large.head()

Unnamed: 0,LOAN_ID,LOAN_NAME,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,FUNDED_AMOUNT,LOAN_AMOUNT,STATUS,IMAGE_ID,VIDEO_ID,...,NUM_LENDERS_TOTAL,NUM_JOURNAL_ENTRIES,NUM_BULK_ENTRIES,TAGS,BORROWER_NAMES,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,year
0,1455352,Raisa Jokasta,Spanish,"En la ciudad de Portoviejo, conocida como la c...",The city of Portoviejo is located in the valle...,1075.0,1075.0,1,2638561.0,,...,12,1,1,"#Repeat Borrower, #Health and Sanitation",Raisa Jokasta,female,True,monthly,field_partner,2018
1,1727469,Lorna,English,"Lorna is a married woman, 39 years old with fi...","Lorna is a married woman, 39 years old with fi...",225.0,400.0,0,3108106.0,,...,8,1,1,"#Woman-Owned Business, #Parent",Lorna,female,True,monthly,field_partner,2019
2,1747998,Anita,English,Anita is a 32-year-old married woman residing ...,Anita is a 32-year-old married woman residing ...,300.0,300.0,1,3134774.0,,...,9,1,1,#Woman-Owned Business,Anita,female,True,monthly,field_partner,2019
3,1342372,Saeeda,English,"Saeeda is a 45-year-old woman, living with her...","Saeeda is a 45-year-old woman, living with her...",300.0,300.0,1,2588292.0,,...,9,2,1,"#Fabrics, #Woman-Owned Business, user_favorite...",Saeeda,female,True,monthly,field_partner,2017
4,1632606,JUAN PABLO,Spanish,Pablo es un joven muy emprendedor y con muchas...,Pablo is an enterprising young man who has the...,225.0,850.0,0,2981723.0,,...,8,1,1,"#Single, #Technology, #Biz Durable Asset",JUAN PABLO,male,True,monthly,field_partner,2018


In [None]:
# Keep only the fields relevant to NLP and key the frame by LOAN_ID
nlp_columns = ['LOAN_ID', 'DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS', 'STATUS']
kiva = kiva_large[nlp_columns].set_index('LOAN_ID')
kiva.head()

Unnamed: 0_level_0,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,STATUS
LOAN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1455352,The city of Portoviejo is located in the valle...,to purchase natural products.,"#Repeat Borrower, #Health and Sanitation",1
1727469,"Lorna is a married woman, 39 years old with fi...","to purchase additional stocks of Avon, Natasha...","#Woman-Owned Business, #Parent",0
1747998,Anita is a 32-year-old married woman residing ...,"to purchase lentils, oil, salt, etc. in bulk i...",#Woman-Owned Business,1
1342372,"Saeeda is a 45-year-old woman, living with her...",to buy embroidery raw materials such as thread...,"#Fabrics, #Woman-Owned Business, user_favorite...",1
1632606,Pablo is an enterprising young man who has the...,to buy a POS (point of sale) terminal that wil...,"#Single, #Technology, #Biz Durable Asset",0


In [None]:
# Per-column count of missing values, taken before any rows are dropped or filled
kiva.isna().sum()

DESCRIPTION_TRANSLATED     3253
LOAN_USE                   3246
TAGS                      71842
STATUS                        0
dtype: int64

In [None]:
# (rows, columns) of the NLP subset before any rows are dropped
kiva.shape

(419156, 4)

In [None]:
# A few hundred rows are null in all three text columns, i.e. they carry no text at all.
# Drop those rows, since they hold no language data.
# Note to self/team: presumably these loans are still captured by the numerical model?
text_columns = ['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS']
kiva = kiva.dropna(subset=text_columns, how='all')
kiva.shape

(418635, 4)

In [None]:
# Count rows whose three text fields repeat those of another row;
# with keep='last' the final occurrence of each group is not flagged
is_repeat = kiva.duplicated(
    subset=['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS'],
    keep='last',
)
is_repeat.sum()

1095

In [None]:
# Remove repeated text entries, retaining only the last occurrence of each
dedupe_columns = ['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS']
kiva = kiva.drop_duplicates(subset=dedupe_columns, keep='last')
kiva.shape

(417540, 4)

In [None]:
#handle null values
# Replace missing text with an empty string, but only in the three free-text
# columns. A blanket fillna('') would also turn a missing STATUS label into ''
# (making that column object dtype) instead of leaving it visible as a null
# in the check below.
text_fill = {col: '' for col in ['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS']}
kiva = kiva.fillna(text_fill)
# Every column should now report zero nulls
kiva.isnull().sum()

DESCRIPTION_TRANSLATED    0
LOAN_USE                  0
TAGS                      0
STATUS                    0
dtype: int64

In [None]:
# Cast the three text columns to str in a single astype call
kiva = kiva.astype({
    'DESCRIPTION_TRANSLATED': str,
    'LOAN_USE': str,
    'TAGS': str,
})

In [4]:


#local machine
# Write the cleaned frame (LOAN_ID index plus the four columns) to op_filename.
# NOTE(review): empty strings produced by the null-filling step are written as
# empty CSV fields, which pandas' default read_csv parses back as NaN. The
# consuming notebook presumably needs keep_default_na=False (or to re-fill
# nulls) - confirm.
kiva.to_csv(op_filename)