# Cleaning

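This notebook creates a dataframe with no null values for NLP pre-processing in the KivaMaxApprover project. The dataframe is indexed by `LOAN_ID` and has four columns (`DESCRIPTION_TRANSLATED`, `LOAN_USE`, `TAGS`, `STATUS`), so the CSV it writes has five columns.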

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the full Kiva export from the shared Drive folder and preview the
# first five rows.
raw_csv_path = '/content/drive/MyDrive/Colab/group_project/kivamix.csv'
kiva_large = pd.read_csv(raw_csv_path)
kiva_large.head()

In [None]:
# Narrow the raw frame to the columns relevant to NLP (the three text fields
# plus STATUS) and key the rows by LOAN_ID.
nlp_columns = ['LOAN_ID', 'DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS', 'STATUS']
kiva = kiva_large[nlp_columns].set_index('LOAN_ID')
kiva.head()

Unnamed: 0_level_0,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,STATUS
LOAN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1455352,The city of Portoviejo is located in the valle...,to purchase natural products.,"#Repeat Borrower, #Health and Sanitation",1
1727469,"Lorna is a married woman, 39 years old with fi...","to purchase additional stocks of Avon, Natasha...","#Woman-Owned Business, #Parent",0
1747998,Anita is a 32-year-old married woman residing ...,"to purchase lentils, oil, salt, etc. in bulk i...",#Woman-Owned Business,1
1342372,"Saeeda is a 45-year-old woman, living with her...",to buy embroidery raw materials such as thread...,"#Fabrics, #Woman-Owned Business, user_favorite...",1
1632606,Pablo is an enterprising young man who has the...,to buy a POS (point of sale) terminal that wil...,"#Single, #Technology, #Biz Durable Asset",0


In [None]:
# Per-column count of missing values before any cleaning is applied.
kiva.isna().sum()

DESCRIPTION_TRANSLATED     3253
LOAN_USE                   3246
TAGS                      71842
STATUS                        0
dtype: int64

In [None]:
# (rows, columns) before cleaning, as a baseline for the row drops that follow.
kiva.shape

(419156, 4)

In [None]:
# A few hundred rows are null in all three text columns, i.e. they contain no
# language data at all. Remove them; rows with at least one text field stay.
# TODO(team): confirm these loans are still captured by the numerical model.
text_columns = ['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS']
kiva = kiva.dropna(axis='index', how='all', subset=text_columns)
kiva.shape

(418635, 4)

In [None]:
# Count rows whose three text fields are repeated by a later row. With
# keep='last' the final occurrence of each group is not flagged.
is_repeat = kiva.duplicated(
    subset=['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS'],
    keep='last',
)
is_repeat.sum()

1095

In [None]:
# Remove rows with identical text fields, keeping only the last occurrence.
# STATUS is not part of the key, so rows that share text but differ in STATUS
# collapse to whichever one appears last.
dedupe_keys = ['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS']
kiva = kiva.drop_duplicates(subset=dedupe_keys, keep='last')
kiva.shape

(417540, 4)

In [None]:
# Handle null values: replace missing text with an empty string.
# The fill is limited to the three text columns. A frame-wide fillna('')
# would also write '' into STATUS if it ever contained missing values,
# turning the 0/1 column into mixed types and hiding the problem from the
# null check below. Columns not listed in the dict are left untouched.
text_fill = {'DESCRIPTION_TRANSLATED': '', 'LOAN_USE': '', 'TAGS': ''}
kiva = kiva.fillna(value=text_fill)
# Every column should report 0 here; a non-zero STATUS count means the source
# data has unlabeled rows that need an explicit decision.
kiva.isnull().sum()

DESCRIPTION_TRANSLATED    0
LOAN_USE                  0
TAGS                      0
STATUS                    0
dtype: int64

In [None]:
# Cast each text column to str so every value is a plain Python string.
for text_col in ('DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS'):
    kiva[text_col] = kiva[text_col].astype(str)

In [None]:
# Write the cleaned frame to the shared Drive folder; the LOAN_ID index is
# written as the first CSV column.
# NOTE: empty strings are stored as empty fields, and pd.read_csv parses empty
# fields as NaN by default. Load this file with keep_default_na=False (or
# re-apply fillna('') after loading) to keep the text columns null-free.
cleaned_csv_path = '/content/drive/MyDrive/Colab/group_project/cleaned_nlp.csv'
kiva.to_csv(cleaned_csv_path)