# Pre-Processing
This notebook will pre-process a cleaned KivaMaxApprover project .csv dataset.

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer, WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk import FreqDist, pos_tag
from sklearn.feature_extraction import text

In [2]:
#establish input & output filenames

filename = ('data/kenya_cleaned_nlp.csv')
op_filename = ('data/kenya_pre_processed_nlp.csv')

In [3]:
# read in data 
kiva = pd.read_csv(filename)

In [4]:
kiva.head()

Unnamed: 0,LOAN_ID,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,STATUS
0,1799331,Dinnah is 43 years of age and a proud mother o...,to buy farm inputs such as seeds and fertilize...,#Parent,1
1,1294719,Resy is a married woman and has been blessed w...,to purchase chicks and poultry feed.,"#Animals, #Woman-Owned Business, volunteer_pic...",1
2,1595847,Lavenda is happily married and has been blesse...,to add stock of beauty products to her salon,"user_favorite, #Parent, #Woman-Owned Business",0
3,1139606,Hadija is a Kiva borrower. She describes herse...,"to buy more stock of vegetables, flour, sugar,...","#Repeat Borrower, #Woman-Owned Business",1
4,1813411,"Purity, aged 28, is a lovely mother of two chi...",to purchase high-quality seeds and nutrient-ri...,,1


In [5]:
text_columns = ['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS']


In [6]:
# #ensure that all text fields are strings

# for column in text_columns:
#   kiva[column] = kiva[column].astype(str)


### Remove Special Characters

In [7]:
for column in text_columns:
    kiva[column] = kiva[column].replace('[^\w ]','',regex=True).astype(str) 

### Tokenizer & Lowercase

In [8]:
w_tokenizer = WhitespaceTokenizer()

In [9]:
for column in text_columns: 
    kiva[column] = kiva[column].apply(lambda x: w_tokenizer.tokenize(x.lower()))

### Remove English Stopwords

In [10]:
stop_words = text.ENGLISH_STOP_WORDS
# Remove stopwords function

def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

In [11]:
for column in ['DESCRIPTION_TRANSLATED', 'LOAN_USE']:
    kiva[column] = kiva[column].apply(lambda x: remove_stopwords(x))

In [12]:
kiva['DESCRIPTION_TRANSLATED'][0]

['dinnah',
 '43',
 'years',
 'age',
 'proud',
 'mother',
 '5',
 '4',
 'schoolgoing',
 'active',
 'woman',
 'works',
 'agriculture',
 'skill',
 'inherited',
 'parents',
 'br',
 'br',
 'addition',
 'engaging',
 'small',
 'business',
 'activities',
 'dinnah',
 'hardworking',
 'mixed',
 'farmer',
 'past',
 'years',
 'growing',
 'vegetables',
 'maize',
 'keeping',
 'dairy',
 'cows',
 'activities',
 'able',
 'earn',
 'decent',
 'honest',
 'income',
 'br',
 'br',
 'dinnah',
 'making',
 'profits',
 'farming',
 'easy',
 'humble',
 'industrious',
 'mother',
 'farm',
 'producing',
 'harvest',
 'lately',
 'lack',
 'fertilizer',
 'good',
 'seeds',
 'presence',
 'pests',
 'diseases',
 'br',
 'br',
 'seeking',
 'loan',
 'buy',
 'farm',
 'inputs',
 'fertilizers',
 'seeds',
 'expand',
 'crop',
 'farm',
 'br',
 'br',
 'loan',
 'dinnah',
 'assured',
 'bumper',
 'harvest',
 'mean',
 'profits',
 'income',
 'using',
 'loan',
 'able',
 'expand',
 'enterprise',
 'br',
 'br',
 'dinnah',
 'hopes',
 'consider',


In [13]:
kiva.head()

Unnamed: 0,LOAN_ID,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,STATUS
0,1799331,"[dinnah, 43, years, age, proud, mother, 5, 4, ...","[buy, farm, inputs, seeds, fertilizer, expand,...",[parent],1
1,1294719,"[resy, married, woman, blessed, kids, attend, ...","[purchase, chicks, poultry, feed]","[animals, womanowned, business, volunteer_pick...",1
2,1595847,"[lavenda, happily, married, blessed, children,...","[add, stock, beauty, products, salon]","[user_favorite, parent, womanowned, business]",0
3,1139606,"[hadija, kiva, borrower, describes, honest, op...","[buy, stock, vegetables, flour, sugar, soap, i...","[repeat, borrower, womanowned, business]",1
4,1813411,"[purity, aged, 28, lovely, mother, children, c...","[purchase, highquality, seeds, nutrientrich, f...",[nan],1


#### Save File

In [14]:
#save data 
kiva.to_csv(op_filename)