# Pre-Processing
This notebook will pre-process a cleaned KivaMaxApprover project .csv dataset.

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer, WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk import FreqDist, pos_tag
from sklearn.feature_extraction import text

In [2]:
# #read in data on colab
# kiva = pd.read_csv('/content/drive/MyDrive/Colab/group_project/cleaned_nlp.csv')

#read in data on drive
kiva = pd.read_csv('data/cleaned_nlp.csv')

In [3]:
kiva.head()

Unnamed: 0,LOAN_ID,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,STATUS
0,1455352,The city of Portoviejo is located in the valle...,to purchase natural products.,"#Repeat Borrower, #Health and Sanitation",1
1,1727469,"Lorna is a married woman, 39 years old with fi...","to purchase additional stocks of Avon, Natasha...","#Woman-Owned Business, #Parent",0
2,1747998,Anita is a 32-year-old married woman residing ...,"to purchase lentils, oil, salt, etc. in bulk i...",#Woman-Owned Business,1
3,1342372,"Saeeda is a 45-year-old woman, living with her...",to buy embroidery raw materials such as thread...,"#Fabrics, #Woman-Owned Business, user_favorite...",1
4,1632606,Pablo is an enterprising young man who has the...,to buy a POS (point of sale) terminal that wil...,"#Single, #Technology, #Biz Durable Asset",0


In [4]:
text_columns = ['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS']


In [5]:
# #ensure that all text fields are strings

# for column in text_columns:
#   kiva[column] = kiva[column].astype(str)


### Remove Special Characters

In [6]:
for column in text_columns:
    kiva[column] = kiva[column].replace('[^\w ]','',regex=True).astype(str) 

### Tokenizer & Lowercase

In [7]:
w_tokenizer = WhitespaceTokenizer()

In [8]:
for column in text_columns: 
    kiva[column] = kiva[column].apply(lambda x: w_tokenizer.tokenize(x.lower()))

### Remove English Stopwords

In [9]:
stop_words = text.ENGLISH_STOP_WORDS
# Remove stopwords function

def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

In [10]:
for column in ['DESCRIPTION_TRANSLATED', 'LOAN_USE']:
    kiva[column] = kiva[column].apply(lambda x: remove_stopwords(x))

In [11]:
kiva['DESCRIPTION_TRANSLATED'][0]

['city',
 'portoviejo',
 'located',
 'valley',
 'portoviejo',
 'river',
 'known',
 'city',
 'royal',
 'tamarinds',
 'affected',
 'earthquake',
 'april',
 '16',
 '2016',
 'city',
 'rich',
 'production',
 'vegetables',
 'legumes',
 'tropical',
 'fruits',
 'local',
 'consumptionbr',
 'br',
 'raisa',
 '25',
 'years',
 'old',
 'commonlaw',
 'relationship',
 'fouryearold',
 'child',
 'school',
 'partner',
 'locksmith',
 'live',
 'family',
 'home',
 'br',
 'br',
 'raisa',
 'works',
 'selling',
 'natures',
 'garden',
 'natural',
 'products',
 'natural',
 'health',
 'beauty',
 'products',
 'father',
 'goes',
 'make',
 'sales',
 'saturdays',
 'sundays',
 'lowincome',
 'neighborhoods',
 'offering',
 'products',
 'credit',
 'collecting',
 'payments',
 'week',
 'customers',
 'buses',
 'carrying',
 'merchandise',
 'br',
 'br',
 'business',
 'years',
 'works',
 'monday',
 'friday',
 'private',
 'business',
 'new',
 'loan',
 'buy',
 'natural',
 'products',
 'br',
 'br',
 'likes',
 'loans',
 'rates',
 

In [12]:
kiva.head()

Unnamed: 0,LOAN_ID,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,STATUS
0,1455352,"[city, portoviejo, located, valley, portoviejo...","[purchase, natural, products]","[repeat, borrower, health, and, sanitation]",1
1,1727469,"[lorna, married, woman, 39, years, old, childr...","[purchase, additional, stocks, avon, natasha, ...","[womanowned, business, parent]",0
2,1747998,"[anita, 32yearold, married, woman, residing, j...","[purchase, lentils, oil, salt, bulk, order, ex...","[womanowned, business]",1
3,1342372,"[saeeda, 45yearold, woman, living, spouse, are...","[buy, embroidery, raw, materials, thread, fanc...","[fabrics, womanowned, business, user_favorite,...",1
4,1632606,"[pablo, enterprising, young, man, drive, forwa...","[buy, pos, point, sale, terminal, make, transa...","[single, technology, biz, durable, asset]",0


In [13]:
# # save the entire dataset
# kiva.to_csv('/content/drive/MyDrive/Colab/group_project/pre_processed_kiva.csv')

#save data on local machine
kiva.to_csv('data/pre_processed_nlp.csv')