# Sample Preprocessing

This notebook draws a balanced sample of 10,000 observations (5,000 per `STATUS` class), preprocesses the text columns, and saves the result as `pre_processed_sample.csv`. The preprocessing steps are:


1.   Remove Special Characters
2.   Tokenize
3.   Remove Stopwords



In [None]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer, WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction import text



In [None]:
#read in data
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive;
# no mount call is visible in this notebook -- confirm before running on a fresh runtime.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which suggests the
# CSV was written with its row index included -- confirm whether that column is needed.
kiva = pd.read_csv('/content/drive/MyDrive/Colab/group_project/cleaned_nlp.csv')

In [None]:
# Preview the first five rows to check which columns were loaded.
kiva.head()

Unnamed: 0,LOAN_ID,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,STATUS
0,1455352,The city of Portoviejo is located in the valle...,to purchase natural products.,"#Repeat Borrower, #Health and Sanitation",1
1,1727469,"Lorna is a married woman, 39 years old with fi...","to purchase additional stocks of Avon, Natasha...","#Woman-Owned Business, #Parent",0
2,1747998,Anita is a 32-year-old married woman residing ...,"to purchase lentils, oil, salt, etc. in bulk i...",#Woman-Owned Business,1
3,1342372,"Saeeda is a 45-year-old woman, living with her...",to buy embroidery raw materials such as thread...,"#Fabrics, #Woman-Owned Business, user_favorite...",1
4,1632606,Pablo is an enterprising young man who has the...,to buy a POS (point of sale) terminal that wil...,"#Single, #Technology, #Biz Durable Asset",0


In [None]:
#ensure that all text fields are strings

# Free-text columns that the rest of the notebook preprocesses.
text_columns = ['DESCRIPTION_TRANSLATED', 'LOAN_USE', 'TAGS']

# Empty CSV cells are read as float NaN. A bare astype(str) would turn them into
# the literal string 'nan', which would later be tokenized as if it were a real
# word. Replace missing values with an empty string before casting so that they
# end up as empty token lists instead.
for column in text_columns:
  kiva[column] = kiva[column].fillna('').astype(str)


### **Create a Subsample of Data to Run Early Models**

In [None]:
# Draw 5,000 rows from each STATUS group so every class is equally represented.
# The fixed random_state makes the draw repeatable: without it, each
# Restart & Run All would select (and save) a different sample.
sample = kiva.groupby('STATUS').sample(n = 5000, random_state = 42)

### Remove Special Characters

In [None]:
# Delete every character that is neither a word character nor whitespace.
# - The pattern is a raw string so that \w and \s reach the regex engine intact;
#   in a plain string literal they are invalid escape sequences.
# - \s is used instead of a literal space so that tabs and newlines are kept as
#   separators. Deleting them would glue the words on either side together
#   before the whitespace tokenizer runs.
for column in text_columns:
  sample[column] = sample[column].replace(r'[^\w\s]', '', regex=True).astype(str)

### Tokenizer

In [None]:
# NLTK tokenizer that splits text on whitespace; used to tokenize each text column.
w_tokenizer = WhitespaceTokenizer()

In [None]:
# Lower-case every entry of each text column and split it into a list of
# whitespace-separated tokens, overwriting the column with those lists.
for column in text_columns:
  raw_series = sample[column]
  sample[column] = raw_series.map(lambda entry: w_tokenizer.tokenize(entry.lower()))

### Remove English Stopwords

In [None]:
# English stopword collection shipped with scikit-learn's text module.
stop_words = text.ENGLISH_STOP_WORDS

# Remove stopwords function
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not stopwords, in their original order.

    Parameters
    ----------
    text : iterable of tokens (here, the token lists produced by the tokenizer).
        Note: inside this function the name ``text`` refers to this argument,
        not to the ``sklearn.feature_extraction.text`` module imported above.

    Returns
    -------
    list
        The tokens that do not appear in ``stop_words``.
    """
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

In [None]:
# Filter stopwords out of the two free-text columns; TAGS is not processed here.
for column in ('DESCRIPTION_TRANSLATED', 'LOAN_USE'):
  sample[column] = sample[column].map(remove_stopwords)

In [None]:
# Write the preprocessed sample to Drive as a CSV file.
# NOTE(review): to_csv writes the row index by default, so reading this file back
# produces an extra unnamed index column unless the reader passes index_col=0.
# NOTE(review): the tokenized columns hold Python lists, which are stored in the CSV
# as their string representation -- presumably downstream code parses them back; confirm.
sample.to_csv('/content/drive/MyDrive/Colab/group_project/pre_processed_sample.csv')