In [None]:
import pandas as pd

# Ask IPython to echo every top-level expression statement in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# NOTE(review): the two lines below repeat the setting above verbatim and
# have no additional effect.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the Kiva loans CSV into the working frame.
kiva_csv_path = "data/kiva_cleaned.csv"
kiva_df = pd.read_csv(kiva_csv_path)

# Summarise column dtypes and non-null counts.
kiva_df.info()

In [None]:
# Discard every row that has a missing value in any column, then renumber
# the remaining rows from 0 without keeping the old index as a column.
kiva_df = kiva_df.dropna().reset_index(drop=True)

In [None]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped.
kiva_df.info()

In [None]:
# Preview the first rows of the cleaned frame.
kiva_df.head()

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import unidecode

raw_stop_words = stopwords.words('english') + stopwords.words('spanish')

# preprocess() transliterates text to ASCII before filtering, so the stop-word
# set also needs the accent-free spelling of each entry (e.g. "también" ->
# "tambien"); the original spellings are kept so the set is a strict superset.
stop_words = set(raw_stop_words) | {unidecode.unidecode(w) for w in raw_stop_words}

lemmer = WordNetLemmatizer()

def preprocess(x):
    """Normalise one document into a space-joined string of lemmatised tokens.

    Steps, in order:
      1. transliterate to ASCII and lowercase;
      2. remove punctuation (apostrophes are kept until step 4);
      3. remove digits;
      4. split on whitespace and drop stop words, matching contractions such
         as "don't" while they still carry their apostrophe;
      5. lemmatise the surviving tokens with WordNet.

    Parameters
    ----------
    x : str
        Raw document text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string if none survive).
    """
    # Transliterate first: unidecode can emit capitals and ASCII punctuation,
    # which the following steps must still get a chance to remove.
    x = unidecode.unidecode(x).lower()

    # Keep apostrophes for now; stripping them here would turn "don't" into
    # "dont", which no longer matches any stop-word entry.
    x = re.sub(r"[^\w\s']", '', x)

    x = re.sub(r'\d+', '', x)

    kept = []
    for w in x.split():
        # Quotes wrapped around a word ('the') must not hide a stop word.
        w = w.strip("'")
        if w in stop_words:
            continue

        # Now finish punctuation removal and re-check the final token form.
        w = w.replace("'", '')
        if w and w not in stop_words:
            kept.append(lemmer.lemmatize(w))

    return ' '.join(kept)

kiva_df['en_clean_pre'] = kiva_df['en_clean'].apply(preprocess)

In [None]:
# Preview the frame again; it now includes the en_clean_pre column.
kiva_df.head()

In [None]:
# Text of the first document before preprocessing.
kiva_df['en_clean'].iloc[0]

In [None]:
# Text of the first document after preprocessing.
kiva_df['en_clean_pre'].iloc[0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 3-grams. Terms appearing in more than 50% or fewer than 5%
# of documents are discarded, and at most 1000 terms are kept.
# ngram_range is documented as a tuple; scikit-learn releases that validate
# constructor parameters reject a list here.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.05, max_features=1000, ngram_range=(1, 3))

# Learn the vocabulary and build the document-term matrix in one pass.
dtm = vectorizer.fit_transform(kiva_df['en_clean_pre'])

In [None]:
# (number of documents, number of retained terms) of the TF-IDF matrix.
dtm.shape

In [None]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` replaces it. Fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab_terms = vectorizer.get_feature_names_out().tolist()
else:
    vocab_terms = vectorizer.get_feature_names()

vocab_terms

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    bow_columns = vectorizer.get_feature_names_out().tolist()
else:
    bow_columns = vectorizer.get_feature_names()

# Dense TF-IDF matrix as a frame: one column per term, rows aligned to kiva_df.
bow_df = pd.DataFrame(dtm.toarray(),
                      columns=bow_columns,
                      index=kiva_df.index)

# NOTE(review): if a vocabulary term equals an existing kiva_df column name
# (e.g. "status" or "country"), the concat below yields duplicate column
# labels -- confirm this cannot happen for this corpus.
kiva_df_bow = pd.concat([kiva_df, bow_df], axis=1)
kiva_df_bow.shape
kiva_df_bow.head()

In [None]:
import textstat
  
# Simple length and readability features computed on the en_clean text.
kiva_df['len'] = kiva_df['en_clean'].apply(len)
kiva_df['syllable_count'] = kiva_df['en_clean'].apply(textstat.syllable_count)
kiva_df['flesch_reading_ease'] = kiva_df['en_clean'].apply(textstat.flesch_reading_ease)

# kiva_df_bow was assembled from kiva_df before these columns existed, so
# without this copy the features would never reach the model trained on
# kiva_df_bow. Both frames share the same index, so assignment aligns row by row.
for text_feature in ('len', 'syllable_count', 'flesch_reading_ease'):
    kiva_df_bow[text_feature] = kiva_df[text_feature]

kiva_df.head()

In [None]:
# Columns that are not used as model inputs: identifiers, metadata and the
# raw/intermediate text the TF-IDF columns were derived from.
non_feature_cols = [
    'loan_id', 'sector', 'country', 'gender',
    'loan_amount', 'nonpayment', 'en', 'en_clean', 'en_clean_pre',
]

kiva_df_bow = kiva_df_bow.drop(columns=non_feature_cols)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is `status`.
X = kiva_df_bow.drop(columns=['status'])
y = kiva_df_bow['status']

# Keep the column labels for the importance listing and tree rendering.
feature_names = X.columns

# Hold out a test set (library default proportion) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# A shallow, regularised tree: depth capped at 6, and no split or leaf may
# involve fewer than 10 training samples. Seeded for repeatable fits.
tree_params = dict(
    max_depth=6,
    min_samples_leaf=10,
    min_samples_split=10,
    random_state=42,
)
clf = DecisionTreeClassifier(**tree_params)

clf.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_dt = clf.predict(X_test)

In [None]:
# Importance score of each feature in the fitted tree, in the column order of X.
clf.feature_importances_

In [None]:
# Raw (un-normalised) importances straight from the underlying tree object.
imp = clf.tree_.compute_feature_importances(normalize=False)

# Positions of the 15 largest importances, in ascending order of importance.
ranked_pairs = sorted(enumerate(imp), key=lambda pair: pair[1])
ind = [position for position, _ in ranked_pairs][-15:]

imp[ind]
feature_names[ind]

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred_dt)

In [None]:
from sklearn.metrics import classification_report

# The report expects string labels; the fitted classes may be non-strings.
class_names = list(map(str, clf.classes_))

report_text = classification_report(
    y_true=y_test,
    y_pred=y_pred_dt,
    target_names=class_names,
)
print(report_text)

In [None]:
# Needs sklearn 0.21 or higher
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=[10,5]);
# feature_names is a pandas Index; convert it to a plain list, as the
# export_text call in this notebook already does, because some scikit-learn
# releases only accept a list for this parameter.
plot_tree(clf, filled=True, feature_names=list(feature_names), label='root', fontsize=10)
plt.show();

In [None]:
from sklearn.tree import export_text

# Plain-text rendering of the fitted tree's decision rules.
tree_rules = export_text(clf, feature_names=list(feature_names))
print(tree_rules)