# Sentiment Analysis

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction import text

#modeling
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB

In [2]:
# set input and output filenames

filename = ('data/kenya_cleaned_nlp.csv')
op_filename = ('data/kenya_sentiment.csv')

In [3]:
kiva = pd.read_csv(filename)

In [4]:
kiva.head()

Unnamed: 0,LOAN_ID,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,STATUS
0,1799331,Dinnah is 43 years of age and a proud mother o...,to buy farm inputs such as seeds and fertilize...,#Parent,1
1,1294719,Resy is a married woman and has been blessed w...,to purchase chicks and poultry feed.,"#Animals, #Woman-Owned Business, volunteer_pic...",1
2,1595847,Lavenda is happily married and has been blesse...,to add stock of beauty products to her salon,"user_favorite, #Parent, #Woman-Owned Business",0
3,1139606,Hadija is a Kiva borrower. She describes herse...,"to buy more stock of vegetables, flour, sugar,...","#Repeat Borrower, #Woman-Owned Business",1
4,1813411,"Purity, aged 28, is a lovely mother of two chi...",to purchase high-quality seeds and nutrient-ri...,,1


### Remove Special Characters

In [5]:
kiva['DESCRIPTION_TRANSLATED'] = kiva['DESCRIPTION_TRANSLATED'].replace('[^\w ]','',regex=True).astype(str) 

In [6]:
kiva['DESCRIPTION_TRANSLATED'].head()

0    Dinnah is 43 years of age and a proud mother o...
1    Resy is a married woman and has been blessed w...
2    Lavenda is happily married and has been blesse...
3    Hadija is a Kiva borrower She describes hersel...
4    Purity aged 28 is a lovely mother of two child...
Name: DESCRIPTION_TRANSLATED, dtype: object

In [7]:
kiva['STATUS'].value_counts()

1    39671
0    11018
Name: STATUS, dtype: int64

### Set up Sentiment Analyzer

In [8]:
# Instantiate Sentiment Intensity Analyzer
sentiment = SentimentIntensityAnalyzer()

In [9]:
kiva['DESCRIPTION_TRANSLATED'][0]

'Dinnah is 43 years of age and a proud mother of 5 of whom  4 are schoolgoing She is an active woman who works in agriculture a skill that she inherited from her parents br br In addition to engaging in some small business activities Dinnah has been a very hardworking mixed farmer for the past few years growing vegetables and maize and keeping dairy cows Through these activities she has been able to earn a decent and honest income br br Although Dinnah has been making profits through farming not everything has been easy for this humble and industrious mother Her farm has not been producing as much of a harvest lately due to a lack of fertilizer and good seeds and the presence of pests and diseases br br That is why she is seeking a loan to buy farm inputs such as fertilizers and seeds so as to expand her crop farm br br Through this loan Dinnah will be assured of a bumper harvest which will mean more profits and income Using this loan she will be able to expand her enterprise br br Din

In [10]:
sentiment.polarity_scores(kiva['DESCRIPTION_TRANSLATED'][0])

{'neg': 0.01, 'neu': 0.83, 'pos': 0.16, 'compound': 0.9833}

In [11]:
polarity_list = []
for x in kiva['DESCRIPTION_TRANSLATED']:
    dict_keys =  sentiment.polarity_scores(x).keys()
    dict_values = sentiment.polarity_scores(x).values()
    polarity_dict = dict(zip(dict_keys,dict_values))
    polarity_list.append(polarity_dict)

In [12]:
polarity_df = pd.DataFrame(polarity_list)

In [13]:
polarity_df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.01,0.83,0.16,0.9833
1,0.025,0.84,0.135,0.9022
2,0.016,0.696,0.288,0.9747
3,0.024,0.881,0.095,0.7906
4,0.034,0.78,0.186,0.989


In [14]:
polarity_df['LOAN_ID'] = kiva['LOAN_ID']
polarity_df['STATUS'] = kiva['STATUS']
polarity_df.set_index('LOAN_ID', drop=True, inplace=True)

In [15]:
polarity_df.columns

Index(['neg', 'neu', 'pos', 'compound', 'STATUS'], dtype='object')

In [16]:
polarity_df['STATUS'].value_counts()

1    39671
0    11018
Name: STATUS, dtype: int64

In [17]:
polarity_df.describe()

Unnamed: 0,neg,neu,pos,compound,STATUS
count,50689.0,50689.0,50689.0,50689.0,50689.0
mean,0.017199,0.8324,0.150405,0.877308,0.782635
std,0.018573,0.065089,0.062073,0.196407,0.412457
min,0.0,0.586,0.0,-0.9705,0.0
25%,0.0,0.788,0.107,0.8625,1.0
50%,0.014,0.827,0.149,0.9412,1.0
75%,0.028,0.877,0.193,0.978,1.0
max,0.231,1.0,0.394,0.999,1.0


### Modeling

In [18]:
X = polarity_df.drop(columns='STATUS')
y = polarity_df['STATUS']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [20]:
# scale

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

### Run Through Classifiers

In [22]:
#create a list of the model classes to test
model_list = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    BaggingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    ExtraTreesClassifier(),
    BernoulliNB()
    
]

In [23]:
#create a list to store each model's results in a dictionary
classifier_list = []

for model_obj in model_list:
    #instantiate each model 
    model = model_obj

    #fit the model
    model.fit(X_train_sc, y_train) 
  
    #create a dictionary with scores and evaluation metrics for each model
    results_dict = {}    
    results_dict['model_name'] = str(model_obj)
    results_dict['train_score'] = model.score(X_train_sc, y_train)
    results_dict['test_score'] = model.score(X_test_sc, y_test)
    results_dict['cv_score'] = cross_val_score(model, X_train_sc, y_train, cv = 3).mean()
        
    #add the dictionary to the list
    classifier_list.append(results_dict)

#create a dataframe and display dictionary
desc_results = pd.DataFrame(classifier_list)
desc_results

Unnamed: 0,model_name,train_score,test_score,cv_score
0,LogisticRegression(),0.782644,0.782609,0.782644
1,DecisionTreeClassifier(),0.948601,0.680739,0.677662
2,BaggingClassifier(),0.938263,0.727373,0.716935
3,RandomForestClassifier(),0.948601,0.742839,0.739005
4,AdaBoostClassifier(),0.782565,0.782451,0.782539
5,ExtraTreesClassifier(),0.948601,0.721692,0.721959
6,BernoulliNB(),0.782644,0.782609,0.782644


In [24]:
#save polarity scores on local machine
polarity_df.to_csv(op_filename)