# Sentiment Analysis

In [31]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#nlp
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction import text

#modeling
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

In [32]:
# #read in data on colab
# kiva = pd.read_csv('/content/drive/MyDrive/Colab/group_project/cleaned_nlp.csv')

#read in data on drive
df = pd.read_csv('data/cleaned_nlp.csv').drop(columns=['LOAN_USE', 'TAGS'])

In [33]:
df.head()

Unnamed: 0,LOAN_ID,DESCRIPTION_TRANSLATED,STATUS
0,1455352,The city of Portoviejo is located in the valle...,1
1,1727469,"Lorna is a married woman, 39 years old with fi...",0
2,1747998,Anita is a 32-year-old married woman residing ...,1
3,1342372,"Saeeda is a 45-year-old woman, living with her...",1
4,1632606,Pablo is an enterprising young man who has the...,0


### Remove Special Characters

In [34]:
df['DESCRIPTION_TRANSLATED'] = df['DESCRIPTION_TRANSLATED'].replace('[^\w ]','',regex=True).astype(str) 

In [35]:
df['DESCRIPTION_TRANSLATED'].head()

0    The city of Portoviejo is located in the valle...
1    Lorna is a married woman 39 years old with fiv...
2    Anita is a 32yearold married woman residing in...
3    Saeeda is a 45yearold woman living with her sp...
4    Pablo is an enterprising young man who has the...
Name: DESCRIPTION_TRANSLATED, dtype: object

### Grab a Sample

In [36]:
kiva = df.sample(n=150_000, replace=False, random_state=42).reset_index(drop=True)

kiva['STATUS'].value_counts()

1    125052
0     24948
Name: STATUS, dtype: int64

### Set up Sentiment Analyzer

In [37]:
# Instantiate Sentiment Intensity Analyzer
sentiment = SentimentIntensityAnalyzer()

In [38]:
kiva['DESCRIPTION_TRANSLATED'][0]

'This is Chelangat a dairy farmer in Kericho She is married with children She has been involved in a women group where they get to learn on better animal husbandry She has been keeping animals as a source of income and she sells the milk produce to a cooperatives in her village The income she receives she uses it to educate her children and provide for her household rnrnShe wants to expand her dairy farm and with a Kiva LOAN she will buy one cow and build a cow shed'

In [39]:
sentiment.polarity_scores(kiva['DESCRIPTION_TRANSLATED'][0])

{'neg': 0.0, 'neu': 0.94, 'pos': 0.06, 'compound': 0.6369}

In [40]:
polarity_list = []
for x in kiva['DESCRIPTION_TRANSLATED']:
    dict_keys =  sentiment.polarity_scores(x).keys()
    dict_values = sentiment.polarity_scores(x).values()
    polarity_dict = dict(zip(dict_keys,dict_values))
    polarity_list.append(polarity_dict)

In [41]:
polarity_df = pd.DataFrame(polarity_list)

In [42]:
polarity_df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.94,0.06,0.6369
1,0.0,0.839,0.161,0.9371
2,0.0,0.727,0.273,0.9741
3,0.0,0.874,0.126,0.9231
4,0.0,0.881,0.119,0.7964


In [43]:
polarity_df['LOAN_ID'] = kiva['LOAN_ID']
polarity_df['STATUS'] = kiva['STATUS']
polarity_df.set_index('LOAN_ID', drop=True, inplace=True)

In [44]:
polarity_df.columns

Index(['neg', 'neu', 'pos', 'compound', 'STATUS'], dtype='object')

In [45]:
polarity_df['STATUS'].value_counts()

1    125052
0     24948
Name: STATUS, dtype: int64

In [46]:
polarity_df.describe()

Unnamed: 0,neg,neu,pos,compound,STATUS
count,150000.0,150000.0,150000.0,150000.0,150000.0
mean,0.017104,0.854421,0.128472,0.804411,0.83368
std,0.021326,0.060276,0.057582,0.268131,0.372369
min,0.0,0.531,0.0,-0.9975,0.0
25%,0.0,0.817,0.089,0.7717,1.0
50%,0.012,0.858,0.123,0.9002,1.0
75%,0.026,0.897,0.164,0.9555,1.0
max,0.296,1.0,0.469,0.9998,1.0


### Modeling

In [47]:
X = polarity_df.drop(columns='STATUS')
y = polarity_df['STATUS']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [49]:
# scale

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

### Run Through Classifiers

In [50]:
#create a list of the model classes to test
model_list = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    BaggingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    SVC(),
    BernoulliNB()
    
]

In [51]:
#create a list to store each model's results in a dictionary
classifier_list = []

for model_obj in model_list:
    #instantiate each model 
    model = model_obj

    #fit the model
    model.fit(X_train_sc, y_train) 
  
    #create a dictionary with scores and evaluation metrics for each model
    results_dict = {}    
    results_dict['model_name'] = str(model_obj)
    results_dict['train_score'] = model.score(X_train_sc, y_train)
    results_dict['test_score'] = model.score(X_test_sc, y_test)
    results_dict['cv_score'] = cross_val_score(model, X_train_sc, y_train, cv = 3).mean()
        
    #add the dictionary to the list
    classifier_list.append(results_dict)

#create a dataframe and display dictionary
desc_results = pd.DataFrame(classifier_list)
desc_results

Unnamed: 0,model_name,train_score,test_score,cv_score
0,LogisticRegression(),0.83368,0.83368,0.83368
1,DecisionTreeClassifier(),0.953707,0.73552,0.732853
2,BaggingClassifier(),0.944613,0.773147,0.772418
3,RandomForestClassifier(),0.953707,0.793947,0.79336
4,AdaBoostClassifier(),0.83368,0.83368,0.83368
5,SVC(),0.833698,0.83368,0.83368
6,BernoulliNB(),0.83368,0.83368,0.83368


In [56]:
#save polarity scores on local machine
polarity_df.to_csv('data/sentiment.csv')