# NLP Models with Tf-IDF Vectorizer
This notebook has more nlp models with using Tf-IDF Vectorizer. It also has some analysis and models with the TAGS column explicitly.

### Importing Libraries & Loading Data

In [156]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

In [2]:
kiva = pd.read_csv('~/dsi/Group-Project/kiva_sample.csv')

In [3]:
kiva.head()

Unnamed: 0,STATUS,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,all_text
0,1,"Nora Luz is 55 years old, is separated from he...",to buy merchandise.,"#Single, #Elderly, user_favorite, #Woman-Owned...",nora luz 55 year old separated husband life di...
1,0,Steven is a married man with four kids and is ...,to add cattle to his farm.,"#Parent, #Animals, #Schooling, #Biz Durable Asset",steven married man four kid resident sembabule...
2,0,María is 32 years old and she lives in a priva...,"to buy fresh cheese, pork scratchings, corn, b...","#Parent, #Woman-Owned Business, user_favorite",maría 32 year old life privately owned house h...
3,0,Benon is a hardworking 47-year-old married man...,"to buy pipes, tiles, etc. to complete construc...","user_favorite, #Health and Sanitation, #Repair...",benon hardworking 47 year old married man six ...
4,1,"Kousar is a 37-year-old lady, living with her ...",to buy raw material for embroidery such as thr...,"#Fabrics, #Parent, #Woman-Owned Business",kousar 37 year old lady living spouse area mul...


In [3]:
# dropping empty row 
kiva[kiva['all_text'].isnull()]

Unnamed: 0,STATUS,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,all_text
1545,1,,,,


In [4]:
kiva.drop(index=1545, inplace=True)

In [6]:
kiva.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199337 entries, 0 to 199337
Data columns (total 5 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   STATUS                  199337 non-null  int64 
 1   DESCRIPTION_TRANSLATED  199337 non-null  object
 2   LOAN_USE                199337 non-null  object
 3   TAGS                    199337 non-null  object
 4   all_text                199337 non-null  object
dtypes: int64(1), object(4)
memory usage: 9.1+ MB


## Modeling w/Tf-IDF Vectorizer

In [5]:
X = kiva['all_text']
y = kiva['STATUS']

In [7]:
y.value_counts(normalize=True)

1    0.834672
0    0.165328
Name: STATUS, dtype: float64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [8]:
tvec = TfidfVectorizer()
X_train_tvec = tvec.fit_transform(X_train)
X_test_tvec = tvec.transform(X_test)

In [9]:
X_train_tvec.shape

(149502, 88097)

In [10]:
X_test_tvec.shape

(49835, 88097)

In [19]:
# setting params for potential gridsearches
tv_params = {
    'tv__ngram_range': [(1,1), (1,2)],
    'tv__max_df': [0.9, 0.95, 1.0],
}

### Multinomial NB

In [11]:
# instantiating and fitting model
nb = MultinomialNB()
nb.fit(X_train_tvec, y_train)

MultinomialNB()

In [13]:
# scoring model
print(f'Train score: {nb.score(X_train_tvec, y_train)}')
print(f'Test score: {nb.score(X_test_tvec, y_test)}')

Train score: 0.8381025003009993
Test score: 0.8351359486304806


### AdaBoost

In [134]:
# instantiating and fitting model
ada = AdaBoostClassifier()
ada.fit(X_train_tvec, y_train)

AdaBoostClassifier()

In [135]:
# scoring model
print(f'Train score: {ada.score(X_train_tvec, y_train)}')
print(f'Test score: {ada.score(X_test_tvec, y_test)}')

Train score: 0.8323500688953994
Test score: 0.8328684659375941


### GradientBoost

In [145]:
# instantiating and fitting model
gb = GradientBoostingClassifier()
gb.fit(X_train_tvec, y_train)

GradientBoostingClassifier()

In [146]:
# scoring model
print(f'Train score: {gb.score(X_train_tvec, y_train)}')
print(f'Test score: {gb.score(X_test_tvec, y_test)}')

Train score: 0.8394670305413975
Test score: 0.8386274706531555


Each model used with default parameters did not improve the r2_score beyond the null model.

## Tag Analysis
Testing models to see if keeping all words from tag together following vectorizing improves the model.

In [63]:
kiva['TAGS'][0]

'#Single, #Elderly, user_favorite, #Woman-Owned Business, user_favorite, #Interesting Photo, #Repeat Borrower'

In [65]:
# testing process to keep all words from tag following vectorizing
test_replace = kiva['TAGS'][0].replace(' ', '')
test_replace

'#Single,#Elderly,user_favorite,#Woman-OwnedBusiness,user_favorite,#InterestingPhoto,#RepeatBorrower'

In [66]:
# tokenizing test
tokenizer = RegexpTokenizer('\w+|\$[\d.]+|S+')
test_token = [tokenizer.tokenize(test_replace.lower())]

In [67]:
test_token

[['single',
  'elderly',
  'user_favorite',
  'woman',
  'ownedbusiness',
  'user_favorite',
  'interestingphoto',
  'repeatborrower']]

In [74]:
# casting TAGS column to string to avoid erros in processing
kiva['TAGS'] = kiva['TAGS'].astype(str)

In [85]:
# removing spaces within tags and creating new column w/changes to TAGS
kiva['processed_TAGS'] = [row.replace(' ', '') for row in kiva['TAGS']]

In [87]:
# removing dashes within tags 
kiva['processed_TAGS'] = [row.replace('-', '') for row in kiva['processed_TAGS']]

In [89]:
# tokenizing all rows
kiva['processed_TAGS'] = [tokenizer.tokenize(row.lower()) for row in kiva['processed_TAGS']] 

In [91]:
kiva['processed_TAGS'].head()

0    [single, elderly, user_favorite, womanownedbus...
1        [parent, animals, schooling, bizdurableasset]
2          [parent, womanownedbusiness, user_favorite]
3    [user_favorite, healthandsanitation, repairren...
4                [fabrics, parent, womanownedbusiness]
Name: processed_TAGS, dtype: object

In [92]:
# rejoining list of words in each row
kiva['processed_TAGS'] = [' '.join(row) for row in kiva['processed_TAGS']]

In [93]:
kiva['processed_TAGS'][0]

'single elderly user_favorite womanownedbusiness user_favorite interestingphoto repeatborrower'

### Tag Modeling

In [94]:
# defining data variables
X_sk = kiva['processed_TAGS']
y_sk = kiva['STATUS']

In [97]:
y.value_counts(normalize=True)

1    0.834672
0    0.165328
Name: STATUS, dtype: float64

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X_sk, y_sk, stratify=y, random_state=42)

In [119]:
# vectorizing data
tv = TfidfVectorizer()
X_train_tv = tv.fit_transform(X_train)
X_test_tv = tv.transform(X_test)

In [103]:
tv_params

{'tv__ngram_range': [(1, 1), (1, 2)], 'tv__max_df': [0.9, 0.95, 1.0]}

In [136]:
# instantiating and fitting logistic regression model
lr = LogisticRegression(max_iter=1000, C=0.1)
lr.fit(X_train_tv, y_train)

LogisticRegression(C=0.1, max_iter=1000)

In [100]:
# scoring logistic regression
print(f'Train score: {lr.score(X_train_tv, y_train)}')
print(f'Test score: {lr.score(X_test_tv, y_test)}')

Train score: 0.8334069109443352
Test score: 0.8333299889635798


In [137]:
# scoring logistic regression w/changes in C 
print(f'Train score: {lr.score(X_train_tv, y_train)}')
print(f'Test score: {lr.score(X_test_tv, y_test)}')

Train score: 0.8334804885553371
Test score: 0.8337513795525233


No real changes in scores following change of ngram_range and max_df in the vectorizer as well as changing the C value in lr.

### MultinomialNB

In [132]:
# instantiating and fitting multinomialNB 
nb2 = MultinomialNB(alpha=100)
nb2.fit(X_train_tv, y_train)

MultinomialNB(alpha=100)

In [102]:
# scoring model
print(f'Train score: {nb2.score(X_train_tv, y_train)}')
print(f'Test score: {nb2.score(X_test_tv, y_test)}')

Train score: 0.8346711080788217
Test score: 0.8346744256044948


In [133]:
# scoring model
print(f'Train score: {nb2.score(X_train_tv, y_train)}')
print(f'Test score: {nb2.score(X_test_tv, y_test)}')

Train score: 0.8346711080788217
Test score: 0.8346744256044948


No real changes in scores even after changing hyperparameters in vectorizer and MultinomialNB. Scores still in line with null model.

### Bernoulli

In [138]:
bnb = BernoulliNB()
bnb.fit(X_train_tv, y_train)

BernoulliNB()

In [139]:
# scoring model
print(f'Train score: {bnb.score(X_train_tv, y_train)}')
print(f'Test score: {bnb.score(X_test_tv, y_test)}')

Train score: 0.8193669649904349
Test score: 0.821129728102739


### AdaBoost

In [140]:
ada = AdaBoostClassifier()
ada.fit(X_train_tv, y_train)

AdaBoostClassifier()

In [141]:
# scoring model
print(f'Train score: {ada.score(X_train_tv, y_train)}')
print(f'Test score: {ada.score(X_test_tv, y_test)}')

Train score: 0.8321427138098487
Test score: 0.8329085983746363


### Random Forest

In [143]:
rfc = RandomForestClassifier()
rfc.fit(X_train_tv, y_train)

RandomForestClassifier()

In [147]:
# scoring model
print(f'Train score: {rfc.score(X_train_tv, y_train)}')
print(f'Test score: {rfc.score(X_test_tv, y_test)}')

Train score: 0.8562159703549116
Test score: 0.8299187318149894


In [148]:
kiva.head(2)

Unnamed: 0,STATUS,DESCRIPTION_TRANSLATED,LOAN_USE,TAGS,all_text,processed_TAGS
0,1,"Nora Luz is 55 years old, is separated from he...",to buy merchandise.,"#Single, #Elderly, user_favorite, #Woman-Owned...",nora luz 55 year old separated husband life di...,single elderly user_favorite womanownedbusines...
1,0,Steven is a married man with four kids and is ...,to add cattle to his farm.,"#Parent, #Animals, #Schooling, #Biz Durable Asset",steven married man four kid resident sembabule...,parent animals schooling bizdurableasset


In [149]:
kiva['all_textntags'] = (kiva['DESCRIPTION_TRANSLATED']+kiva['LOAN_USE']+kiva['processed_TAGS'])

In [151]:
kiva['all_textntags'][0]

'Nora Luz is 55 years old, is separated from her husband, and lives in the district of Cayaltí in the Chiclayo province of the Lambayeque department of Perú.  She has many good references, both personal and for work.  The residents of this town mostly make a living in agriculture, commerce, and livestock.<br /><br />She makes a living selling costume jewelry, and has several years of experience in this field.  Her business is located in her home.  She is asking for the loan in order to buy merchandise.  In this way, she will be able to continue working and getting her own income.<br /><br />Nora Luz has loan experience; she is a recurrent member of Edpyme Alternativa.to buy merchandise.single elderly user_favorite womanownedbusiness user_favorite interestingphoto repeatborrower'

In [152]:
# removing html breaks
kiva['all_textntags'] = kiva['all_textntags'].str.replace('<br />', ' ')

In [154]:
# Removing punctuation from text
tokenizer = RegexpTokenizer('\w+|\$[\d.]+|S+')
kiva['all_textntags'] = [tokenizer.tokenize(row.lower()) for row in kiva['all_textntags']]

In [157]:
# removing s's from ends of words to be able to count singulars and plurals together
lemmatizer = WordNetLemmatizer()
kiva['all_textntags'] = [[lemmatizer.lemmatize(token) for token in token_list] for token_list in kiva['all_textntags']]

In [158]:
# removing most common english words from text
kiva['all_textntags'] = [[token for token in token_list if token not in stopwords.words('english')] for token_list in kiva['all_textntags']]

In [159]:
# rejoining list of words in each row
kiva['all_textntags'] = [' '.join(row) for row in kiva['all_textntags']]

In [160]:
kiva.to_csv('/Users/precious/dsi/Group-Project/kiva_connected_tags.csv', index=False)

In [162]:
# looking for duplicate posts
kiva['all_textntags'].isnull().sum()

0

In [163]:
Xt = kiva['all_textntags']
yt = kiva['STATUS']

In [164]:
X_train, X_test, y_train, y_test = train_test_split(Xt, yt, stratify=y, random_state=42)

In [165]:
tidf = TfidfVectorizer()
X_train_tidf = tidf.fit_transform(X_train)
X_test_tidf = tidf.transform(X_test)

In [168]:
#create a list of the model classes to test
model_list = [
    LogisticRegression(max_iter=1000),
    GradientBoostingClassifier(),
    MultinomialNB(),
    AdaBoostClassifier()
]

In [169]:
#create a list to store each model's results in a dictionary, adapted from RInsler code
classifier_list = []

for model_obj in model_list:
    #instantiate each model 
    model = model_obj

    #fit the model
    model.fit(X_train_tidf, y_train) 
  
    #create a dictionary with scores and evaluation metrics for each model
    results_dict = {}    
    results_dict['model_name'] = str(model_obj)
    results_dict['train_score'] = model.score(X_train_tidf, y_train)
    results_dict['test_score'] = model.score(X_test_tidf, y_test)
    results_dict['cv_score'] = cross_val_score(model, X_train_tidf, y_train, cv = 3).mean()
        
    #add the dictionary to the list
    classifier_list.append(results_dict)

#create a dataframe and display dictionary
results = pd.DataFrame(classifier_list)
results

Unnamed: 0,model_name,train_score,test_score,cv_score
0,LogisticRegression(max_iter=1000),0.860644,0.846112,0.842771
1,GradientBoostingClassifier(),0.839092,0.838527,0.837815
2,MultinomialNB(),0.837781,0.835216,0.834671
3,AdaBoostClassifier(),0.831822,0.832066,0.830591
