In [1]:
#import required packages
#basics
import pandas as pd 
import numpy as np

#misc
import gc
import time
import warnings

#stats
#from scipy.misc import imread
from scipy import sparse
import scipy.stats as ss

#viz
import matplotlib.pyplot as plt
import seaborn as sns


#nlp
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords

#import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize

# Tweet tokenizer does not split at apostophes which is what we want
from nltk.tokenize import TweetTokenizer   

#FeatureEngineering
#!pip install lightgbm
#from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, decomposition, ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import  textblob
#import xgboost
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers

from textblob import TextBlob
from nltk.stem import PorterStemmer
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from textblob import Word

#settings
start_time=time.time()
color = sns.color_palette()
sns.set_style("dark")
eng_stopwords = set(stopwords.words("english"))
warnings.filterwarnings("ignore")

lem = WordNetLemmatizer()
tokenizer=TweetTokenizer()

%matplotlib inline


  import pandas.util.testing as tm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
data=pd.read_excel('C:\\Users\\hp\\Desktop\\analytixlabs\\Ml case study\\text mining bank review\\BankReviews.xlsx')

In [3]:
data.head()

Unnamed: 0,Date,Stars,Reviews,BankName
0,2017-04-10,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage
1,2017-02-10,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage
2,2017-08-21,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage
3,2017-12-17,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage
4,2016-05-27,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage


In [4]:
df=data[['Stars','Reviews']]

In [5]:
df

Unnamed: 0,Stars,Reviews
0,5,"Great job, Wyndham Capital! Each person was pr..."
1,5,Matthew Richardson is professional and helpful...
2,5,We had a past experience with Wyndham Mortgage...
3,5,We have been dealing with Brad Thomka from the...
4,5,I can't express how grateful I am for the supp...
...,...,...
500,1,\r\nI never write reviews but had to this time...
501,1,\r\nIt all started when Bob G ran a credit che...
502,1,\r\nWhat a horrible experience. We have excell...
503,1,"\r\nRep was extremely professional, friendly, ..."


In [6]:
df['Reviews'] = df['Reviews'].astype(str)
df['count_sent'] = df["Reviews"].apply(
    lambda x: len(re.findall("\n", str(x))) + 1)

#Word count in each comment:
df['count_word'] = df["Reviews"].apply(lambda x: len(str(x).split()))

#Unique word count
df['count_unique_word'] = df["Reviews"].apply(
    lambda x: len(set(str(x).split())))

#Letter count
df['count_letters'] = df["Reviews"].apply(lambda x: len(str(x)))

#Word density

df['word_density'] = df['count_letters'] / (df['count_word'] + 1)

#punctuation count
df["count_punctuations"] = df["Reviews"].apply(
    lambda x: len([c for c in str(x) if c in string.punctuation]))

#upper case words count
df["count_words_upper"] = df["Reviews"].apply(
    lambda x: len([w for w in str(x).split() if w.isupper()]))

#upper case words count
df["count_words_lower"] = df["Reviews"].apply(
    lambda x: len([w for w in str(x).split() if w.islower()]))

#title case words count
df["count_words_title"] = df["Reviews"].apply(
    lambda x: len([w for w in str(x).split() if w.istitle()]))

#Number of stopwords
df["count_stopwords"] = df["Reviews"].apply(
    lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))

#Average length of the words
df["mean_word_len"] = df["Reviews"].apply(
    lambda x: np.mean([len(w) for w in str(x).split()]))

#Number of numeric
df['numeric'] = df['Reviews'].apply(
    lambda x: len([x for x in x.split() if x.isdigit()]))

#Number of alphanumeric
df['alphanumeric'] = df['Reviews'].apply(
    lambda x: len([x for x in x.split() if x.isalnum()]))

#Number of alphabetics
df['alphabetetics'] = df['Reviews'].apply(
    lambda x: len([x for x in x.split() if x.isalpha()]))

#Number of alphabetics
df['Spaces'] = df['Reviews'].apply(
    lambda x: len([x for x in x.split() if x.isspace()]))

#Number of Words ends with
df['words_ends_with_et'] = df['Reviews'].apply(
    lambda x: len([x for x in x.lower().split() if x.endswith('et')]))

#Number of Words ends with
df['words_start_with_no'] = df['Reviews'].apply(
    lambda x: len([x for x in x.lower().split() if x.startswith('no')]))

# Count the occurences of all words
df['wordcounts'] = df['Reviews'].apply(
    lambda x: dict([[t, x.split().count(t)] for t in set(x.split())]))

pos_family = {
    'noun': ['NN', 'NNS', 'NNP', 'NNPS'],
    'pron': ['PRP', 'PRP$', 'WP', 'WP$'],
    'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    'adj': ['JJ', 'JJR', 'JJS'],
    'adv': ['RB', 'RBR', 'RBS', 'WRB']
}


# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = Reviewsblob.ReviewsBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt


df['noun_count'] = df['Reviews'].apply(lambda x: check_pos_tag(x, 'noun'))
df['verb_count'] = df['Reviews'].apply(lambda x: check_pos_tag(x, 'verb'))
df['adj_count'] = df['Reviews'].apply(lambda x: check_pos_tag(x, 'adj'))
df['adv_count'] = df['Reviews'].apply(lambda x: check_pos_tag(x, 'adv'))
df['pron_count'] = df['Reviews'].apply(lambda x: check_pos_tag(x, 'pron'))

In [7]:
df.head()

Unnamed: 0,Stars,Reviews,count_sent,count_word,count_unique_word,count_letters,word_density,count_punctuations,count_words_upper,count_words_lower,...,alphabetetics,Spaces,words_ends_with_et,words_start_with_no,wordcounts,noun_count,verb_count,adj_count,adv_count,pron_count
0,5,"Great job, Wyndham Capital! Each person was pr...",1,19,19,126,6.3,4,0,14,...,15,0,0,0,"{'Great': 1, 'move': 1, 'you!': 1, 'job,': 1, ...",0,0,0,0,0
1,5,Matthew Richardson is professional and helpful...,1,25,23,159,6.115385,4,0,20,...,21,0,0,0,"{'much': 1, 'correct': 1, 'find': 1, 'Matthew'...",0,0,0,0,0
2,5,We had a past experience with Wyndham Mortgage...,1,79,64,462,5.775,8,0,64,...,73,0,0,1,"{'to': 1, 'Wyndham!!': 1, 'highly': 1, 'Lind':...",0,0,0,0,0
3,5,We have been dealing with Brad Thomka from the...,1,108,78,605,5.550459,9,0,88,...,101,0,0,1,"{'to': 2, 'servicer': 1, 'started': 1, 'pulled...",0,0,0,0,0
4,5,I can't express how grateful I am for the supp...,1,59,47,341,5.683333,6,3,50,...,53,0,0,1,"{'am': 1, 'to': 3, 'demeanor': 1, 'during': 1,...",0,0,0,0,0


# Calculating Sentiment analysis using Textblob module

In [8]:
df['sentiment'] = df["Reviews"].apply(lambda x: TextBlob(x).sentiment.polarity)

In [9]:
df[['sentiment','Stars','Reviews']]

Unnamed: 0,sentiment,Stars,Reviews
0,0.533333,5,"Great job, Wyndham Capital! Each person was pr..."
1,0.453333,5,Matthew Richardson is professional and helpful...
2,-0.033231,5,We had a past experience with Wyndham Mortgage...
3,0.093740,5,We have been dealing with Brad Thomka from the...
4,0.125000,5,I can't express how grateful I am for the supp...
...,...,...,...
500,0.122289,1,\r\nI never write reviews but had to this time...
501,0.139815,1,\r\nIt all started when Bob G ran a credit che...
502,0.071667,1,\r\nWhat a horrible experience. We have excell...
503,0.176042,1,"\r\nRep was extremely professional, friendly, ..."


In [10]:
data.Stars.value_counts()

5    410
1     95
Name: Stars, dtype: int64

In [11]:
X = data.Reviews
y = data.Stars

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(378,)
(127,)
(378,)
(127,)


In [13]:
df.head()

Unnamed: 0,Stars,Reviews,count_sent,count_word,count_unique_word,count_letters,word_density,count_punctuations,count_words_upper,count_words_lower,...,Spaces,words_ends_with_et,words_start_with_no,wordcounts,noun_count,verb_count,adj_count,adv_count,pron_count,sentiment
0,5,"Great job, Wyndham Capital! Each person was pr...",1,19,19,126,6.3,4,0,14,...,0,0,0,"{'Great': 1, 'move': 1, 'you!': 1, 'job,': 1, ...",0,0,0,0,0,0.533333
1,5,Matthew Richardson is professional and helpful...,1,25,23,159,6.115385,4,0,20,...,0,0,0,"{'much': 1, 'correct': 1, 'find': 1, 'Matthew'...",0,0,0,0,0,0.453333
2,5,We had a past experience with Wyndham Mortgage...,1,79,64,462,5.775,8,0,64,...,0,0,1,"{'to': 1, 'Wyndham!!': 1, 'highly': 1, 'Lind':...",0,0,0,0,0,-0.033231
3,5,We have been dealing with Brad Thomka from the...,1,108,78,605,5.550459,9,0,88,...,0,0,1,"{'to': 2, 'servicer': 1, 'started': 1, 'pulled...",0,0,0,0,0,0.09374
4,5,I can't express how grateful I am for the supp...,1,59,47,341,5.683333,6,3,50,...,0,0,1,"{'am': 1, 'to': 3, 'demeanor': 1, 'during': 1,...",0,0,0,0,0,0.125


In [14]:
def clean_text(text):
    text = text.lower()   #Text into lower case.
    text = text.strip()     #Remove spaces at the beginning and at the end of the string aka. Removing whitespaces.
    text = re.sub(r' +', ' ', text)  #Replace substrings
    text = re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", text)
    return (text)

In [15]:
stop = set(nltk.corpus.stopwords.words('english'))

nltk.corpus.stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [16]:
import string
def pre_process(text):
    #text = text.str.replace('/','')                           #Replacing the / with none
    #text = text.apply(lambda x: re.sub("  "," ", x))          #Replacing double space with single space
    #text = re.sub(r"[-()\"#/@;:{}`+=~|.!?,']", "", text)      #Replacing special character with none
    #text = re.sub(r'[0-9]+', '', text)                        #Replacing numbers with none
    #text = text.apply(lambda x: " ".join(x.translate(str.maketrans('', '', string.punctuation)) for x in x.split() if x.isalpha()))
    text = text.apply(lambda x: " ".join(x for x in x.split() if x not in stop)) #Removing stop words
    #text = text.apply(lambda x: str(TextBlob(x).correct()))                      #Correct spelling corrections
    #text = text.apply(lambda x: " ".join(PorterStemmer().stem(word) for word in x.split())) #Stemming using porter stemmer
    #text = text.apply(lambda x: " ".join(stemmer_func(word) for word in x.split()))        #Stemming
    #text = text.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))   #lemmatization
    #text = text.apply(lambda x: " ".join(word for word, pos in pos_tag(x.split()) if pos not in ['NN','NNS','NNP','NNPS'])) #Removing nouns etc
    return(text)

In [17]:
X_train = X_train.apply(lambda x: clean_text(x))
X_test = X_test.apply(lambda x: clean_text(x))

In [18]:
X_train=pre_process(X_train)
X_test=pre_process(X_test)

# Vectorization (Count, Tfidf, Hashing)
    - Charter level
    - Word level
    - n-grams

In [19]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             ngram_range=(1, 1 ), 
                             min_df=5, 
                             encoding='latin-1' ,
                             max_features=800)
xtrain_count = count_vect.fit_transform(X_train)

In [20]:
xtrain_count

<378x596 sparse matrix of type '<class 'numpy.int64'>'
	with 9211 stored elements in Compressed Sparse Row format>

In [21]:
dtm=xtrain_count.toarray()

In [22]:
dtm

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
count_vect.get_feature_names()

['aaron',
 'able',
 'absolutely',
 'accept',
 'accommodating',
 'account',
 'accurate',
 'across',
 'actual',
 'adam',
 'additional',
 'advice',
 'agent',
 'agreed',
 'alex',
 'almost',
 'along',
 'already',
 'also',
 'always',
 'amazing',
 'american',
 'amount',
 'another',
 'answer',
 'answered',
 'answering',
 'answers',
 'anyone',
 'anything',
 'application',
 'apply',
 'appraisal',
 'appraiser',
 'appreciate',
 'appreciated',
 'approved',
 'around',
 'ask',
 'asked',
 'asking',
 'aspects',
 'attention',
 'available',
 'away',
 'awesome',
 'back',
 'bad',
 'balance',
 'bank',
 'banks',
 'barrett',
 'based',
 'became',
 'beginning',
 'beneficial',
 'best',
 'better',
 'beyond',
 'big',
 'bob',
 'brent',
 'broker',
 'business',
 'buy',
 'buyer',
 'buyers',
 'buying',
 'ca',
 'call',
 'called',
 'calling',
 'calls',
 'calm',
 'came',
 'cannot',
 'cant',
 'capital',
 'care',
 'causing',
 'certainly',
 'change',
 'changed',
 'check',
 'chose',
 'chris',
 'circumstances',
 'clear',
 'cli

In [24]:
dtm1=pd.DataFrame(dtm)

In [25]:
dtm1.columns=count_vect.get_feature_names()

In [26]:
dtm1.head()

Unnamed: 0,aaron,able,absolutely,accept,accommodating,account,accurate,across,actual,adam,...,working,works,would,wouldnt,writing,wrong,wyndham,year,years,yet
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Vectorization (count, tfidf) for both train & test

In [27]:
#Train
count_vect = CountVectorizer(analyzer='word',
                             token_pattern=r'\w{1,}',
                             ngram_range=(1, 1),
                             min_df=5,
                             encoding='latin-1',
                             max_features=800)

xtrain_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(xtrain_count)

#Test
#count_vect = CountVectorizer()
xtest_count = count_vect.transform(X_test)

#tfidf_transformer = TfidfTransformer()
X_test_tfidf = tfidf_transformer.transform(xtest_count)

In [69]:
xtest_count

<127x596 sparse matrix of type '<class 'numpy.int64'>'
	with 3089 stored elements in Compressed Sparse Row format>

In [28]:
dtm2=pd.DataFrame(X_train_tfidf.toarray(), columns=count_vect.get_feature_names())

In [29]:
dtm2.head(10)

Unnamed: 0,aaron,able,absolutely,accept,accommodating,account,accurate,across,actual,adam,...,working,works,would,wouldnt,writing,wrong,wyndham,year,years,yet
0,0.0,0.0,0.0,0.133259,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.123737,0.0,0.0,0.0,0.0,0.09625,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.194406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.130702,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.121363,0.0,0.0,0.0,0.0,0.094403,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.084774,0.175643,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.256938,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.068074,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.228774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word',
                                   token_pattern='\w{1,}',
                                   ngram_range=(1, 2),
                                   max_features=800)
tfidf_vect_ngram.fit(df['Reviews'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(X_train)
xtest_tfidf_ngram = tfidf_vect_ngram.transform(X_test)

In [31]:
xtrain_tfidf_ngram

<378x800 sparse matrix of type '<class 'numpy.float64'>'
	with 8051 stored elements in Compressed Sparse Row format>

In [32]:
xtest_tfidf_ngram

<127x800 sparse matrix of type '<class 'numpy.float64'>'
	with 2835 stored elements in Compressed Sparse Row format>

In [33]:
dtm3=pd.DataFrame(xtrain_tfidf_ngram.toarray(), columns= tfidf_vect_ngram.get_feature_names())

In [34]:
dtm3

Unnamed: 0,1,10,2,3,4,5,a,a few,a great,a home,...,would recommend,wouldn,wouldn t,wyndham,year,years,you,you are,you can,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.112647,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.109433,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [35]:
def sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

In [36]:
sentiment=X_train.apply(lambda x:sentiment(x))

In [37]:
a=pd.concat([X_train,sentiment],axis=1)
a.columns=['review','sentimnet']
a

Unnamed: 0,review,sentimnet
501,started bob g ran credit check without knowled...,positive
172,great website knowledgeable responsive always ...,positive
80,pleasure working robert first call kept well i...,positive
46,started bob g ran credit check without knowled...,positive
318,agree star review easy clear work hello knowle...,positive
...,...,...
255,vanessa word fantastic every step way perfect ...,positive
72,fast easy leave comforts home steven shatz fri...,positive
396,friend mine told refinanced house unbelievable...,positive
235,worked jon barrett processing refinance loan q...,positive


In [38]:
a.sentimnet.value_counts()

positive    338
negative     28
neutral      12
Name: sentimnet, dtype: int64

In [39]:
#positive reviews
a[a.sentimnet=='positive']

Unnamed: 0,review,sentimnet
501,started bob g ran credit check without knowled...,positive
172,great website knowledgeable responsive always ...,positive
80,pleasure working robert first call kept well i...,positive
46,started bob g ran credit check without knowled...,positive
318,agree star review easy clear work hello knowle...,positive
...,...,...
255,vanessa word fantastic every step way perfect ...,positive
72,fast easy leave comforts home steven shatz fri...,positive
396,friend mine told refinanced house unbelievable...,positive
235,worked jon barrett processing refinance loan q...,positive


In [40]:
#negative reviews
a[a.sentimnet=='negative']

Unnamed: 0,review,sentimnet
93,use reliance first capitoldo let kenneth watso...,negative
364,jason chandler team worked hard ensure able pu...,negative
98,based results give star initial conversations ...,negative
91,team aware critical information delayed refina...,negative
59,new federal regulations made tedious time cons...,negative
157,applied three different lenders two came acros...,negative
411,aweful experience terms service followup overa...,negative
88,wesley white worked tirelessly patiently get l...,negative
492,miserable experience screwed everything imagin...,negative
401,bad,negative


In [41]:
#neutral reviews
a[a.sentimnet=='neutral']

Unnamed: 0,review,sentimnet
39,closing process va loan went smoothly loan ser...,neutral
236,rate relockin rule followed,neutral
286,bob triumph beat lenders rates prompt approval...,neutral
70,informative responsive,neutral
408,lending tree matching companies dont offer pro...,neutral
417,called asked city nh looking told might consid...,neutral
219,closing process va loan went smoothly loan ser...,neutral
97,person spoke kept interrupting telling line ra...,neutral
60,informative responsive,neutral
74,teddy represents company well communicative pa...,neutral


# Create user defined function for train the models

In [42]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid,
                valid_y):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    return metrics.accuracy_score(classifier.predict(feature_vector_train),
                                  label), metrics.accuracy_score(
                                      predictions, valid_y)

# Building different models with different vectors

In [43]:
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  1   5]
 [ 70 308]]


In [44]:

from imblearn.over_sampling import RandomOverSampler

In [45]:
ros = RandomOverSampler(random_state=123)

X_train_tfidf_os, y_train_tfidf_os = ros.fit_sample(X_train_tfidf, y_train)

X_train_cnt_os, y_train_cnt_os = ros.fit_sample(xtrain_count, y_train)

X_train_tfidf_ngram_os, y_train_tfidf_ngram_os = ros.fit_sample(xtrain_tfidf_ngram, y_train)

unique_elements, counts_elements = np.unique(y_train_tfidf_os, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  1   5]
 [308 308]]


In [46]:
#Naive Bayes
# Naive Bayes on TF-IDF
accuracy_L1 = train_model(naive_bayes.MultinomialNB(), X_train_tfidf_os, y_train_tfidf_os, X_test_tfidf, y_test)
print("NB  for L1, Count Vectors: ", accuracy_L1)

# Naive Bayes on Word Level TF IDF Vectors
accuracy_L1 = train_model(naive_bayes.MultinomialNB(), X_train_cnt_os, y_train_cnt_os, xtest_count, y_test)
print("NB  for L1, WordLevel TF-IDF: ", accuracy_L1)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy_L1 = train_model(naive_bayes.MultinomialNB(), X_train_tfidf_ngram_os, y_train_tfidf_ngram_os, xtest_tfidf_ngram, y_test)
print("NB  for L1, N-Gram Vectors: ", accuracy_L1)

NB  for L1, Count Vectors:  (0.9756493506493507, 0.952755905511811)
NB  for L1, WordLevel TF-IDF:  (0.9724025974025974, 0.9606299212598425)
NB  for L1, N-Gram Vectors:  (0.9675324675324676, 0.9448818897637795)


In [47]:
#Logistic Regression
# Logistic Regression on Count Vectors and TF-IDF
accuracy_L1 = train_model(LogisticRegression(), X_train_tfidf_os, y_train_tfidf_os, X_test_tfidf, y_test)
print("LR  for L1, Count Vectors: ", accuracy_L1)



# Logistic Regression on Word Level TF IDF Vectors
accuracy_L1 = train_model(LogisticRegression(), X_train_cnt_os, y_train_cnt_os, xtest_count, y_test)
print("LR  for L1, WordLevel TF-IDF: ", accuracy_L1)



# Logistic Regression on Ngram Level TF IDF Vectors
accuracy_L1 = train_model(LogisticRegression(), X_train_tfidf_ngram_os, y_train_tfidf_ngram_os, xtest_tfidf_ngram, y_test)
print("LR  for L1, N-Gram Vectors: ", accuracy_L1)

LR  for L1, Count Vectors:  (0.9837662337662337, 0.968503937007874)
LR  for L1, WordLevel TF-IDF:  (0.9983766233766234, 0.9763779527559056)
LR  for L1, N-Gram Vectors:  (0.9756493506493507, 0.9606299212598425)


In [48]:
#Linear SVC
# Linear SVC on Count Vectors and TF-IDF
accuracy_L1 = train_model(SVC(), X_train_tfidf_os, y_train_tfidf_os,
                          X_test_tfidf, y_test)
print("SVC  for L1, Count Vectors: ", accuracy_L1)

# Linear SVC on Word Level TF IDF Vectors
accuracy_L1 = train_model(SVC(), X_train_cnt_os, y_train_cnt_os, xtest_count,
                          y_test)
print("SVC  for L1, WordLevel TF-IDF: ", accuracy_L1)

# Linear SVC on Ngram Level TF IDF Vectors
accuracy_L1 = train_model(SVC(), X_train_tfidf_ngram_os,
                          y_train_tfidf_ngram_os, xtest_tfidf_ngram, y_test)
print("SVC  for L1, N-Gram Vectors: ", accuracy_L1)

SVC  for L1, Count Vectors:  (0.9983766233766234, 0.937007874015748)
SVC  for L1, WordLevel TF-IDF:  (0.9756493506493507, 0.9448818897637795)
SVC  for L1, N-Gram Vectors:  (0.9983766233766234, 0.9448818897637795)


# we are getting best score from logistic regression. so we will chose this for predictions


In [55]:
lgm=LogisticRegression().fit( X_train_cnt_os, y_train_cnt_os)

In [56]:
lgm.score( X_train_cnt_os, y_train_cnt_os)

0.9983766233766234

In [57]:
test_pred = lgm.predict(xtest_count)

In [58]:
test_pred

array([5, 5, 1, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 1, 5,
       5, 1, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 5, 5, 1, 5, 5, 5,
       1, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5,
       1, 5, 5, 5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 1,
       5, 5, 5, 5, 5, 5, 5, 1, 1, 5, 5, 5, 1, 5, 5, 1, 5], dtype=int64)

In [65]:
pred = pd.DataFrame( { 'actual':  y_test,
                                'predicted':test_pred} )

In [66]:
pred

Unnamed: 0,actual,predicted
307,5,5
343,5,5
47,1,1
67,5,5
361,5,5
...,...,...
41,1,1
360,5,5
289,5,5
497,1,1


In [67]:
from sklearn.metrics import accuracy_score
print(metrics.accuracy_score(pred.actual, pred.predicted))
print(metrics.roc_auc_score(pred.actual, pred.predicted))

0.9763779527559056
0.94


In [68]:
metrics.confusion_matrix(y_pred.predicted, y_pred.actual)

array([[ 22,   0],
       [  3, 102]], dtype=int64)

# we are getting accuracy of 97% in the predictions from the logistic regression