In [77]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import decomposition, ensemble
from spellchecker import SpellChecker
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from gensim import corpora

import pandas as pd 
import nltk as nltk
import  xgboost, numpy, string
import datetime as dt
import re as re
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers

##### Functions

In [78]:
# Timer to check execution timing for each function call # 
def timer(start_time=None):
    if not start_time:
        start_time = dt.datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((dt.datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

# Spelling checker # 
def spell_correct(array):
    spell = SpellChecker()
    for i in range(len(array)):
        array[i] = spell.correction(array[i])
    return array
    
def stem(array):
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(w) for w in array]

def lemmetize(array):
    lemmatizer = WordNetLemmatizer() 
    return [lemmatizer.lemmatize(w) for w in array]

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # Fit the training dataset onto classifier #
    classifier.fit(feature_vector_train, label)
    # Predict the labels on validation dataset #
    predictions = classifier.predict(feature_vector_valid)
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    return metrics.accuracy_score(predictions, valid_y)

def predictions(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # Fit the training dataset onto classifier #
    classifier.fit(feature_vector_train, label)
    # Predict the labels on validation dataset #
    predictions = classifier.predict(feature_vector_valid)
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    return [classifier, predictions]

##### Data Prep 

In [81]:
# Data Prep# 
df = pd.read_csv("data/cleaned_hotelreviews_short.csv")

# Drop rows with null comments # 
df = df.dropna(subset=['reviews'])

# Remove stop words # 
stop_list = stopwords.words('english')
df['reviews'] = df['reviews'].apply(lambda x: [word for word in x.split() if word not in stop_list])

# Remove single words # 
df['reviews'] = df['reviews'].apply(lambda x: x if len(x) > 1 else [])

# Drop rows where reviews == [] # 
df = df[df.reviews.str.len()>0]

# Make words case-insensitive # 
df = df.apply(lambda x: x.astype(str).str.lower())

# Remove punctuations if any # 
df["words_only"] = df['reviews'].str.replace('[^\w\s]','')

# Tokenization with NLTK # 
start_time = timer(None)
df['tokenized'] = df['words_only'].apply(nltk.word_tokenize)
print("\nTokenizer: ",end="")
timer(start_time)

# Spelling checker # : Replace incorrect words with correct words 
#start_time = timer(None)
#df['corrected'] = df['tokenized'].apply(spell_correct)
#print("\nSpelling Correction: ",end="")
#timer(start_time)

# Stemming with NLTK # 
start_time = timer(None)
df['stemmed'] = df['tokenized'].apply(stem)
print("\nStemming: ",end="")
timer(start_time)

# Lemmetisation # 
start_time = timer(None)
lemmatizer = WordNetLemmatizer() 
df['lemmetized'] = df['stemmed'].apply(lemmetize)
print("\nLemmetization: ",end="")
timer(start_time)

# Turn arrays for each row in df['lemmetized'] into a string #: Needed to run SkLearn Lib
df['lemmetized'] = df['lemmetized'].apply(" ".join)

# Train - Test Split # 
start_time = timer(None)
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['lemmetized'], df['class'])
print("\nTrain-Test Split: ",end="")
timer(start_time)

# Label encode target variable to run ML models # 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)


Tokenizer: 
 Time taken: 0 hours 0 minutes and 0.08 seconds.

Stemming: 
 Time taken: 0 hours 0 minutes and 0.16 seconds.

Lemmetization: 
 Time taken: 0 hours 0 minutes and 0.03 seconds.

Train-Test Split: 
 Time taken: 0 hours 0 minutes and 0.0 seconds.


##### Count vectorisation : Create vectors as features 
    # Every row represents a review 
    # Every column represents a term from the corpus 
    # Every cell represents the frequency count of the particular term in the particular review 

In [83]:
start_time = timer(None)

# Create count vectoriser object # 
count_vector = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vector.fit(df['lemmetized'])

# Transform training and validation data # 
xtrain_count = count_vector.transform(train_x)
xvalid_count = count_vector.transform(valid_x)

print("Count Vectors:", end="")
timer(start_time)

Count Vectors:
 Time taken: 0 hours 0 minutes and 0.02 seconds.


### Convert to TF-IDF Vectors 


In [84]:
start_time = timer(None)

# Word Level TF-IDF #: Matrix represents tf-idf scores of every term in each review 
tfidf_word = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_word.fit(df['lemmetized'])
xtrain_tfidf_word = tfidf_word.transform(train_x)
xvalid_tfidf_word = tfidf_word.transform(valid_x)

print("Word Level TF-IDF", end="")
timer(start_time)

Word Level TF-IDF
 Time taken: 0 hours 0 minutes and 0.02 seconds.


In [85]:
start_time = timer(None)

# Unigram Level TF-IDF #: Matrix represents tf-idf scores of unigram (all terms are separate)
tfidf_unigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,1), max_features=5000)
tfidf_unigram.fit(df['lemmetized'])
xtrain_tfidf_unigram = tfidf_unigram.transform(train_x)
xvalid_tfidf_unigram = tfidf_unigram.transform(valid_x)

# Bigram Level TF-IDF #: Terms are grouped together by twos 
tfidf_bigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=5000)
tfidf_bigram.fit(df['lemmetized'])
xtrain_tfidf_bigram = tfidf_bigram.transform(train_x)
xvalid_tfidf_bigram = tfidf_bigram.transform(valid_x)

# Trigram Level TF-IDF #: Terms are grouped together in threes 
tfidf_trigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(3,3), max_features=5000)
tfidf_trigram.fit(df['lemmetized'])
xtrain_tfidf_trigram = tfidf_trigram.transform(train_x)
xvalid_tfidf_trigram = tfidf_trigram.transform(valid_x)

print("Unigram, Bigram, Trigram TF-IDF:", end="")
timer(start_time)

Unigram, Bigram, Trigram TF-IDF:
 Time taken: 0 hours 0 minutes and 0.08 seconds.


In [86]:
#print(xtrain_tfidf_unigram)

In [87]:
start_time = timer(None)

# Character Level TF-IDF #: Matrix represents tf-idf scores of character level uni, bi & tri-gram of all reviews
tfidf_char = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(1,3), max_features=5000)
tfidf_char.fit(df['lemmetized'])
xtrain_tfidf_char =  tfidf_char.transform(train_x) 
xvalid_tfidf_char =  tfidf_char.transform(valid_x) 

print("Character Level TF-IDF:", end="")
timer(start_time)

Character Level TF-IDF:
 Time taken: 0 hours 0 minutes and 0.13 seconds.


## XGBoost (Boosting)

In [88]:
# XGB Model # 
model = xgboost.XGBClassifier(max_depth=7,
                           min_child_weight=1,
                           learning_rate=0.2,
                           n_estimators=500,
                           silent=True,
                           objective='binary:logistic',
                           gamma=0,
                           max_delta_step=0,
                           subsample=1,
                           colsample_bytree=1,
                           colsample_bylevel=1,
                           reg_alpha=0,
                           reg_lambda=0,
                           scale_pos_weight=1,
                           seed=1,
                           missing=None,
                           tree_method='exact',
                           nthread=4)

# Params for hyperparameter grid search # 
params = {
        'max_depth': [5,7],
        'min_child_weight': [1, 5],
        'gamma': [0.5, 1]
        }

In [89]:
start_time = timer(None)
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)
timer(start_time)

start_time = timer(None)
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_word.tocsc(), train_y, xvalid_tfidf_word.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)
timer(start_time)

start_time = timer(None)
# Extereme Gradient Boosting on Unigram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_unigram.tocsc(), train_y, xvalid_tfidf_unigram.tocsc())
print ("Xgb, Unigram TF-IDF: ", accuracy)
timer(start_time)

start_time = timer(None)
# Extereme Gradient Boosting on Bigram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_bigram.tocsc(), train_y, xvalid_tfidf_bigram.tocsc())
print ("Xgb, Bigram TF-IDF: ", accuracy)
timer(start_time)

start_time = timer(None)
# Extereme Gradient Boosting on Trigram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_trigram.tocsc(), train_y, xvalid_tfidf_trigram.tocsc())
print ("Xgb, Trigram TF-IDF: ", accuracy)
timer(start_time)

start_time = timer(None)
# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_char.tocsc(), train_y, xvalid_tfidf_char.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)
timer(start_time)

Xgb, Count Vectors:  0.9757085020242915

 Time taken: 0 hours 0 minutes and 0.74 seconds.
Xgb, WordLevel TF-IDF:  0.9676113360323887

 Time taken: 0 hours 0 minutes and 0.76 seconds.
Xgb, Unigram TF-IDF:  0.9676113360323887

 Time taken: 0 hours 0 minutes and 0.75 seconds.
Xgb, Bigram TF-IDF:  0.8987854251012146

 Time taken: 0 hours 0 minutes and 1.82 seconds.
Xgb, Trigram TF-IDF:  0.8137651821862348

 Time taken: 0 hours 0 minutes and 0.62 seconds.
Xgb, CharLevel Vectors:  0.97165991902834

 Time taken: 0 hours 0 minutes and 1.62 seconds.


In [90]:
model = xgboost.XGBClassifier().fit(xtrain_count.tocsc(), train_y)

predictions = model.predict(xvalid_count.tocsc())

# Confusion matrix # 
confusion = confusion_matrix(valid_y, predictions)
class_report = classification_report(valid_y, predictions)

print('\nClasification report:\n', class_report)
print('\nConfussion matrix:\n', confusion)


Clasification report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       199
           1       1.00      0.88      0.93        48

    accuracy                           0.98       247
   macro avg       0.99      0.94      0.96       247
weighted avg       0.98      0.98      0.98       247


Confussion matrix:
 [[199   0]
 [  6  42]]


In [94]:
user_input = input("Enter review :")
user_input = user_input.split()

# Remove stop words # 
user_input = [word for word in user_input if word not in stop_list]

# Make words case-insensitive # 
user_input = [word.lower() for word in user_input]

# Remove punctuations if any # 
user_input = [re.sub('[^\w\s]','', word) for word in user_input]

# Spelling checker # : Replace incorrect words with correct words 
user_input = spell_correct(user_input)

# Stemming with NLTK # 
user_input = stem(user_input)

# Lemmetisation # 
user_input = lemmetize(user_input)

# Turn arrays for each row in df['lemmetized'] into a string #: Needed to run SkLearn Lib
user_input = " ".join(user_input)
user_input = pd.Series(user_input)

user_valid = valid_x.append(user_input, ignore_index=False).rename("lemmetized")
user_valid = user_valid.iloc[-1:]

#print(user_valid)

# Count Vector # 
user_count = count_vector.transform(user_input)

user_predictions = model.predict(user_count.tocsc())

if user_predictions == 0:
    print("Negative")
else:
    print("Positive")

Enter review :this area is really bad
Negative


## Hyperparameter Grid Search with 3 Fold Validation

In [13]:
start_time = timer(None)

# 3 fold validation with hyperparameter grid search #
folds = 3
param_comb = 3

kf = KFold(n_splits=folds, shuffle = False, random_state = None)

grid = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', n_jobs=3, cv=kf.split(xtrain_count,train_y), 
                    verbose=3 )

start_time = timer(None)

grid.fit(xtrain_count,train_y)

print('\n Best estimator:')
print(grid.best_estimator_)
print('\n Best score:')
print(grid.best_score_)
print('\n Best parameters:')
print(grid.best_params_)

timer(start_time)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  24 out of  24 | elapsed:   22.3s finished



 Best estimator:
XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.5, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.2, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=1, reg_alpha=0,
              reg_lambda=0, scale_pos_weight=1, seed=1, silent=True,
              subsample=1, tree_method='exact', validate_parameters=False,
              verbosity=None)

 Best score:
0.9546061415220294

 Best parameters:
{'gamma': 0.5, 'max_depth': 7, 'min_child_weight': 1}

 Time taken: 0 hours 0 minutes and 23.92 seconds.


### LDA Model 

In [14]:
# Train LDA Model # 
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vector.get_feature_names()

# View Topic Models # 
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

topic_summaries

['might suffici bit date look centr although clean nice bigger',
 'accessori toilet rate also bit breakfast room far includ seem',
 'bit far expens citi center centr away room station tram',
 'room check staff water tv recept two like bottl london',
 'air swim rooftop sunbath barra rel condition doubletre compet option',
 'screen bike bit user away center rent tv perfect take',
 'around edg knock tram environ renov produc forc complimentari will',
 'ad touch hotel matter bill bar even approach sm station',
 'onsit nice bar would back bathroom accessori gull fahrenheit summer',
 'bar would nice area plenti anyway baggi premis soap slightli',
 'big tea facil thank thought book even barcelona hotel stay',
 'bath would nice bathroom room shower addit expect made relax',
 'hotel beauti room staff great help breakfast locat stay bed',
 'factori often impress kind hotel small choic give one person',
 'bit far cost attract main high correspond tool may complex',
 'better would bigger room brea

### NAIVE BAYES 

In [15]:
# Assumes indepedence among predictors # 
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Unigram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram)
print ("NB, Uni-Gram Vectors: ", accuracy)

# Naive Bayes on Bigram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram)
print ("NB, Bi-Gram Vectors: ", accuracy)

# Naive Bayes on Trigram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_trigram, train_y, xvalid_tfidf_trigram)
print ("NB, Tri-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_char, train_y, xvalid_tfidf_char)
print ("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.94
NB, WordLevel TF-IDF:  0.904
NB, Uni-Gram Vectors:  0.904
NB, Bi-Gram Vectors:  0.82
NB, Tri-Gram Vectors:  0.808
NB, CharLevel Vectors:  0.852


## Linear Classifier (Logistic Regression) 

In [16]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print ("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Unigram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram)
print ("LR, Uni-Gram Vectors: ", accuracy)

# Linear Classifier on Bigram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram)
print ("LR, Bi-Gram Vectors: ", accuracy)

# Linear Classifier on Trigram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_trigram, train_y, xvalid_tfidf_trigram)
print ("LR, Tri-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_char, train_y, xvalid_tfidf_char)
print ("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.944
LR, WordLevel TF-IDF:  0.916
LR, Uni-Gram Vectors:  0.916
LR, Bi-Gram Vectors:  0.824
LR, Tri-Gram Vectors:  0.808
LR, CharLevel Vectors:  0.92




## SVM Model

In [17]:
start_time = timer(None)
# SVM on Count Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)
print ("SVM, Count Level Vectors: ", accuracy)
timer(start_time)

SVM, Count Level Vectors:  0.808

 Time taken: 0 hours 0 minutes and 0.03 seconds.




In [18]:
# Supervised ML Algo that extracts best possible hyper-plane/ line that segregates the two classes #

# SVM on Word Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("SVM, Word Level Vectors: ", accuracy)

# SVM on Unigram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram)
print ("SVM, Uni-Gram Vectors: ", accuracy)

# SVM on Bigram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram)
print ("SVM, Bi-Gram Vectors: ", accuracy)

# SVM on Trigram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_trigram, train_y, xvalid_tfidf_trigram)
print ("SVM, Tri-Gram Vectors: ", accuracy)
       
# SVM on Char Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_char, train_y, xvalid_tfidf_char)
print ("SVM, CharLevel Vectors: ", accuracy)

SVM, Word Level Vectors:  0.804
SVM, Uni-Gram Vectors:  0.804
SVM, Bi-Gram Vectors:  0.804
SVM, Tri-Gram Vectors:  0.804




SVM, CharLevel Vectors:  0.804


## Random Forest (Bagging)

In [19]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_word, train_y, xvalid_tfidf_word)
print ("RF, Word Level Vectors: ", accuracy)

# RF on Unigram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram)
print ("RF, Uni-gram TF-IDF: ", accuracy)

# RF on Bigram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram)
print ("RF, Bi-gram TF-IDF: ", accuracy)

# RF on Trigram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_trigram, train_y, xvalid_tfidf_trigram)
print ("RF, Tri-gram TF-IDF: ", accuracy)

# RF on Char Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_char, train_y, xvalid_tfidf_char)
print ("RF, CharLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.94
RF, Word Level Vectors:  0.936
RF, Uni-gram TF-IDF:  0.92
RF, Bi-gram TF-IDF:  0.9




RF, Tri-gram TF-IDF:  0.816
RF, CharLevel TF-IDF:  0.924




##  Feature Importance XGB 
###### Can't plot important features if we only have one train column 

In [None]:
# Additional code for future reference # 
# Plot graph showing importance features, max = 50 features # 
importance = xgboost.XGBClassifier().feature_importances_ 
importance = pd.Series(importance, index=xtrain_count.columns)
importance.nlargest(50).plot(kind='barh')

# Select important features #
importance.sort_values(axis=0,ascending=False, inplace=True)
selected_features = importance.index[0:30].tolist()