In [87]:
# Import Libraries # 
import pandas as pd 
import nltk as nltk
import  xgboost, numpy, string
import datetime as dt
import re as re

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import decomposition, ensemble
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from gensim import corpora

In [88]:
# Define Functions # 

# Timer to check execution timing for each function call # 
def timer(start_time=None):
    if not start_time:
        start_time = dt.datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((dt.datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

def stem(array):
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(w) for w in array]

def lemmetize(array):
    lemmatizer = WordNetLemmatizer() 
    return [lemmatizer.lemmatize(w) for w in array]

In [89]:
# Baseline Data Prep # 
df = pd.read_csv("data/cleaned_hotelreviews_short.csv")

# Drop rows with null comments # 
df = df.dropna(subset=['reviews'])

# Make words case-insensitive # 
df = df.apply(lambda x: x.astype(str).str.lower())

In [90]:
# Run baseline models based on Count Vector # 

# Train - Test Split # 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['reviews'], df['class'], train_size=0.8, random_state = 3000)

# Label encode target variable # 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

# Create Count Vector #  
count_vector = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vector.fit(df['reviews'])

# Transform training and validation data # 
xtrain_count = count_vector.transform(train_x)
xvalid_count = count_vector.transform(valid_x)

# XG Boost on Unprocessed Data # 
model = xgboost.XGBClassifier().fit(xtrain_count.tocsc(), train_y)
predictions = model.predict(xvalid_count.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_y)
print("XGBoost Classifier Accuracy: ", accuracy)

# Logistic Regression on Unprocessed Data # 
model = linear_model.LogisticRegression().fit(xtrain_count, train_y)
predictions = model.predict(xvalid_count)
accuracy = metrics.accuracy_score(predictions, valid_y)
print("Logistic Regression Accuracy: ", accuracy)

# SMV on Unprocessed Data # 
model = svm.SVC().fit(xtrain_count, train_y)
predictions = model.predict(xvalid_count)
accuracy = metrics.accuracy_score(predictions, valid_y)
print("SVM Accuracy: ", accuracy)

# Random Forest on Unprocessed Data # 
model  = ensemble.RandomForestClassifier().fit(xtrain_count, train_y)
predictions = model.predict(xvalid_count)
accuracy = metrics.accuracy_score(predictions, valid_y)
print("Random Forest Accuracy: ", accuracy)

# Naive Bayes on Unprocessed Data # 
model = naive_bayes.MultinomialNB().fit(xtrain_count, train_y)
predictions = model.predict(xvalid_count)
accuracy = metrics.accuracy_score(predictions, valid_y)
print("Naive Bayes Accuracy: ", accuracy)

XGBoost Classifier Accuracy:  0.945
Logistic Regression Accuracy:  0.95
SVM Accuracy:  0.76
Random Forest Accuracy:  0.915
Naive Bayes Accuracy:  0.935




In [91]:
# Full Data Prep on two best models # 

# Remove punctuations if any # 
df["words_only"] = df['reviews'].str.replace('[^\w\s]','')

# Remove stop words # 
stop_list = stopwords.words('english')
df['reviews'] = df['reviews'].apply(lambda x: [word for word in x.split() if word not in stop_list])

# Remove single words # 
df['reviews'] = df['reviews'].apply(lambda x: x if len(x) > 1 else [])

# Drop rows where reviews == [] # 
df = df[df.reviews.str.len()>0]

# Tokenization with NLTK # 
df['tokenized'] = df['words_only'].apply(nltk.word_tokenize)

# Stemming with NLTK # 
df['stemmed'] = df['tokenized'].apply(stem)

# Turn arrays for each row in df['stemmed'] into a string #: Needed to run SkLearn Lib
df['stemmed'] = df['stemmed'].apply(" ".join)

# Lemmetisation # 
lemmatizer = WordNetLemmatizer() 
df['lemmetized'] = df['tokenized'].apply(lemmetize)

# Turn arrays for each row in df['lemmetized'] into a string #: Needed to run SkLearn Lib
df['lemmetized'] = df['lemmetized'].apply(" ".join)

In [92]:
# Train - Test Split for Stemmed Words # 
train_stemx, valid_stemx, train_stemy, valid_stemy = model_selection.train_test_split(df['stemmed'], df['class'], train_size=0.8, random_state = 3000)

# Label encode target variable [STEMMED] # 
train_stemy = encoder.fit_transform(train_stemy)
valid_stemy = encoder.fit_transform(valid_stemy)

# Train - Test Split for Lemmetized Words # 
train_lemx, valid_lemx, train_lemy, valid_lemy = model_selection.train_test_split(df['lemmetized'], df['class'], train_size=0.8, random_state = 3000)

# Label encode target variable [LEMMETIZED] # 
train_lemy = encoder.fit_transform(train_lemy)
valid_lemy = encoder.fit_transform(valid_lemy)

In [93]:
# Best Models [STEMMED] # 

# Create Count Vector [STEMMED] #  
count_vector.fit(df['stemmed'])
xtrain_count_stem = count_vector.transform(train_stemx)
xvalid_count_stem = count_vector.transform(valid_stemx)

# Model 1 # 
model = _____________.fit(xtrain_count_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_count_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_count_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_count_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 2: ", accuracy)

# Ngram Level TF-IDF #: Matrix represents tf-idf scores of unigram (all terms are separate)
tfidf_unigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,1), max_features=5000)
tfidf_bigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=5000)
tfidf_trigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(3,3), max_features=5000)
tfidf_quadgram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(4,4), max_features=5000)

# Unigram [STEMMED] # 
tfidf_unigram.fit(df['stemmed'])
xtrain_tfidf_unigram_stem = tfidf_unigram.transform(train_stemx)
xvalid_tfidf_unigram_stem = tfidf_unigram.transform(valid_stemx)

# Model 1 # 
model = _____________.fit(xtrain_tfidf_unigram_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_tfidf_unigram_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_tfidf_unigram_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_tfidf_unigram_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 2: ", accuracy)

# Bigram [STEMMED] # 
tfidf_bigram.fit(df['stemmed'])
xtrain_tfidf_bigram_stem = tfidf_bigram.transform(train_stemx)
xvalid_tfidf_bigram_stem = tfidf_bigram.transform(valid_stemx)

# Model 1 # 
model = _____________.fit(xtrain_tfidf_bigram_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_tfidf_bigram_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_tfidf_bigram_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_tfidf_bigram_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 2: ", accuracy)

# Trigram [STEMMED] # 
tfidf_trigram.fit(df['stemmed'])
xtrain_tfidf_trigram_stem = tfidf_trigram.transform(train_stemx)
xvalid_tfidf_trigram_stem = tfidf_trigram.transform(valid_stemx)

# Model 1 # 
model = _____________.fit(xtrain_tfidf_trigram_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_tfidf_trigram_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_tfidf_bigram_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_tfidf_bigram_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 2: ", accuracy)

# Quadgram [STEMMED] # 
tfidf_quadgram.fit(df['stemmed'])
xtrain_tfidf_quadgram_stem = tfidf_quadgram.transform(train_stemx)
xvalid_tfidf_quadgram_stem = tfidf_quadgram.transform(valid_stemx)

# Model 1 # 
model = _____________.fit(xtrain_tfidf_quadgram_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_tfidf_quadgram_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_tfidf_quadgram_stem.tocsc(), train_stemy)
predictions = model.predict(xvalid_tfidf_quadgram_stem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_stemy)
print("Model 2: ", accuracy)

NameError: name '_____________' is not defined

In [94]:
# Best Models [LEMMETIZED] # 

# Create Count Vector [LEMMETIZED] #  
count_vector.fit(df['lemmetized'])
xtrain_count_lem = count_vector.transform(train_lemx)
xvalid_count_lem = count_vector.transform(valid_lemx)

# Model 1 # 
model = _____________.fit(xtrain_count_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_count_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_count_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_count_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 2: ", accuracy)

# Ngram Level TF-IDF #: Matrix represents tf-idf scores of unigram (all terms are separate)
tfidf_unigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,1), max_features=5000)
tfidf_bigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=5000)
tfidf_trigram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(3,3), max_features=5000)
tfidf_quadgram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(4,4), max_features=5000)

# Unigram [LEMMETIZED] # 
tfidf_unigram.fit(df['lemmetized'])
xtrain_tfidf_unigram_lem = tfidf_unigram.transform(train_lemx)
xvalid_tfidf_unigram_lem = tfidf_unigram.transform(valid_lemx)

# Model 1 # 
model = _____________.fit(xtrain_tfidf_unigram_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_tfidf_unigram_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_tfidf_unigram_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_tfidf_unigram_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 2: ", accuracy)

# Bigram [LEMMETIZED] # 
tfidf_bigram.fit(df['lemmetized'])
xtrain_tfidf_bigram_lem = tfidf_bigram.transform(train_lemx)
xvalid_tfidf_bigram_lem = tfidf_bigram.transform(valid_lemx)

# Model 1 # 
model = _____________.fit(xtrain_tfidf_bigram_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_tfidf_bigram_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_tfidf_bigram_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_tfidf_bigram_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 2: ", accuracy)

# Trigram [LEMMETIZED] # 
tfidf_trigram.fit(df['lemmetized'])
xtrain_tfidf_trigram_lem = tfidf_trigram.transform(train_lemx)
xvalid_tfidf_trigram_lem = tfidf_trigram.transform(valid_lemx)

# Model 1 # 
model = _____________.fit(xtrain_tfidf_trigram_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_tfidf_trigram_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_tfidf_trigram_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_tfidf_trigram_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 2: ", accuracy)

# Quadgram [LEMMETIZED] # 
tfidf_quadgram.fit(df['lemmetized'])
xtrain_tfidf_quadgram_lem = tfidf_quadgram.transform(train_lemx)
xvalid_tfidf_quadgram_lem = tfidf_quadgram.transform(valid_lemx)

# Model 1 # 
model = _____________.fit(xtrain_tfidf_quadgram_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_tfidf_quadgram_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 1: ", accuracy)

# Model 2 # 
model = _____________.fit(xtrain_tfidf_quadgram_lem.tocsc(), train_lemy)
predictions = model.predict(xvalid_tfidf_quadgram_lem.tocsc())
accuracy = metrics.accuracy_score(predictions, valid_lemy)
print("Model 2: ", accuracy)

NameError: name '_____________' is not defined

In [None]:
# Saving Model for Future Use, Pickle # 
import pickle 

# Save the model to disk # 
filename1 = 'model_1.sav'
pickle.dump(filename1, open(filename1, 'wb'))

filename2 = 'model_2.sav'
pickle.dump(filename2, open(filename2, 'wb'))


In [None]:
# Loading Model # 
load_model_1 = pickle.load(open(filename1, 'rb'))
load_model_2 = pickle.load(open(filename2, 'rb'))

#result = load_model_1.score(X_test, Y_test)



In [108]:
user_input = input("Enter review :")
user_input = user_input.split()

# Remove stop words # 
user_input = [word for word in user_input if word not in stop_list]

# Make words case-insensitive # 
user_input = [word.lower() for word in user_input]

# Remove punctuations if any # 
user_input = [re.sub('[^\w\s]','', word) for word in user_input]


# Spelling checker # : Replace incorrect words with correct words 
#user_input = spell_correct(user_input)

# Stemming with NLTK # 
user_input = stem(user_input)

# Lemmetisation # 
user_input = lemmetize(user_input)

# Turn arrays for each row in df['lemmetized'] into a string #: Needed to run SkLearn Lib
user_input = " ".join(user_input)
user_input = pd.Series(user_input)

print(user_input)

# Count Vector # 
user_count = count_vector.transform(user_input)

print(user_count)

user_predictions = model.predict(user_count.tocsc())

if user_predictions == 0:
    print("Negative")
else:
    print("Positive")

Enter review :hello this place sucks
0    hello place suck
dtype: object
  (0, 1308)	1


ValueError: dimension mismatch