# Sentiment Analysis project

In [294]:
#importing dataset and libraries

In [295]:
import numpy as np
import pandas as pd

In [296]:
df = pd.read_csv('hotel_review_train.csv')

In [297]:
df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [298]:
df.iloc[1,1]

"I stayed at the Crown Plaza April -- - April --, ----. The staff was friendly and attentive. The elevators are tiny (about -' by -'). The food in the restaurant was delicious but priced a little on the high side. Of course this is Washington DC. There is no pool and little for children to do. My room on the fifth floor had two comfortable beds and plenty of space for one person. The TV is a little small by todays standards with a limited number of channels. There was a small bit of mold in the bathtub area that could have been removed with a little bleach. It appeared the carpets were not vacummed every day. I reported a light bulb was burned out. It was never replaced. Ice machines are on the odd numbered floors, but the one on my floor did not work. I encountered some staff in the elevator one evening and I mentioned the ice machine to them. Severel hours later a maid appeared at my door with ice and two mints. I'm not sure how they knew what room I was in. That was a little unnervi

In [299]:
df.drop(columns=['User_ID','Browser_Used','Device_Used'],inplace = True)

In [300]:
df.head()

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,not happy
1,I stayed at the Crown Plaza April -- - April -...,not happy
2,I booked this hotel through Hotwire at the low...,not happy
3,Stayed here with husband and sons on the way t...,happy
4,My girlfriends and I stayed here to celebrate ...,not happy


In [301]:
len(df)

38932

In [302]:
df.shape

(38932, 2)

In [303]:
df['Is_Response'].value_counts()

happy        26521
not happy    12411
Name: Is_Response, dtype: int64

# Data cleaning and processing

In [304]:
#renaming the columns
df.rename(columns= {'Description': 'Review', 'Is_Response':'label'},inplace=True)

In [305]:
df.head()

Unnamed: 0,Review,label
0,The room was kind of clean but had a VERY stro...,not happy
1,I stayed at the Crown Plaza April -- - April -...,not happy
2,I booked this hotel through Hotwire at the low...,not happy
3,Stayed here with husband and sons on the way t...,happy
4,My girlfriends and I stayed here to celebrate ...,not happy


In [306]:
# checking any null values
df.isnull().sum()

Review    0
label     0
dtype: int64

In [307]:
blanks = []

for i,rev,lab in df.itertuples():
    if type(rev) == str:
        if rev.isspace():
            blanks.append(i)

In [308]:
blanks # not any empty string

[]

In [309]:
mapdict = {'not happy': 'neg', 'happy': 'pos'}

df['label'] = df['label'].map(mapdict)

In [310]:
df.head()

Unnamed: 0,Review,label
0,The room was kind of clean but had a VERY stro...,neg
1,I stayed at the Crown Plaza April -- - April -...,neg
2,I booked this hotel through Hotwire at the low...,neg
3,Stayed here with husband and sons on the way t...,pos
4,My girlfriends and I stayed here to celebrate ...,neg


# cleaning the text using regular expressions

In [311]:
df['Review'][0]

"The room was kind of clean but had a VERY strong smell of dogs. Generally below average but ok for a overnight stay if you're not too fussy. Would consider staying again if the price was right. Breakfast was free and just about better than nothing."

In [312]:
# removing punctuations
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [313]:
def remove_punctuation(text):
    text_nopunct = "".join([i for i in text if i not in string.punctuation]) # join method returns string
    return text_nopunct

In [314]:
df['no_punct'] = df['Review'].apply(lambda x: remove_punctuation(x))

In [315]:
df.head()

Unnamed: 0,Review,label,no_punct
0,The room was kind of clean but had a VERY stro...,neg,The room was kind of clean but had a VERY stro...
1,I stayed at the Crown Plaza April -- - April -...,neg,I stayed at the Crown Plaza April April Th...
2,I booked this hotel through Hotwire at the low...,neg,I booked this hotel through Hotwire at the low...
3,Stayed here with husband and sons on the way t...,pos,Stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...,neg,My girlfriends and I stayed here to celebrate ...


The join() method is a string method and returns a string in which the elements of sequence have been joined by str separator.

The join() method takes iterable – objects capable of returning its members one at a time. Some examples are List, Tuple, String, Dictionary and Set

In [316]:
# removing numbers and converting the text into lower case
#re.sub() is used to replace substrings in strings
import re

def clean_text(text):
    text = text.lower()
    text = re.sub("\d+","",text)
    return text

In [317]:
df['cleaned_text'] = df['no_punct'].apply(lambda x: clean_text(x))

In [318]:
df.head()

Unnamed: 0,Review,label,no_punct,cleaned_text
0,The room was kind of clean but had a VERY stro...,neg,The room was kind of clean but had a VERY stro...,the room was kind of clean but had a very stro...
1,I stayed at the Crown Plaza April -- - April -...,neg,I stayed at the Crown Plaza April April Th...,i stayed at the crown plaza april april th...
2,I booked this hotel through Hotwire at the low...,neg,I booked this hotel through Hotwire at the low...,i booked this hotel through hotwire at the low...
3,Stayed here with husband and sons on the way t...,pos,Stayed here with husband and sons on the way t...,stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...,neg,My girlfriends and I stayed here to celebrate ...,my girlfriends and i stayed here to celebrate ...


In [319]:
# removing unneccessary column and renaming it
df.drop(columns=['Review','no_punct'],inplace =True)

In [320]:
df.head()

Unnamed: 0,label,cleaned_text
0,neg,the room was kind of clean but had a very stro...
1,neg,i stayed at the crown plaza april april th...
2,neg,i booked this hotel through hotwire at the low...
3,pos,stayed here with husband and sons on the way t...
4,neg,my girlfriends and i stayed here to celebrate ...


In [321]:
#renaming the column
df.rename(columns= {'cleaned_text':'Review'},inplace =True)

In [322]:
df.head()

Unnamed: 0,label,Review
0,neg,the room was kind of clean but had a very stro...
1,neg,i stayed at the crown plaza april april th...
2,neg,i booked this hotel through hotwire at the low...
3,pos,stayed here with husband and sons on the way t...
4,neg,my girlfriends and i stayed here to celebrate ...


# tokenizing and removing stop words using spacy


In [323]:
# tokenizing and removing stop words using spacy
from spacy.lang.en import English

nlp = English()

def token_func(text):
    my_doc = nlp(text)
    #creating list of word tokens
   
    token_list = [token.text for token in my_doc]
        
    return token_list
    



In [324]:
df['with_tokens'] = df['Review'].apply(lambda x: token_func(x))

In [325]:
df.head()

Unnamed: 0,label,Review,with_tokens
0,neg,the room was kind of clean but had a very stro...,"[the, room, was, kind, of, clean, but, had, a,..."
1,neg,i stayed at the crown plaza april april th...,"[i, stayed, at, the, crown, plaza, april, , ..."
2,neg,i booked this hotel through hotwire at the low...,"[i, booked, this, hotel, through, hotwire, at,..."
3,pos,stayed here with husband and sons on the way t...,"[stayed, here, with, husband, and, sons, on, t..."
4,neg,my girlfriends and i stayed here to celebrate ...,"[my, girlfriends, and, i, stayed, here, to, ce..."


In [326]:
# removing the stopwords
from spacy.lang.en.stop_words import STOP_WORDS

def removing_stopwords(text):
    list_without_stopwords = [i for i in text if nlp.vocab[i].is_stop == False]
    return list_without_stopwords

In [327]:
# first review without removing stopwords
" ".join(df['with_tokens'][0])

'the room was kind of clean but had a very strong smell of dogs generally below average but ok for a overnight stay if you re not too fussy would consider staying again if the price was right breakfast was free and just about better than nothing'

In [328]:
df['without_stopwords'] = df['with_tokens'].apply(lambda x:removing_stopwords(x))

In [329]:
df.head()

Unnamed: 0,label,Review,with_tokens,without_stopwords
0,neg,the room was kind of clean but had a very stro...,"[the, room, was, kind, of, clean, but, had, a,...","[room, kind, clean, strong, smell, dogs, gener..."
1,neg,i stayed at the crown plaza april april th...,"[i, stayed, at, the, crown, plaza, april, , ...","[stayed, crown, plaza, april, , april, , s..."
2,neg,i booked this hotel through hotwire at the low...,"[i, booked, this, hotel, through, hotwire, at,...","[booked, hotel, hotwire, lowest, price, find, ..."
3,pos,stayed here with husband and sons on the way t...,"[stayed, here, with, husband, and, sons, on, t...","[stayed, husband, sons, way, alaska, cruise, l..."
4,neg,my girlfriends and i stayed here to celebrate ...,"[my, girlfriends, and, i, stayed, here, to, ce...","[girlfriends, stayed, celebrate, th, birthdays..."


In [330]:
# first review after removing stopwords
" ".join(df['without_stopwords'][0])

'room kind clean strong smell dogs generally average ok overnight stay fussy consider staying price right breakfast free better'

# lemmatization

In contrast to stemming, lemmatization looks beyond word reduction, and considers a language's full vocabulary to apply a morphological analysis to words. The lemma of 'was' is 'be' and the lemma of 'mice' is 'mouse'. Further, the lemma of 'meeting' might be 'meet' or 'meeting' depending on its use in a sentence.

In [331]:
import nltk
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sonys\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [332]:
def lemmatization(token_text):
    text = [wn.lemmatize(word) for word in token_text]
    return text

In [333]:
df['lemmatized_list'] = df['without_stopwords'].apply(lambda x:lemmatization(x))

In [334]:
df.head()

Unnamed: 0,label,Review,with_tokens,without_stopwords,lemmatized_list
0,neg,the room was kind of clean but had a very stro...,"[the, room, was, kind, of, clean, but, had, a,...","[room, kind, clean, strong, smell, dogs, gener...","[room, kind, clean, strong, smell, dog, genera..."
1,neg,i stayed at the crown plaza april april th...,"[i, stayed, at, the, crown, plaza, april, , ...","[stayed, crown, plaza, april, , april, , s...","[stayed, crown, plaza, april, , april, , s..."
2,neg,i booked this hotel through hotwire at the low...,"[i, booked, this, hotel, through, hotwire, at,...","[booked, hotel, hotwire, lowest, price, find, ...","[booked, hotel, hotwire, lowest, price, find, ..."
3,pos,stayed here with husband and sons on the way t...,"[stayed, here, with, husband, and, sons, on, t...","[stayed, husband, sons, way, alaska, cruise, l...","[stayed, husband, son, way, alaska, cruise, lo..."
4,neg,my girlfriends and i stayed here to celebrate ...,"[my, girlfriends, and, i, stayed, here, to, ce...","[girlfriends, stayed, celebrate, th, birthdays...","[girlfriend, stayed, celebrate, th, birthday, ..."


In [335]:
# dropping necessary columns
df.drop(columns = ['Review','with_tokens','without_stopwords'],inplace =True)

In [336]:
def converting_stg(text):
    string = ' '.join(text)
    return string

In [337]:
df['converted_str'] = df['lemmatized_list'].apply(lambda x: converting_stg(x))

In [338]:
df.head()

Unnamed: 0,label,lemmatized_list,converted_str
0,neg,"[room, kind, clean, strong, smell, dog, genera...",room kind clean strong smell dog generally ave...
1,neg,"[stayed, crown, plaza, april, , april, , s...",stayed crown plaza april april staff fri...
2,neg,"[booked, hotel, hotwire, lowest, price, find, ...",booked hotel hotwire lowest price find got des...
3,pos,"[stayed, husband, son, way, alaska, cruise, lo...",stayed husband son way alaska cruise loved hot...
4,neg,"[girlfriend, stayed, celebrate, th, birthday, ...",girlfriend stayed celebrate th birthday planne...


In [339]:
df.drop(columns= 'lemmatized_list',inplace=True)

In [340]:
# renaming
df.rename(columns= {'converted_str': 'Review'},inplace =True)

In [341]:
# Now that we have performed all the steps in text cleaning, we can go for model building
df['label'].value_counts()

pos    26521
neg    12411
Name: label, dtype: int64

# Spliting the data into train & test sets:

In [342]:
from sklearn.model_selection import train_test_split

X = df['Review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Building a pipeline to vectorize the date, then train and fit a model

In [343]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)  

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

# Running predictions and analyzing the results

In [344]:
# Form a prediction set
predictions = text_clf.predict(X_test)

In [345]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[3091  996]
 [ 686 8075]]


In [346]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.82      0.76      0.79      4087
         pos       0.89      0.92      0.91      8761

    accuracy                           0.87     12848
   macro avg       0.85      0.84      0.85     12848
weighted avg       0.87      0.87      0.87     12848



In [347]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.8690846824408468
