In [1]:
##########################################
# --- Importing the standard libraries ---
##########################################

import numpy as np
import pandas as pd
import re
import nltk
from textblob import TextBlob

In [2]:
######################################
# --- Load the prepared review set ---
######################################
# One row per hotel. The preview shows an 'Unnamed: 0' column, which looks like
# a row index that was written into the CSV -- TODO confirm; it is kept as-is.

data = pd.read_csv(filepath_or_buffer='../Prepared Data/Hotel_Reviews.csv')
data.head(n=5)

Unnamed: 0,Doc_ID,Review,CLEANLINESS,ROOM,SERVICE,LOCATION,VALUE,OVERALL_RATING
0,china_beijing_holiday_inn_central_plaza,"[""Just about everything about this hotel is fa...",4.786,4.631,4.733,3.553,4.699,4.481
1,china_beijing_hilton_beijing_wangfujing,"['An excellent hotel, with the best room I hav...",4.81,4.845,4.759,4.828,4.517,4.752
2,china_beijing_hotel_g,"['It was chic, everyone was friendly, service ...",4.769,4.75,4.577,4.375,4.654,4.625
3,china_beijing_the_regent_beijing,"[""My parents and I stayed here during their vi...",4.625,4.812,4.438,4.646,4.531,4.61
4,china_beijing_the_st_regis_beijing,['this hotel was fantastic. rooms were lovely....,4.846,4.646,4.615,4.492,4.185,4.557


---

## Extracting the Features (Using Stopwords Removal and Lemmatization)

In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [13]:
#########################################
# --- Function to Extract the Reviews ---
#########################################

##> Description
# Loops through all the reviews
# Applies the following steps:
#        1. Replaces every non-letter character (symbols, dots, commas, digits, etc.) with a space
#        2. Converts the text to lower case
#        3. Splits the review into a list of words
#        4. Removes the stopwords (pronouns, determiners, etc.) and lemmatizes the remaining words
#        5. Appends each cleaned review to a corpus list



def extract_features(data, stop_words=None, lemmatizer=None):
    """Clean every review in ``data['Review']`` and return the cleaned texts.

    Each review is processed as follows:
        1. every character that is not an ASCII letter is replaced by a space,
        2. the text is lower-cased and split on whitespace,
        3. words found in ``stop_words`` are dropped,
        4. each remaining word is passed through ``lemmatizer.lemmatize``,
        5. the surviving words are re-joined with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'Review' column holding strings.
    stop_words : iterable of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word
        list (needs the NLTK 'stopwords' corpus to be installed).
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a ``WordNetLemmatizer`` (needs the NLTK 'wordnet' corpus).

    Returns
    -------
    list of str
        One cleaned string per row of ``data``, in row order.
    """
    # Built once up front: both objects are loop-invariant, and rebuilding the
    # stop-word set for every single word is needlessly slow.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    corpus = []
    # Iterate over the column values instead of data['Review'][i]: the latter
    # is a label lookup and fails (or picks the wrong row) whenever the index
    # is not exactly 0..n-1, e.g. after filtering or sampling the frame.
    for text in data['Review']:
        words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        kept = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        corpus.append(' '.join(kept))
    return corpus


# Build the stop-word-filtered, lemmatized corpus: one cleaned string per row of `data`.
# NOTE: the later extraction sections assign to `corpus` as well, so run the
# "Save our Corpus" cell of this section before moving on to the next one.
corpus = extract_features(data)

In [None]:
#########################
# --- Save our Corpus ---
#########################

# Write one cleaned review per line.
# Mode 'w' rather than 'a': the full corpus is rebuilt on every run, so
# appending would duplicate every line each time this cell is re-executed.
# The encoding is pinned so the file does not depend on the platform default;
# read it back with the same encoding.
with open('../Corpus/corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in corpus)

---

## Extracting the Features (Using Parts of Speech (POS) Tagging)

In [None]:
#######################################
# --- Parts of Speech (POS) Tagging ---
#######################################
##> Extracting only nouns and adjectives, which are the most descriptive words
# Returns a list of (word, tag) tuples whose tag is exactly 'NN' or 'JJ'

def pos_extract(blob):
    """Return the entries of ``blob.tags`` whose tag is exactly 'NN' or 'JJ'.

    ``blob`` only needs a ``tags`` attribute yielding (word, tag) pairs.
    Matching pairs are returned unchanged and in their original order; the
    comparison is an exact match, so other tags (e.g. 'NNS') are dropped.
    """
    wanted = ('NN', 'JJ')
    return [pair for pair in blob.tags if pair[1] in wanted]

In [None]:
#########################################
# --- Function to Extract the Reviews ---
#########################################

##> Description
# Loops through all the reviews
# Applies the following steps:
#        1. Extracts the POS tags for each review
#        2. Stores the nouns and adjectives in a review list
#        3. Appends each extracted review to a corpus list



def extract_features_2(data):
    """Reduce every review in ``data['Review']`` to its 'NN'/'JJ'-tagged words.

    Each review is wrapped in a ``TextBlob``, filtered through ``pos_extract``
    and the surviving words are joined with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'Review' column holding strings.

    Returns
    -------
    list of str
        One string per row of ``data``, in row order.
    """
    corpus = []
    # Iterate over the column values instead of data['Review'][i]: the latter
    # is a label lookup and fails (or picks the wrong row) whenever the index
    # is not exactly 0..n-1, e.g. after filtering or sampling the frame.
    for text in data['Review']:
        tagged = pos_extract(TextBlob(text))
        # Keep only the word of each (word, tag) pair.
        corpus.append(' '.join(pair[0] for pair in tagged))
    return corpus


# Rebuild `corpus` from the 'NN'/'JJ'-tagged words of each review.
# NOTE: this overwrites the `corpus` produced by the previous section.
corpus = extract_features_2(data)

In [None]:
#########################
# --- Save our Corpus ---
#########################

# Write one extracted review per line.
# Mode 'w' rather than 'a': the full corpus is rebuilt on every run, so
# appending would duplicate every line each time this cell is re-executed.
# The encoding is pinned because the words come straight from the raw reviews
# and may contain non-ASCII characters; read the file back with the same encoding.
with open('../Corpus/corpus2.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in corpus)

---

## Extracting the Features (Using Noun Phrases Extraction)

In [None]:
#################################
# --- Noun Phrases Extraction ---
#################################
##> Noun Phrase consists of (Det)(Adj)NN(PP)


def extract_features_3(data):
    """Reduce every review in ``data['Review']`` to its noun phrases.

    Each review is wrapped in a ``TextBlob`` and the entries of its
    ``noun_phrases`` are joined with single spaces. Phrase boundaries are
    therefore not marked in the output string.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'Review' column holding strings.

    Returns
    -------
    list of str
        One string per row of ``data``, in row order.
    """
    corpus = []
    # Iterate over the column values instead of data['Review'][i]: the latter
    # is a label lookup and fails (or picks the wrong row) whenever the index
    # is not exactly 0..n-1, e.g. after filtering or sampling the frame.
    for text in data['Review']:
        corpus.append(' '.join(TextBlob(text).noun_phrases))
    return corpus


# Rebuild `corpus` from the noun phrases of each review.
# NOTE: this overwrites the `corpus` produced by the previous section.
corpus = extract_features_3(data)

In [None]:
#########################
# --- Save our Corpus ---
#########################

# Write one extracted review per line.
# Mode 'w' rather than 'a': the full corpus is rebuilt on every run, so
# appending would duplicate every line each time this cell is re-executed.
# The encoding is pinned because the phrases come straight from the raw reviews
# and may contain non-ASCII characters; read the file back with the same encoding.
with open('../Corpus/corpus3.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in corpus)

---

## Extracting the Features (Using Bi-Grams and N-Grams)

In [None]:
##############################
# --- Bi-Grams and N-Grams ---
##############################

# bi_grams = blob.ngrams(n=2)
# n_grams = blob.ngrams(n=3)

def extract_features_4(data, n):
    """Rewrite every review in ``data['Review']`` as its sequence of n-grams.

    Each review is wrapped in a ``TextBlob``; the words of every n-gram from
    ``blob.ngrams(n=n)`` are joined with a space, and the n-grams themselves
    are then joined with a space as well. N-gram boundaries are therefore not
    marked in the output string.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'Review' column holding strings.
    n : int
        Size of the n-grams (2 for bi-grams, 3 for tri-grams, ...).

    Returns
    -------
    list of str
        One string per row of ``data``, in row order.
    """
    corpus = []
    # Iterate over the column values instead of data['Review'][i]: the latter
    # is a label lookup and fails (or picks the wrong row) whenever the index
    # is not exactly 0..n-1, e.g. after filtering or sampling the frame.
    for text in data['Review']:
        grams = TextBlob(text).ngrams(n=n)
        corpus.append(' '.join(' '.join(gram) for gram in grams))
    return corpus



# Rebuild `corpus` from the n-grams of each review; swap the two lines below to
# switch between bi-grams and tri-grams.
# NOTE: this overwrites the `corpus` produced by the previous section.
corpus = extract_features_4(data, n=2)                # For Bi-Grams
# corpus = extract_features_4(data, n=3)                # For Tri-Grams

In [None]:
#########################
# --- Save our Corpus ---
#########################

# Write one extracted review per line.
# Mode 'w' rather than 'a': the full corpus is rebuilt on every run, so
# appending would duplicate every line each time this cell is re-executed
# (and would mix bi-gram and tri-gram runs in the same file).
# The encoding is pinned because the n-grams come straight from the raw reviews
# and may contain non-ASCII characters; read the file back with the same encoding.
with open('../Corpus/corpus4.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in corpus)

---