In [46]:
##########################################
# --- Importing the standard libraries ---
##########################################

import pandas as pd

In [6]:
################################
# --- Importing the Dataset ---
################################

data = pd.read_csv('../Prepared Data/Dataset.csv')
data.head()

Unnamed: 0,Doc_ID,City,Country,Num_Reviews,CLEANLINESS,ROOM,SERVICE,LOCATION,VALUE,OVERALL_RATING
0,china_beijing_holiday_inn_central_plaza,beijing,China,206,4.786,4.631,4.733,3.553,4.699,4.481
1,china_beijing_hilton_beijing_wangfujing,beijing,China,58,4.81,4.845,4.759,4.828,4.517,4.752
2,china_beijing_hotel_g,beijing,China,104,4.769,4.75,4.577,4.375,4.654,4.625
3,china_beijing_the_regent_beijing,beijing,China,96,4.625,4.812,4.438,4.646,4.531,4.61
4,china_beijing_the_st_regis_beijing,beijing,China,65,4.846,4.646,4.615,4.492,4.185,4.557


---

In [12]:
################################################################
# --- Function to get reviews from city and docID of a hotel ---
################################################################

##-> Function Description
# Accepts a hotel doc_ID and city
# Returns a dataFrame with hotel reviews of that particular doc_ID and city


def getHotel_reviews(docID, city):
    reviews = pd.read_csv('../Raw Data/{0}/{1}'.format(city, docID), delimiter='\t', 
                          names=['Date', 'Review_Title', 'Review'], encoding='ISO-8859-14', 
                          usecols=range(3))
    return reviews

In [44]:
getHotel_reviews('china_beijing_holiday_inn_central_plaza', 'beijing')[100:105]

Unnamed: 0,Date,Review_Title,Review
100,Oct 13 2008,Wonderful Hotel Wonderful Storm,After reading the reviews for this hotel and t...
101,Oct 11 2008,Fantastic Beijing. Wonderful Hotel. Highly Rec...,"We loved everything about this hotel, from the..."
102,Oct 3 2008,Great Hotel Outstanding Service,Everything and more than what the web postings...
103,Sep 24 2008,Yes it's as good as all the other reviewers he...,Stayed for two nights - little to add to the g...
104,Sep 18 2008,Excellent hotel,This is deffinatly the best hotel I have stayd...


---

In [45]:
###############################################################
# --- Function to extract reviews and save it to a DataFrame ---
###############################################################

##-> Function Description
# It creates a list of combined reviews from each docID
# returns a DataFrame of Doc_ID and Review


def extract_reviews():
    
    review_list = []
    
    for city in list(data.City.unique()):
        for docID in list(data[data.City == city].Doc_ID):
            try:
                review = str(list(getHotel_reviews(docID, city)['Review']))
                review_list.append([docID, review])
            except:
                review_list.append([docID, 'No review'])
    
    df = pd.DataFrame(review_list, columns=['Doc_ID', 'Review'])

    return df


reviews_frame = extract_reviews() 

# len(reviews_frame)    # 2821 rows

---

In [47]:
###############################################################
#--- Merging the two data frames to prepare our final frame ---
###############################################################

ratings_frame = data[['Doc_ID', 'CLEANLINESS', 'ROOM', 'SERVICE', 'LOCATION', 'VALUE', 'OVERALL_RATING']]

combined_frame = pd.merge(reviews_frame, ratings_frame, on='Doc_ID', how='inner')

In [48]:
combined_frame.head()

Unnamed: 0,Doc_ID,Review,CLEANLINESS,ROOM,SERVICE,LOCATION,VALUE,OVERALL_RATING
0,china_beijing_holiday_inn_central_plaza,"[""Just about everything about this hotel is fa...",4.786,4.631,4.733,3.553,4.699,4.481
1,china_beijing_hilton_beijing_wangfujing,"['An excellent hotel, with the best room I hav...",4.81,4.845,4.759,4.828,4.517,4.752
2,china_beijing_hotel_g,"['It was chic, everyone was friendly, service ...",4.769,4.75,4.577,4.375,4.654,4.625
3,china_beijing_the_regent_beijing,"[""My parents and I stayed here during their vi...",4.625,4.812,4.438,4.646,4.531,4.61
4,china_beijing_the_st_regis_beijing,['this hotel was fantastic. rooms were lovely....,4.846,4.646,4.615,4.492,4.185,4.557


---

In [49]:
####################################################################################
#--- Removing the Bad Records (where there is no review) and Saving the CSV file ---
####################################################################################


# len(combined_frame[combined_frame.Review == 'No review'])    # 1195 such rows to be dropped

combined_frame.drop(combined_frame[combined_frame.Review == 'No review'].index, inplace=True)

combined_frame.to_csv('../Prepared Data/Hotel_Reviews.csv', index=False)

# len(combined_frame)    1644 rows saved

---