# Processing the reviews files.

In [1]:
# imports
import csv
import glob
import os

from dataclasses import asdict, dataclass, field


## Data model creating 

In [2]:
@dataclass(frozen=True)
class Review:
    reviewer_name: str = field()
    review_time: str = field()
    review: str = field(compare=False, hash=False)
    rating: int = field()
    reply: bool = field()
    reply_text: str = field(compare=False, hash=False)
    review_link: str = field()
    

## Getting list of all files

In [3]:
# get all text files, from a file path.

def get_files(filepath: str) -> list:
    """
    :param filepath: str 
    :return: list[str]
    """
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.txt'))
        for f in files :
            all_files.append(os.path.abspath(f))
    
    return all_files

## Read the file

In [4]:
def read_file(file_path: str)-> str:
    """

    :param file_path:
    :return:
    """
    with open(file_path, 'r') as f:
        data = f.read()
    return data

## Process the data files.

In [5]:
def process_data(text: str) -> list:
    """

    :param text:
    :return:
    """
    data = text.split(',[[[')
    data = [d for d in data]
    data = data[1]
    data = data.encode('utf-8').decode('unicode-escape')
    data = data.split('=="],[[')
    data = [d.replace(']', '').replace('[', '').replace(',null', '').split(',') for d in data]
    return data


## After processing multiple files, I found that there are multiple ways to find the information, which I needed.
    
      i) The reviewer name is always at 1st index.
      ii) The review time is at 4th index.
      iii) The review content starts at 5th index, and it can continue until the next index is not starting with space
        or /n tag.
      iv) The review rating is always at the next index of review content ending index.
      v) The reply from owner is always at the 2nd next index of '"Als unangemessen melden"', So if we have that index
        starting with '"vor' which denotes it's the reply time information, then we mark the reply True else False
      vi) The actual reply from the owner is always at the next index of the reply time stamp, and it follows the same
        approach of using multiple index, so we can continue until the next index is not starting with space or /n tag.
      vii) The review link is always starts with 'https://www.google.com/maps/reviews/data='.
      
 ### Note: This defined order is only feasible with data processing, which I have done.

In [6]:
# get process review 

def process_review(content: list) -> Review:
    """

    :param content:
    :return:
    """
    
    review_text = ''
    reviewer_name = content[1]
    review_time_information = content[4]
    index = 5
    while True:
        review_text += content[index]
        index += 1
        if not (content[index].startswith(' ') or content[index].startswith('\n')):
            break
    rating = content[index]
    index = content.index('"Als unangemessen melden"')
    index += 2
    reply =  True if content[index].startswith('"vor') else False
    reply_content = ''
    if reply:
        index += 1
        while True:
            reply_content += content[index]
            index += 1
            if not (content[index].startswith(' ') or content[index].startswith('\n')):
                break
    review_link = [a for a in content if 'https://www.google.com/maps/reviews/data=' in a]
    review_link = review_link[0] if review_link else None
    review = Review(
        reviewer_name, review_time_information, review_text, rating, reply,
        reply_content, review_link
    )
    return review

In [7]:
# files directory
files_directory = '../files/input/'


In [8]:
# get all text files name
all_files = get_files(files_directory)

In [9]:
# checking number of files
print(len(all_files))

# checking top 5 files
print(all_files[:5])

33
['/mnt/smb-d/testing/google-reviews/files/input/reviews_file_0.txt', '/mnt/smb-d/testing/google-reviews/files/input/reviews_file_1.txt', '/mnt/smb-d/testing/google-reviews/files/input/reviews_file_10.txt', '/mnt/smb-d/testing/google-reviews/files/input/reviews_file_11.txt', '/mnt/smb-d/testing/google-reviews/files/input/reviews_file_12.txt']


In [10]:
# process 1 file

data = read_file(all_files[0])

#process the raw data.
data = process_data(data)

In [11]:
# process the reviews.
reviews_list = [process_review(reviewer) for reviewer in data]

In [12]:
reviews_list

[Review(reviewer_name='"Michelle Isenberg"', review_time='"vor 5 Monaten"', review='"Wir waren zu zweit fÃ¼r eine Nacht im Hotel und waren rundum zufrieden!\nDas Zimmer ist wirklich extrem groÃ\x9f sehr schÃ¶n eingerichtet und sauber.\nIn vielen Bewertungen haben wir negatives Ã¼ber das FrÃ¼hstÃ¼ck gelesen - das konnten wir Ã¼berhaupt nicht nachvollziehen. Das FrÃ¼hstÃ¼ck war unser absolutes Highlight. Klein aber fein - QualitÃ¤t Ã¼ber QuantitÃ¤t.\nEinzig das Personal wirkte etwas reserviert es waren aber auch viele ganz neue Mitarbeiter vor Ort die sicher erstmal ankommen mÃ¼ssen.\nAlles in Allem ein wirklich toller Aufenthalt!"', rating='5', reply=True, reply_text='"Ohhhh Michelle du machst uns glÃ¼cklich mit deinem Feedback.\nUnd du hast das richtig gesehen wir stellen unser Team - insbesondere an der Rezeption- grade neu zusammen. Alle hoch motiviert und offensichtlich konzentriert ;) danke fÃ¼r dein VerstÃ¤ndnis und wir freuen uns dich bald wieder begrÃ¼Ã\x9fen zu dÃ¼rfen.\n\nHerz

In [13]:
asdict(reviews_list[0]).keys()

dict_keys(['reviewer_name', 'review_time', 'review', 'rating', 'reply', 'reply_text', 'review_link'])

In [14]:
asdict(reviews_list[0])

{'reviewer_name': '"Michelle Isenberg"',
 'review_time': '"vor 5 Monaten"',
 'review': '"Wir waren zu zweit fÃ¼r eine Nacht im Hotel und waren rundum zufrieden!\nDas Zimmer ist wirklich extrem groÃ\x9f sehr schÃ¶n eingerichtet und sauber.\nIn vielen Bewertungen haben wir negatives Ã¼ber das FrÃ¼hstÃ¼ck gelesen - das konnten wir Ã¼berhaupt nicht nachvollziehen. Das FrÃ¼hstÃ¼ck war unser absolutes Highlight. Klein aber fein - QualitÃ¤t Ã¼ber QuantitÃ¤t.\nEinzig das Personal wirkte etwas reserviert es waren aber auch viele ganz neue Mitarbeiter vor Ort die sicher erstmal ankommen mÃ¼ssen.\nAlles in Allem ein wirklich toller Aufenthalt!"',
 'rating': '5',
 'reply': True,
 'reply_text': '"Ohhhh Michelle du machst uns glÃ¼cklich mit deinem Feedback.\nUnd du hast das richtig gesehen wir stellen unser Team - insbesondere an der Rezeption- grade neu zusammen. Alle hoch motiviert und offensichtlich konzentriert ;) danke fÃ¼r dein VerstÃ¤ndnis und wir freuen uns dich bald wieder begrÃ¼Ã\x9fen zu 

# Batch processing

## Processing all the reviews files, and creating a single csv

In [15]:
all_reviews = []
for file in all_files:
    text = read_file(file)
    data = process_data(text)
    reviews_list = [process_review(reviewer) for reviewer in data]
    all_reviews.append(reviews_list)


In [16]:
print(len(all_reviews))
all_reviews = [review for reviews_list in all_reviews for review in reviews_list]
print(len(all_reviews))

33
92


In [17]:
with open('../files/output/clean_reviews.csv', 'a', newline='') as f:
    fieldnames = ['reviewer_name', 'review_time', 'review', 'rating', 'reply', 'reply_text', 'review_link']
    csv_writer = csv.DictWriter(f, fieldnames=fieldnames)
    csv_writer.writeheader()
    for review in all_reviews:
        csv_writer.writerow(asdict(review))