In [1]:
import os
import re
import glob

import pandas as pd
from tqdm import tqdm
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import pacf

In [2]:
def concat_csv(data_path:str='./data/'):
    '''Read all csv files in data_path and add them one after another vertically.

    Args:
        data_path (str, optional): The path to look for csv_files. Defaults to './data/'.

    Returns:
        pd.DataFrame: Concatenated pandas dataframe
    '''
    print(f'Reading data from:  {data_path}')
    all_files = glob.glob(os.path.join(data_path , "*.csv"))
    dataframes = []
    row_count = 0
    for filename in tqdm(all_files):
        if filename.endswith('.csv'):
            df = pd.read_csv(filename, index_col=None, header=0)
            row_count += (len(df))
            dataframes.append(df)
    print(f'Total row count: {row_count}')
    frame = pd.concat(dataframes, axis=0)
    return frame

In [3]:
df = concat_csv()
# save the merged data to disk
df.to_csv('merged_reviews.csv')
# Peak the data
print(len(df))
df.head()

Reading data from:  ./data/


100%|██████████| 232/232 [00:06<00:00, 33.50it/s]


Total row count: 199353
199353


Unnamed: 0.1,Unnamed: 0,date,review_text,rating
0,0,May 5,Five stars hotel from beginig to end. Permisse...,5
1,1,Feb 2023,Paradise does indeed exist!\nI absolutely LOVE...,4
2,2,Nov 2022,Had an absolutely fantastic time at this prope...,5
3,3,Oct 2022,Hotel is in a beautiful setting and the servic...,4
4,4,Oct 2022,So worthy of their Conde Nast award - and beyo...,5


In [4]:
# Drop the index column
df = df.drop('Unnamed: 0', axis=1)
# Drop the null rows
df = df.dropna()
# See how many rows left
print(len(df))

199242


In [5]:
# Fix the faulty dates of the last month(May)
proper_dates = []
pattern = r'\d{4}$'
# Iterate through date column
for date in df['date'].values:
    # Search for values that ends with 4 numbers which is a year
    if re.search(pattern, date):
        proper_dates.append(date)
    # 
    else:
        proper_dates.append(f'{date[:3]} 2023')

In [9]:
df['date'] = np.array(proper_dates)

In [10]:
df.head()

Unnamed: 0,date,review_text,rating
0,May 2023,Five stars hotel from beginig to end. Permisse...,5
1,Feb 2023,Paradise does indeed exist!\nI absolutely LOVE...,4
2,Nov 2022,Had an absolutely fantastic time at this prope...,5
3,Oct 2022,Hotel is in a beautiful setting and the servic...,4
4,Oct 2022,So worthy of their Conde Nast award - and beyo...,5


In [11]:
df['date'] = pd.to_datetime(df['date'])

In [16]:
df['date'] = df['date'].apply(lambda x: x.strftime('%Y-%m'))
df

Unnamed: 0,date,review_text,rating
0,2023-05,Five stars hotel from beginig to end. Permisse...,5
1,2023-02,Paradise does indeed exist!\nI absolutely LOVE...,4
2,2022-11,Had an absolutely fantastic time at this prope...,5
3,2022-10,Hotel is in a beautiful setting and the servic...,4
4,2022-10,So worthy of their Conde Nast award - and beyo...,5
...,...,...,...
975,2017-07,Great hotel! Design and furnishings are beauti...,5
976,2017-07,This is a beautifull hotel with great service!...,5
977,2017-07,This is not a 5 Star hotel! The junior suite w...,2
978,2017-07,I booked this holiday based on the experience ...,4


b1 * x1 + b2 * x2 + .....
y(t+1) = a + b1 * x(t-1) + b2 * x(t-2)

In [18]:
GB = df.groupby([(df['date'].dt.year), (df['date'].dt.month),]).mean()
GB

AttributeError: Can only use .dt accessor with datetimelike values