In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, HTML, display_html
import seaborn as sns
import datetime

# Raise pandas' display limits to 100 columns / 100 rows for cell output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df = pd.read_csv('/kaggle/input/insideairbnb/reviews.csv')

In [None]:
df.head(5)

In [None]:
df.shape

In [None]:
df.iloc[:5,:5]

In [None]:
#two row and all column show
df.iloc[:2,:5]

In [None]:
df.iloc[:5,:5].dtypes

In [None]:
# see if any columns have nulls.
df.iloc[:5,:5].isnull().any()

In [None]:
df.describe(percentiles=[0.25, 0.5, 0.75, 0.85, 0.95, 0.99])

In [None]:
df.head(2)

In [None]:
df = df.rename(columns={'date':'ds', 'listing_id':'ts'})

df_example = df.groupby(by='ds').agg({'ts':'count'})

In [None]:
df_example.head(2)

In [None]:
df_example.index= pd.to_datetime(df_example.index)

In [None]:
df_example.head(2)

In [None]:
# set frequency of time series
df_example = df_example.asfreq(freq='1D')

In [None]:
df_example.head(5)

In [None]:
# sor the values
df_example = df_example.sort_index(ascending=True)

In [None]:
df_example.head(3)

In [None]:
# Fill values with 0 
df_example = df_example.fillna(value=0)

In [None]:
df_example.head(5)

In [None]:
# show the end of the data
df_example.tail()

In [None]:
# Line plot of the daily series on a single axes.
fig, ax = plt.subplots()
ax.plot(df_example['ts'])
ax.set(title='Time-series graph')
ax.tick_params(axis='x', rotation=45)

plt.show()
plt.close()

In [None]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(df, ts, window=12):
    """
    Plot rolling statistics for a series and print its Augmented Dickey-Fuller test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the series.
    ts : str
        Name of the column in ``df`` to examine.
    window : int, default 12
        Number of observations in each rolling mean / rolling std window.

    Returns
    -------
    None
        The figure is shown and the test summary is printed.
    """
    series = df[ts]
    rolmean = series.rolling(window=window, center=False).mean()
    rolstd = series.rolling(window=window, center=False).std()

    # Plot rolling statistics:
    plt.plot(series, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='rolling std')

    plt.legend(loc='best')
    plt.title('Rolling mean & standard deviation for %s' % (ts))

    plt.xticks(rotation=45)
    plt.show(block=False)
    plt.close()

    print("Results of Dickey-Fuller Test:")
    dftest = adfuller(series, autolag='aic')

    # The first four entries of the result are labelled below; the fifth is
    # the dict of critical values, keyed by significance level.
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '# Lags Used', 'Number of Observations Used'],
    )

    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

In [None]:
# Stationarity check on the raw daily counts.
test_stationarity(df_example, 'ts')

In [None]:
def plot_transformed_data(df, ts, ts_transform):
    """
    Draw a series and one of its transformations on the same axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    ts : str
        Column name of the reference series (drawn in the default colour).
    ts_transform : str
        Column name of the transformed series (drawn in red).
    """
    fig, axes = plt.subplots(nrows=1, ncols=1)

    # Reference series first, transformation second.
    axes.plot(df[ts])
    axes.plot(df[ts_transform], color='red')

    axes.set_title('%s and %s time-series graph'%(ts, ts_transform))
    axes.legend([ts, ts_transform])
    axes.tick_params(axis='x', rotation=45)

    plt.show()
    plt.close()
    

In [None]:
# Transformation - log ts
# Missing days were filled with 0 earlier, and log(0) is -inf. Mask
# non-positive counts to NaN so the dropna() below discards those rows
# instead of carrying infinities into the derived columns.
df_example['ts_log'] = np.log(df_example['ts'].where(df_example['ts'] > 0))

# Transformation - 7-day moving average of log ts
df_example['ts_log_moving_avg'] = df_example['ts_log'].rolling(window=7, center=False).mean()

# Transformation - 7-day moving average of ts
df_example['ts_moving_avg'] = df_example['ts'].rolling(window=7, center=False).mean()

# Transformation - first difference of log ts
df_example['ts_log_diff'] = df_example['ts_log'].diff()

# Transformation - difference between ts and its moving average
df_example['ts_moving_avg_diff'] = df_example['ts'] - df_example['ts_moving_avg']

# Transformation - difference between log ts and its moving average
df_example['ts_log_moving_avg_diff'] = df_example['ts_log'] - df_example['ts_log_moving_avg']

# Keep only complete rows. The explicit copy makes the frame independent of
# df_example, so adding columns below cannot raise SettingWithCopyWarning.
df_example_transform = df_example.dropna().copy()

# Transformation - exponentially weighted moving average (EWMA) of log ts
df_example_transform['ts_log_ewma'] = df_example_transform['ts_log'].ewm(
    halflife=7, ignore_na=False, min_periods=0, adjust=True
).mean()

# Transformation - difference between log ts and its EWMA
df_example_transform['ts_log_ewma_diff'] = (
    df_example_transform['ts_log'] - df_example_transform['ts_log_ewma']
)

# Display data. A bare expression in the middle of a cell shows nothing,
# so render it explicitly.
display(df_example_transform.head())

# Plot each (frame, series, transformed series) combination, in the original order.
transform_plots = [
    (df_example, 'ts', 'ts_log'),
    (df_example, 'ts_log', 'ts_log_moving_avg'),
    (df_example_transform, 'ts', 'ts_moving_avg'),
    (df_example_transform, 'ts_log', 'ts_log_diff'),
    (df_example_transform, 'ts', 'ts_moving_avg_diff'),
    (df_example_transform, 'ts_log', 'ts_log_moving_avg_diff'),
    (df_example_transform, 'ts_log', 'ts_log_ewma'),
    (df_example_transform, 'ts_log', 'ts_log_ewma_diff'),
]
for frame, base_column, transformed_column in transform_plots:
    plot_transformed_data(df=frame, ts=base_column, ts_transform=transformed_column)

# Perform stationarity tests on two of the transformed series.
test_stationarity(df=df_example_transform, ts='ts_log')
test_stationarity(df=df_example_transform, ts='ts_moving_avg')

In [None]:
def plot_decomposition(df, ts, trend, seasonal, residual):
    """
    Show a series and its three decomposition components in a 2x2 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding all four columns.
    ts, trend, seasonal, residual : str
        Column names of the original series and of its trend, seasonal and
        residual components, drawn left-to-right, top-to-bottom in that order.
    """
    fig, grid = plt.subplots(2, 2, figsize=(15, 15), sharex=True)

    panels = (
        (ts, 'Original'),
        (trend, 'Trend'),
        (seasonal, 'Seasonality'),
        (residual, 'Residuals'),
    )
    # ravel() walks the grid row by row, matching the panel order above.
    for axis, (column, caption) in zip(grid.ravel(), panels):
        axis.plot(df[column], label=caption)
        axis.legend(loc='best')
        axis.tick_params(axis='x', rotation=45)
    plt.tight_layout()

    # Title is placed above the grid (y > 1) after the layout is tightened.
    plt.suptitle('Trend seasonal, and Residual Decomposition of %s' %(ts), x=0.5, y=1.05, fontsize=18)
    plt.show()
    plt.close()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose log ts assuming a yearly cycle of 365 daily observations.
# Current statsmodels releases take the cycle length as `period`; the older
# `freq` keyword is no longer accepted.
decomposition = seasonal_decompose(df_example_transform['ts_log'], period=365)

# assign() builds a new frame, so the components are never written into a
# dropna() result in place (which can raise SettingWithCopyWarning).
df_example_transform = df_example_transform.assign(
    trend=decomposition.trend,
    seasonal=decomposition.seasonal,
    residual=decomposition.resid,
)

plot_decomposition(df=df_example_transform, ts='ts_log', trend='trend', seasonal='seasonal', residual='residual')

# Drop incomplete rows before testing, because the test cannot handle NaNs.
test_stationarity(df=df_example_transform.dropna(), ts='residual')

