In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import math
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
        
train = pd.read_csv('/kaggle/input/web-traffic-time-series-forecasting/train_1.csv.zip')
train.shape

In [None]:
train.info()

In [None]:
def get_language(page):
    """Return the two-letter language code found in a page name.

    The code is taken from the first occurrence of ``xx.wikipedia.org``
    (two lowercase letters followed by the literal host name).

    Parameters
    ----------
    page : str
        Page identifier to search.

    Returns
    -------
    str
        The two-letter code, or ``'na'`` when the pattern is absent.
    """
    # Dots are escaped so that only the literal host name matches; an
    # unescaped '.' would accept any character in those positions.
    res = re.search(r'[a-z][a-z]\.wikipedia\.org', page)
    if res:
        return res.group(0)[:2]
    return 'na'

train['lang'] = train.Page.map(get_language)

from collections import Counter
print(Counter(train.lang))

In [None]:
# Language codes to split on, in plotting order. 'na' is the value
# get_language() returns when no language prefix is found.
LANG_CODES = ['en', 'ja', 'de', 'na', 'fr', 'zh', 'ru', 'es']

# One sub-frame per language; the last column (the helper 'lang' column added
# to `train` above) is dropped so only 'Page' plus the daily counts remain.
lang_sets = {
    code: train[train.lang == code].iloc[:, 0:-1]
    for code in LANG_CODES
}

# Per-day total views divided by the number of pages in that language.
# NaN counts are skipped by sum() but the divisor is the full row count.
sums = {
    code: frame.iloc[:, 1:].sum(axis=0) / frame.shape[0]
    for code, frame in lang_sets.items()
}

In [None]:
import matplotlib.pyplot as plt
# Integer x-axis: one tick per daily observation.
days = list(range(len(sums['en'])))

# Display names for each language key used in legends and titles.
labels = {
    'en': 'English',
    'ja': 'Japanese',
    'de': 'German',
    'na': 'Media',
    'fr': 'French',
    'zh': 'Chinese',
    'ru': 'Russian',
    'es': 'Spanish',
}

# Overlay the per-page daily averages of every language on one figure.
fig = plt.figure(1, figsize=[10, 10])
for key, daily_average in sums.items():
    plt.plot(days, daily_average, label=labels[key])

plt.title('Pages in Different Languages')
plt.xlabel('Day')
plt.ylabel('Views per page')
plt.legend()
plt.show()

## Periodic Structure and FFTs

In [None]:
from scipy.fftpack import fft

def plot_with_fft(key):
    """Plot one language's daily average views and its FFT magnitude.

    Figure 1 shows ``sums[key]`` against ``days``. Figure 2 shows the
    magnitude of the one-sided FFT (DC bin omitted) against frequency in
    cycles per day, with red guides at 1/7, 2/7 and 3/7.

    Parameters
    ----------
    key : str
        Language key present in the notebook-level ``sums`` and ``labels``.
    """
    series = sums[key].values
    n_obs = len(series)

    fig = plt.figure(1, figsize=[15, 5])
    plt.ylabel('Views per page')
    plt.xlabel('Day')
    plt.title(labels[key])
    plt.plot(days, sums[key], label=labels[key])

    fig = plt.figure(2, figsize=[15, 5])
    fft_mag = np.abs(fft(series))

    # Keep the non-negative frequencies only. Bin k of an N-point transform
    # of daily samples sits at k / N cycles per day (not k / (N - 1)), which
    # is what makes the weekly peak line up with the 1/7 guide.
    npts = n_obs // 2 + 1
    fft_xvals = np.arange(npts) / n_obs
    fft_mag = fft_mag[:npts]

    plt.ylabel('FFT Magnitude')
    plt.xlabel('Frequency (cycles per day)')
    plt.title('Fourier transform')
    # Skip bin 0 (the mean), which would dwarf every other component.
    plt.plot(fft_xvals[1:], fft_mag[1:], label=labels[key])

    # Guides at the weekly frequency and its first two harmonics.
    for harmonic in (1, 2, 3):
        plt.axvline(x=harmonic / 7, color='red', alpha=0.3)

    plt.show()
    
for key in sums:
    plot_with_fft(key)

## Individual Entries

In [None]:
def plot_entry(key, idx):
    """Plot the daily views of a single page from one language subset.

    Parameters
    ----------
    key : str
        Language key into the notebook-level ``lang_sets`` dict.
    idx : int
        Positional row number within ``lang_sets[key]``.
    """
    row = lang_sets[key].iloc[idx]
    fig = plt.figure(1, figsize=(10, 5))
    # Column 0 is the page name; the remaining columns are the daily counts.
    plt.plot(days, row.iloc[1:])
    plt.xlabel('day')
    plt.ylabel('views')
    # Take the title from the same row that is plotted instead of passing an
    # index label to `train.iloc`, which only works for a default RangeIndex.
    plt.title(row.iloc[0])

    plt.show()

In [None]:
idx = [1,5,10,50,100,250,500,750,1000,1500,2000,3000,4000,5000]

for i in idx:
    plot_entry('en',i)

In [None]:
idx = [1,5,10,50,100,250,500,750,1000,1500,2000,3000,4000,5000]

for i in idx:
    plot_entry('es',i)

## Aggregated Data Compared to Popular pages

In [None]:
# NOTE(review): `npages` is not used below (the preview prints 10 rows);
# kept so the notebook-level name still exists.
npages = 5

# For every language, rank pages by total views and remember the index label
# of the most-viewed page.
top_pages = {}
for key in lang_sets:
    print(key)
    sum_set = pd.DataFrame(lang_sets[key]['Page'])
    # Sum only the daily-count columns: the frame also holds the string
    # 'Page' column, which recent pandas versions refuse to add to floats.
    sum_set['total'] = lang_sets[key].drop(columns='Page').sum(axis=1)
    sum_set = sum_set.sort_values('total', ascending=False)
    print(sum_set.head(10))
    top_pages[key] = sum_set.index[0]
    print('\n\n')
    
    

In [None]:
# Daily-count columns: everything between 'Page' (first) and the last column.
cols = train.columns[1:-1]

# One figure per language showing its most-viewed page.
for key, row_label in top_pages.items():
    page_row = train.loc[row_label]
    data = page_row[cols]

    fig = plt.figure(1, figsize=(10, 5))
    plt.plot(days, data)
    plt.title(page_row['Page'])
    plt.xlabel('Days')
    plt.ylabel('Views')
    plt.show()

## More Analysis Tools

In [None]:
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.stattools import acf

# Daily-count columns: everything between 'Page' (first) and the last column.
cols = train.columns[1:-1]

# For each language's top page, plot the autocorrelation and partial
# autocorrelation of the first-differenced daily views side by side.
for key in top_pages:
    page_row = train.loc[top_pages[key]]

    fig = plt.figure(1, figsize=[10, 5])
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)

    data = np.array(page_row[cols])
    # First difference: each day minus the previous day.
    data_diff = [today - yesterday for yesterday, today in zip(data[:-1], data[1:])]
    autocorr = acf(data_diff)
    pac = pacf(data_diff)

    # Lag 0 is dropped from both curves.
    x = list(range(len(pac)))
    ax1.plot(x[1:], autocorr[1:])
    ax1.set_title(page_row['Page'])
    ax1.set_xlabel('Lag')
    ax1.set_ylabel('Autocorrelation')

    ax2.plot(x[1:], pac[1:])
    ax2.set_xlabel('Lag')
    ax2.set_ylabel('Partial Autocorrelation')
    plt.show()

## ARIMA Models

In [None]:
from statsmodels.tsa.arima_model import ARIMA
import warnings

# Daily-count columns: everything between 'Page' (first) and the last column.
cols = train.columns[1:-1]

# Fit an ARIMA model to each language's top page and plot it against the data.
for key in top_pages:
    page_title = train.loc[top_pages[key], 'Page']
    data = np.array(train.loc[top_pages[key], cols], 'f')

    # Try the richer (2,1,4) order first and fall back to (2,1,2).
    result = None
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        for order in ([2, 1, 4], [2, 1, 2]):
            try:
                result = ARIMA(data, order).fit(disp=False)
                break
            except Exception:
                # Fitting can fail to converge; try the next order. Catching
                # Exception (not a bare except) keeps Ctrl-C working.
                continue

    if result is None:
        # Both orders failed: report and skip the plot instead of crashing
        # on `None.predict`.
        print(page_title)
        print('\tARIMA failed')
        continue

    # Predict from day 2 through day 599, i.e. past the end of the data.
    pred = result.predict(2, 599, typ='levels')
    x = list(range(600))

    plt.plot(x[2:len(data)], data[2:], label='Data')
    plt.plot(x[2:], pred, label='ARIMA Model')
    plt.title(page_title)
    plt.xlabel('Days')
    plt.ylabel('Views')
    plt.legend()
    plt.show()
    

## 4 Models

In [None]:
# Split each page name into topic / language / access / agent type.
# The dots of the host name are escaped so they match literally; names that
# do not contain '.wikipedia.org' produce a row of NaN.
page_details = train.Page.str.extract(
    r'(?P<topic>.*)_(?P<lang>.*)\.wikipedia\.org_(?P<access>.*)_(?P<type>.*)'
)

page_details.head(10)

In [None]:
unique_topic = page_details['topic'].unique()
print(unique_topic)
print('Number of Unique Topics: ',len(unique_topic))

In [None]:
print(page_details['access'].unique())
print(page_details['type'].unique())

In [None]:
fig,axs = plt.subplots(3,1,figsize=(12,12))

page_details['lang'].value_counts().sort_index().plot.bar(ax=axs[0])
axs[0].set_title('Language - Distribution')

page_details['access'].value_counts().sort_index().plot.bar(ax=axs[1])
axs[1].set_title('access - Distribution')

page_details['type'].value_counts().sort_index().plot.bar(ax=axs[2])
axs[2].set_title('type - Distribution')

## Split into Train and Validation dataset

In [None]:
# Put the four parsed name fields in front of the raw table. The helper
# 'lang' column added to `train` earlier is dropped first: it would duplicate
# page_details['lang'] and leave a non-date column after the daily counts.
train_df = pd.concat(
    [page_details, train.drop(columns='lang', errors='ignore')],
    axis=1,
)

def get_train_validate_set(train_df, test_percent, n_meta_cols=5):
    """Split a wide table column-wise into an early and a late part.

    The first ``n_meta_cols`` columns are copied into both outputs; the
    remaining columns are divided in order.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Table whose leading ``n_meta_cols`` columns are descriptive and whose
        remaining columns are the ordered series values.
    test_percent : float
        Fraction (0..1) of the series columns to place in the second output.
    n_meta_cols : int, default 5
        Number of leading descriptive columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_ds, test_ds)``.

    Raises
    ------
    ValueError
        If ``test_percent`` is outside ``[0, 1]``.
    """
    if not 0 <= test_percent <= 1:
        raise ValueError('test_percent must be between 0 and 1')

    n_cols = train_df.shape[1]
    n_train = math.floor((n_cols - n_meta_cols) * (1 - test_percent))
    # n_train counts series columns, so the split position must be offset by
    # the descriptive columns that precede them.
    split_col = n_meta_cols + n_train

    meta_cols = list(range(n_meta_cols))
    train_ds = train_df.iloc[:, meta_cols + list(range(n_meta_cols, split_col))]
    test_ds = train_df.iloc[:, meta_cols + list(range(split_col, n_cols))]

    return train_ds, test_ds

X_train,y_train = get_train_validate_set(train_df,0.1)
print(X_train.head())
print(y_train.head())

In [None]:
X_train.head()

In [None]:
y_train

### Zoupet Predictive analytics with different approaches

### Importation and Data Cleaning

In [None]:
import warnings
import scipy
from datetime import timedelta

# Forecasting with Decomposable Model
from pylab import rcParams
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

# For the machine-learning approach

from statsmodels.tsa.tsatools import lagmat
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

#Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')

warnings.filterwarnings('ignore')


In [None]:
train = pd.read_csv('/kaggle/input/web-traffic-time-series-forecasting/train_1.csv.zip')
train.head()

In [None]:
# Reshape the last 50 columns of `train` (plus 'Page') from wide to long:
# one row per (Page, date) with the count in 'Visits'.
recent_cols = list(train.columns[-50:]) + ['Page']
train_flattened = train[recent_cols].melt(
    id_vars='Page',
    var_name='date',
    value_name='Visits',
)
train_flattened['date'] = train_flattened['date'].astype('datetime64[ns]')

# dayofweek is 0..6 (Monday=0), so integer division by 5 is 1 for Sat/Sun.
is_weekend = train_flattened['date'].dt.dayofweek // 5 == 1
train_flattened['weekend'] = is_weekend.astype(float)
train_flattened

In [None]:
train_flattened.shape

In [None]:
# Per-page summary statistics of 'Visits', each as a one-column frame
# indexed by Page.
visits_by_page = train_flattened.groupby('Page')['Visits']
df_median = visits_by_page.median().to_frame('median')
df_mean = visits_by_page.mean().to_frame('mean')

# Attach both statistics to every long-format row (Page becomes the index).
train_flattened = (
    train_flattened
    .set_index('Page')
    .join(df_mean)
    .join(df_median)
)

In [None]:
# Bring 'Page' back from the index into a regular column.
train_flattened = train_flattened.reset_index(drop=False)

# Calendar features via the vectorised .dt accessor (no per-row apply).
train_flattened['weekday'] = train_flattened['date'].dt.weekday
train_flattened['year'] = train_flattened['date'].dt.year
train_flattened['month'] = train_flattened['date'].dt.month
train_flattened['day'] = train_flattened['date'].dt.day
train_flattened

## Aggregation and Visualization

In [None]:
# Average visits across all pages for each date.
mean_group = train_flattened.groupby('date')['Visits'].mean()

plt.figure(figsize=(50, 8))
plt.plot(mean_group)
plt.title('Time Series - Average')
plt.show()

In [None]:
plt.figure(figsize=(50,8))
# Stored under its own name: assigning the median to `mean_group` would
# overwrite the daily mean that later cells read from that variable.
median_group = train_flattened[['Page','date','Visits']].groupby(['date'])['Visits'].median()
plt.plot(median_group,color = 'r')
plt.title('Time Series - Median')
plt.show()

In [None]:
# Standard deviation of visits across all pages for each date.
std_group = train_flattened.groupby('date')['Visits'].std()

plt.figure(figsize=(50, 8))
plt.plot(std_group, color='g')
plt.title('Time Series - STD')
plt.show()

## ML APPROACH

In [None]:
# One row per date: column 0 is 'date', column 1 is the aggregated 'Visits'.
times_series_means = pd.DataFrame(mean_group).reset_index(drop=False)

# Calendar features taken straight from the datetime column. This replaces
# the str() / split('-') round trip, which produced string-typed parts and
# relied on passing `n` positionally to Series.str.split.
times_series_means['weekday'] = times_series_means['date'].dt.weekday
times_series_means['year'] = times_series_means['date'].dt.year
times_series_means['month'] = times_series_means['date'].dt.month
times_series_means['day'] = times_series_means['date'].dt.day
times_series_means.head()



In [None]:
times_series_means.reset_index(drop=True,inplace=True)

def lag_func(data,lag):
    """Return a copy of ``data`` with ``lag1`` .. ``lag<lag>`` columns added.

    The new columns are the columns of ``lagmat(data['diff'], lag)``, in
    order; ``data`` itself is not modified.
    """
    lag_matrix = lagmat(data['diff'], lag)
    lagged = data.copy()
    for position in range(lag):
        # Column `position` of the matrix becomes 'lag<position + 1>'.
        lagged['lag%d' % (position + 1)] = lag_matrix[:, position]
    return lagged


def diff_creation(data):
    """Add a ``'diff'`` column holding the first difference of column 1.

    ``data`` is modified in place and also returned. The first row of
    ``'diff'`` is NaN; every other row is that row's value in the second
    column minus the previous row's value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose second column (position 1) holds the values to difference.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Series.diff() works for any index, unlike a label slice `loc[1:]`,
    # which assumes the default 0..n-1 index.
    data['diff'] = data.iloc[:, 1].diff()
    return data

df_count = diff_creation(times_series_means)

lag = 7
lagged = lag_func(df_count,lag)
last_date = lagged['date'].max()

In [None]:
lagged.head()


In [None]:
def train_test(data_lag, n_lags=None, split=0.70):
    """Build feature/target frames and split them chronologically.

    Features are ``lag1`` .. ``lag<n_lags>``, ``weekday`` and ``day``; the
    target is ``diff``. The first ``n_lags + 1`` rows are discarded before
    splitting.

    Parameters
    ----------
    data_lag : pandas.DataFrame
        Frame containing the lag, ``weekday``, ``day`` and ``diff`` columns.
    n_lags : int, optional
        Number of lag columns to use. Defaults to the notebook-level ``lag``.
    split : float, default 0.70
        Fraction of the usable rows placed in the training part.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test, xt, yt)`` where ``xt`` / ``yt``
        are the unsplit features and target.

    Raises
    ------
    ValueError
        If ``split`` is outside ``[0, 1]``.
    """
    if n_lags is None:
        n_lags = lag  # fall back to the notebook-level setting
    if not 0 <= split <= 1:
        raise ValueError('split must be between 0 and 1')

    xc = ['lag%d' % i for i in range(1, n_lags + 1)] + ['weekday'] + ['day']
    usable = data_lag.iloc[(n_lags + 1):]
    xt = usable[xc]
    yt = usable['diff']

    # Earlier rows train, later rows test; no shuffling.
    isplit = int(len(xt) * split)
    x_train, y_train = xt[:isplit], yt[:isplit]
    x_test, y_test = xt[isplit:], yt[isplit:]
    return x_train, y_train, x_test, y_test, xt, yt

x_train,y_train,x_test,y_test,xt,yt = train_test(lagged)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor, BaggingRegressor,AdaBoostRegressor

from sklearn.metrics import mean_absolute_error, r2_score

def modelisation(x_tr,y_tr,x_ts,y_ts,xt,yt,model0,model1):
    """Fit an evaluation model on the training part and a second model on all data.

    ``model0`` is fitted on ``(x_tr, y_tr)`` and scored on ``(x_ts, y_ts)``;
    the mean absolute error is printed. ``model1`` is then fitted on the
    full ``(xt, yt)``.

    Returns
    -------
    tuple
        ``(model1, prediction, model0)`` where ``prediction`` is ``model0``'s
        output on ``x_ts``.
    """
    model0.fit(x_tr, y_tr)

    # Predict once and reuse the result for scoring.
    prediction = model0.predict(x_ts)
    mae = mean_absolute_error(y_ts.values, prediction)
    print('mae with 70% data', mae)

    model1.fit(xt, yt)  # with all data

    return model1, prediction, model0

model0 = AdaBoostRegressor(n_estimators=5000,random_state=42,learning_rate=0.01)
model1 = AdaBoostRegressor(n_estimators=5000,random_state=42,learning_rate=0.01)

clr,prediction,clr0 = modelisation(x_train,y_train,x_test,y_test,xt,yt,model0,model1)

In [None]:
def pred_df(data,number_of_days):
    """Return the consecutive dates that follow the last date in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``'date'`` column; its last row gives the
        starting point.
    number_of_days : int
        Number of dates to append after the first following day.

    Returns
    -------
    pandas.DataFrame
        Single ``'date'`` column with ``number_of_days + 1`` daily rows,
        starting one day after the last date in ``data``.
    """
    last_date = data['date'].iloc[-1]
    # One initial day plus `number_of_days` more: the row count the original
    # loop was written to produce before a misspelt variable discarded every
    # appended row.
    future_dates = pd.date_range(
        start=last_date + pd.Timedelta(days=1),
        periods=number_of_days + 1,
        freq='D',
    )
    return pd.DataFrame({'date': future_dates})

data_to_pred = pred_df(df_count,30)
data_to_pred

In [None]:
def initialisation(data_lag,data_pred,model,xtrain,ytrain,number_of_days):
    """Fit ``model`` and roll it forward one day at a time.

    For each of the first ``number_of_days`` dates in ``data_pred`` a feature
    row is built from the previous row (actual or predicted), the model
    predicts the next ``diff``, and the predicted ``Visits`` is the previous
    ``Visits`` plus that ``diff``.

    Parameters
    ----------
    data_lag : pandas.DataFrame
        History with ``date``, ``Visits``, ``diff`` and ``lag<k>`` columns.
        Not modified.
    data_pred : pandas.DataFrame
        Frame with a ``'date'`` column of dates to forecast. Not modified.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    xtrain, ytrain
        Training features and target. The columns of ``xtrain`` define the
        feature row passed to ``predict``.
    number_of_days : int
        Number of forecast steps.

    Returns
    -------
    pandas.DataFrame
        ``data_lag`` followed by one row per forecast day, re-indexed from 0.

    Raises
    ------
    ValueError
        If ``data_pred`` has fewer than ``number_of_days`` rows, or a column
        of ``xtrain`` cannot be derived for a future date.
    """
    model.fit(xtrain, ytrain)
    feature_cols = list(xtrain.columns)

    if len(data_pred) < number_of_days:
        raise ValueError(
            'data_pred holds %d dates but %d forecast days were requested'
            % (len(data_pred), number_of_days)
        )

    # Lag columns present in the history, e.g. [1, 2, ..., 7].
    lag_numbers = sorted(
        int(col[3:]) for col in data_lag.columns
        if isinstance(col, str) and col.startswith('lag') and col[3:].isdigit()
    )

    previous = data_lag.iloc[-1].to_dict()
    new_rows = []
    for step in range(number_of_days):
        next_date = pd.Timestamp(data_pred['date'].iloc[step])
        row = {'date': next_date}

        # Shift the lags by one day: yesterday's diff becomes lag1,
        # yesterday's lag1 becomes lag2, and so on.
        for k in lag_numbers:
            source = 'diff' if k == 1 else 'lag%d' % (k - 1)
            row['lag%d' % k] = previous.get(source, float('nan'))

        # Calendar fields, filled in only where the history or the model
        # actually has such a column.
        calendar = {
            'weekday': next_date.weekday(),
            'day': next_date.day,
            'month': next_date.month,
            'year': next_date.year,
        }
        for name, value in calendar.items():
            if name in data_lag.columns or name in feature_cols:
                row[name] = value

        missing = [col for col in feature_cols if col not in row]
        if missing:
            raise ValueError('cannot build model features: %s' % missing)

        # Same columns, same order as the training data.
        to_predict = pd.DataFrame(
            [[row[col] for col in feature_cols]],
            columns=feature_cols,
            dtype=float,
        )
        row['diff'] = float(model.predict(to_predict)[0])
        row['Visits'] = previous['Visits'] + row['diff']

        new_rows.append(row)
        previous = row

    if not new_rows:
        return data_lag.reset_index(drop=True)
    # Single concat at the end instead of growing the frame inside the loop.
    return pd.concat([data_lag, pd.DataFrame(new_rows)], ignore_index=True)


model_fin = AdaBoostRegressor(n_estimators=5000,random_state=42,learning_rate=0.01)


        
        

In [None]:
lagged = initialisation(lagged,data_to_pred,model_fin,xt,yt,30)

In [None]:
model_fin = AdaBoostRegressor(n_estimators=5000,random_state=42,learning_rate=0.01)

In [None]:
model_fin.fit(xt,yt)

# NOTE(review): nothing in this loop depends on `i` and no prediction is made;
# it looks like an unfinished copy of the body of `initialisation`. It is kept
# so the notebook-level names (lag1..lag8, weekday) are still defined.
for i in range(30):
    # Read the most recent row once instead of calling tail(1) per column.
    last_row = lagged.iloc[-1]
    lag1 = last_row['diff']
    lag2 = last_row['lag1']
    lag3 = last_row['lag2']
    lag4 = last_row['lag3']
    lag5 = last_row['lag4']
    lag6 = last_row['lag5']
    lag7 = last_row['lag6']
    lag8 = last_row['lag7']

    # `.dt.weekday` yields the weekday number; `lambda x: x.weekday` (without
    # the call) stored bound methods instead of integers.
    data_to_pred['weekday'] = data_to_pred['date'].dt.weekday
    weekday = data_to_pred['weekday'].iloc[0]
        

In [None]:
train.shape

In [None]:
df_eda = train.iloc[:2000,:50]
df_eda.head()

In [None]:
df_eda.shape

In [None]:
from dataprep import eda as dp_eda

In [None]:
report = dp_eda.create_report(df_eda)
report