In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
import datetime, pytz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, GroupKFold, TimeSeriesSplit

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsRegressor, KernelDensity, KDTree
from sklearn.metrics import *

import lightgbm as lgb
import xgboost as xgb

import sys, os
import random 

if not sys.warnoptions:
    import warnings
    warnings.filterwarnings("ignore")
    
from IPython.display import display, HTML, IFrame

# Widen pandas' display limits so large frames and long strings are shown in full.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 300
pd.options.display.max_colwidth = 400


def set_seed(seed=4242):
    """Seed Python's `random` module and NumPy's global RNG, and set PYTHONHASHSEED.

    Args:
        seed: Integer seed passed to both generators and written (as a string)
            to the ``PYTHONHASHSEED`` environment variable. Defaults to 4242.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once with the default value as soon as the cell runs.
set_seed()


In [None]:
# Load the cinema ticket dataset and preview its first rows.
data = pd.read_csv(filepath_or_buffer='../input/cinema-ticket/cinemaTicket_Ref.csv')
data.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes of the loaded frame.
data.info()

In [None]:
# Parse the `date` column and derive calendar features from it.
data['date'] = pd.to_datetime(data['date'])
data['month'] = data['date'].dt.month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` is its replacement (ISO-8601 week number).
data['week'] = data['date'].dt.isocalendar().week
data['day'] = data['date'].dt.day
data['quarter'] = data['date'].dt.quarter
data['weekDay'] = data['date'].dt.weekday

### Total Sales per date

In [None]:
plt.style.use('fivethirtyeight')
# Sum `total_sales` over all rows sharing a date and draw the result as a line.
(
    data
    .groupby('date')['total_sales']
    .sum()
    .plot(color='grey', figsize=(18, 5))
)

### Total sale per day by cinema

In [None]:
plt.style.use('fivethirtyeight')
# Sum `total_sales` per cinema code and draw the totals as a line.
(
    data
    .groupby('cinema_code')['total_sales']
    .sum()
    .plot(color='grey', figsize=(22, 4))
)

In [None]:
plt.style.use('fivethirtyeight')
# Keep only the cinemas whose summed sales exceed 9e9, then plot them as bars.
cg = (
    data
    .groupby('cinema_code', as_index=False)['total_sales']
    .sum()
    .loc[lambda totals: totals['total_sales'] > 9.000000e+09]
)
plt.figure(figsize=(24, 5))
sns.barplot(x='cinema_code', y='total_sales', data=cg, palette='gray')

In [None]:
plt.style.use('fivethirtyeight')
# Sum `total_sales` per film code and draw the totals as a line.
(
    data
    .groupby('film_code')['total_sales']
    .sum()
    .plot(color='grey', figsize=(18, 4))
)

In [None]:
plt.style.use('fivethirtyeight')
# Keep only the films whose summed sales exceed 9e9, then plot them as bars.
fg = (
    data
    .groupby('film_code', as_index=False)['total_sales']
    .sum()
    .loc[lambda totals: totals['total_sales'] > 9.000000e+09]
)
plt.figure(figsize=(22, 6))
sns.barplot(x='film_code', y='total_sales', data=fg, palette='gray')

In [None]:
# Preview the frame again, now including the derived calendar columns.
data.head()

In [None]:
# Summary statistics of the `total_sales` column.
data['total_sales'].describe()

>#### We can plot the correlation coefficient for each lag variable. This can very quickly give an idea of which lag variables may be good candidates for use in a predictive model and how the relationship between the observation and its historic values changes over time. We could manually calculate the correlation values for each lag variable and plot the result.

In [None]:
# Scatter each `total_sales` value against the value one row earlier.
plt.style.use('ggplot')
plt.figure(figsize=(7, 4))
pd.plotting.lag_plot(series=data['total_sales'], lag=1)

In [None]:
sns.set()
# The bare 'seaborn' style was renamed 'seaborn-v0_8' in matplotlib 3.6 and the
# old name was removed in 3.8; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.figure(figsize=(8, 4))
# Scatter each `total_sales` value against the value two rows earlier.
pd.plotting.lag_plot(data['total_sales'], lag=2)

In [None]:
sns.set()
# The bare 'seaborn' style was renamed 'seaborn-v0_8' in matplotlib 3.6 and the
# old name was removed in 3.8; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.figure(figsize=(7, 4))
# Scatter each `total_sales` value against the value three rows earlier.
pd.plotting.lag_plot(data['total_sales'], lag=3)

>#### Pandas provides a built-in plot called the autocorrelation plot() function.
>The plot provides the lag number along the x-axis and the correlation coefficient value
between -1 and 1 on the y-axis. The plot also includes solid and dashed lines that indicate the
95% and 99% confidence interval for the correlation values. Correlation values above these lines
are more significant than those below the line, providing a threshold or cutoff for selecting more
relevant lag values.

In [None]:
# Autocorrelation of `total_sales` across all lags, in row order.
sns.set()
plt.style.use('ggplot')
plt.figure(figsize=(10, 8))
pd.plotting.autocorrelation_plot(series=data['total_sales'])

In [None]:
# Annotated heatmap of the pairwise correlations between the frame's columns.
plt.figure(figsize=(15, 10))
sns.heatmap(
    data=data.corr(),
    cmap='hot',
    annot=True,
)

In [None]:
# Remove the `week` column from the frame in place.
data.drop(columns='week', inplace=True)


In [None]:
# Aggregate sales to one row per (month, cinema_code), ordered by those keys.
x = (
    data
    .groupby(['month', 'cinema_code'])['total_sales']
    .sum()
    .rename('total_sales')
    .reset_index()
    .sort_values(['month', 'cinema_code'])
)

### Building rolling window and lag features

In [None]:
def build_lagandroll(df, target, width=(2, 3)):
    """Append lag and rolling-window features of `target` to a copy of `df`.

    For every size ``c`` in `width` the result gains, in this order, the
    columns ``lag_c`` (``target.shift(c)``) and ``rollc_min`` / ``rollc_mean``
    / ``rollc_max`` (statistics of ``target.rolling(window=c)``).

    Args:
        df: Feature frame. It is not modified.
        target: Series the features are derived from; aligned to `df` by index.
        width: Iterable of positive integer lag offsets / window sizes.

    Returns:
        A new DataFrame holding `df`'s columns plus four feature columns per
        entry of `width`.

    Note:
        Each rolling window ends on the current row, so every ``roll*``
        feature includes that row's own `target` value (with ``c == 1`` it is
        exactly that value). Shift `target` before calling if the features
        must only see past observations.
    """
    # Work on a copy: assigning the first lag column directly would otherwise
    # add it to the caller's frame as a side effect.
    df = df.copy()

    for c in width:
        df['lag_' + str(c)] = target.shift(c)

        window = target.rolling(window=c)
        rolled = pd.concat([window.min(), window.mean(), window.max()], axis=1)
        rolled.columns = ['roll' + str(c) + '_min', 'roll' + str(c) + '_mean', 'roll' + str(c) + '_max']
        df = pd.concat([df, rolled], axis=1)
    return df

In [None]:
# Split the target off the aggregated frame, then derive lag/rolling features from it.
target = x.pop('total_sales')
df = build_lagandroll(x, target, width=[1, 2, 3])

display(df.head(10))
(df.shape, df['month'].unique().size)

In [None]:

# 'seaborn-poster' was renamed 'seaborn-v0_8-poster' in matplotlib 3.6 and the
# old name was removed in 3.8; use whichever name this installation provides.
plt.style.use('seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in plt.style.available else 'seaborn-poster')
# Distribution of the untransformed target.
target.hist(figsize=(8, 4), bins=50, color='teal')


In [None]:
# Apply log(1 + x) to the target and plot the transformed distribution.
# NOTE: re-running this cell transforms the already-transformed target again.
target = np.log1p(target)
target.hist(bins=50, color='teal', figsize=(8, 4))

In [None]:
# Re-attach the (log-transformed) target so features and label are filtered together.
df = pd.concat([df, target], axis=1)

# Months up to and including 10 form the training set ...
train = df[df['month'] <= 10].copy()
y_train = train.pop('total_sales')

# ... and the later months form the validation set.
val = df[df['month'] > 10].copy()
y_val = val.pop('total_sales')

(train.shape, y_train.shape, val.shape, y_val.shape)

In [None]:
# Autocorrelation of the training target, in row order.
plt.figure(figsize=(10, 8))
pd.plotting.autocorrelation_plot(series=y_train)

#### Now we have better autocorrelation

In [None]:
# Remove the `month` column from both feature frames before modelling.
train = train.drop(columns='month')
val = val.drop(columns='month')



## **LGB**

In [None]:
import lightgbm as lgb

# LightGBM training parameters.
reg_params = {
    # `application` is an alias of `objective`; 'regression_l1' selects L1
    # (absolute-error) loss, which is NOT the default ('regression', L2).
    'application': 'regression_l1',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
    'num_leaf': 20,  # alias of `num_leaves`
    'max_depth': -1,
    #'min_data':10,
    'feature_fraction': 0.85,
    'reg_alpha': 1.2,
    'reg_lambda': 3,
    #'max_bin':350,
    'verbosity': 1,
    'bagging_fraction': 0.85,
    # Was 'bagging_frequency', which LightGBM does not recognise: bagging_freq
    # then stayed at its default of 0 and `bagging_fraction` had no effect.
    'bagging_freq': 2,
}
reg_params['metric'] = ['rmse']

In [None]:
dtrain = lgb.Dataset(train, y_train)
# Tie the validation set to the training set so both share the same feature binning.
dval = lgb.Dataset(val, y_val, reference=dtrain)

# The `verbose_eval` / `early_stopping_rounds` keyword arguments of `lgb.train`
# were removed in LightGBM 4.0; the equivalent callbacks (available since 3.3)
# stop after 100 rounds without improvement and log every 100 rounds.
lgb_model = lgb.train(
    reg_params,
    dtrain,
    num_boost_round=10000,
    valid_sets=[dtrain, dval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ],
)  # , categorical_feature=['cinema_code']

In [None]:
# Pull both importance types ('gain' and 'split') out of the trained LightGBM model.
gain_imp = lgb_model.feature_importance(importance_type='gain')
split_imp = lgb_model.feature_importance(importance_type='split')

# One (feature, imp) table per importance type.
feature_importances_gain = pd.DataFrame({'feature': train.columns, 'imp': gain_imp})
feature_importances_split = pd.DataFrame({'feature': train.columns, 'imp': split_imp})

plt.figure(figsize=(15, 7))
sns.barplot(
    data=feature_importances_gain.sort_values(by='imp', ascending=False),
    x='imp',
    y='feature',
    palette='gray',
)
plt.title('TOP feature importance Gain')

In [None]:
# Same bar chart as for gain, using the 'split' importance table.
plt.figure(figsize=(15, 7))
sns.barplot(
    data=feature_importances_split.sort_values(by='imp', ascending=False),
    x='imp',
    y='feature',
    palette='gray',
)
plt.title('TOP feature importance Split')

In [None]:
# Predict the validation rows using the model's best iteration.
lgb_pred = lgb_model.predict(val, num_iteration=lgb_model.best_iteration)

# Invert the earlier log1p so predictions and actuals are on the original scale.
pred, act = np.expm1(lgb_pred), np.expm1(y_val.values)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 4))
plt.plot(val.index, act, 'k', label = 'Actuals')


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(20, 4))
plt.plot(val.index, pred, 'darkred', label = 'pred')

In [None]:
# 'seaborn-poster' was renamed 'seaborn-v0_8-poster' in matplotlib 3.6 and the
# old name was removed in 3.8; use whichever name this installation provides.
plt.style.use('seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in plt.style.available else 'seaborn-poster')
plt.figure(figsize=(20, 4))
# Thick black line for actuals with the thinner prediction line drawn on top.
plt.plot(val.index, act, 'k', label='Actuals', linewidth=7)
plt.plot(val.index, pred, 'red', label='Predicted', linewidth=2)
# The labels above are only shown once a legend is drawn.
plt.legend()

## **XGB**

In [None]:
import xgboost as xgb

# Wrap the train / validation frames in XGBoost's DMatrix container.
dxtrain = xgb.DMatrix(train, label=y_train)
dxtest = xgb.DMatrix(val, label=y_val)

xgb_params = {
    # Squared-error regression. 'reg:linear' is the deprecated former name of
    # this same objective.
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'max_depth': 5,

    'eta': 0.03,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'lambda': 3,
    'alpha': 1.5,
    'gamma': 1,
}
xgb_params['eval_metric'] = ['rmse']
num_rounds = 2000
# The last watchlist entry ('test') is the one early stopping monitors.
watchlist = [(dxtrain, 'train'), (dxtest, 'test')]
model = xgb.train(xgb_params, dxtrain, num_rounds, watchlist, verbose_eval=100, early_stopping_rounds=100)

In [None]:
# `Booster.best_ntree_limit` and `predict(ntree_limit=...)` were removed in
# XGBoost 2.0. `best_iteration` is zero-based, so best_iteration + 1 rounds
# reproduce the old limit for this single-tree-per-round gbtree model.
best_rounds = model.best_iteration + 1
print(best_rounds)
xgb_pred = model.predict(dxtest, iteration_range=(0, best_rounds))

In [None]:
# Invert the earlier log1p so predictions and actuals are on the original scale.
xgb_predexp, act = np.expm1(xgb_pred), np.expm1(y_val.values)

In [None]:
# Display the XGBoost predictions after the expm1 back-transform.
xgb_predexp

In [None]:
# Display the validation actuals after the expm1 back-transform.
act

In [None]:
# 'seaborn-poster' was renamed 'seaborn-v0_8-poster' in matplotlib 3.6 and the
# old name was removed in 3.8; use whichever name this installation provides.
plt.style.use('seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in plt.style.available else 'seaborn-poster')

plt.figure(figsize=(20, 6))
# Thick black line for actuals with the thinner prediction line drawn on top.
plt.plot(val.index, act, 'k', label='Actuals', linewidth=7)
plt.plot(val.index, xgb_predexp, 'red', label='Predicted', linewidth=2)
# The labels above are only shown once a legend is drawn.
plt.legend()