### libraries

In [1]:
import numpy as np 
import pandas as pd
import scipy.sparse
import matplotlib.pyplot as plt
%matplotlib inline


### read data

In [24]:
train = pd.read_csv('train.csv.zip', parse_dates=['DATE'])
train.head()

Unnamed: 0,DATE,ATM_ID,CLIENT_OUT
0,2015-01-01,0,91600
1,2015-01-02,0,136500
2,2015-01-03,0,335400
3,2015-01-04,0,379000
4,2015-01-05,0,344100


### generate features

In [3]:
def DivideBySTD(df):
    stds = df.groupby('ATM_ID')['CLIENT_OUT'].std()

    std_df = pd.DataFrame(stds.index)
    std_df['std'] = np.array(stds)
    
    df.CLIENT_OUT = df.CLIENT_OUT / pd.merge(df, std_df, on='ATM_ID', how='outer')['std']
    
    return df, stds


def MultiplyBySTD(df, stds):
    df.CLIENT_OUT = df.CLIENT_OUT * pd.merge(df, std_df, on='ATM_ID', how='outer')['std']
    
    return df

In [25]:
df = train

## We add features to df

df['day_of_week'] = df.DATE.dt.dayofweek
df['day_of_month'] = df.DATE.dt.day
df['day_of_year'] = df.DATE.dt.dayofyear
   
condition_holiday = (
    (df.DATE.dt.month == 1) & ((df.DATE.dt.day >= 1) & (df.DATE.dt.day <= 8)) # winter
    | (df.DATE.dt.month == 2) & ((df.DATE.dt.day >= 21) & (df.DATE.dt.day <= 25)) # around 23 feb
    | (df.DATE.dt.month == 3) & ((df.DATE.dt.day >= 6) & (df.DATE.dt.day <= 10)) # around 8 mar
    | (df.DATE.dt.month == 5) & ((df.DATE.dt.day >= 1) & (df.DATE.dt.day <= 10)) # may
    | (df.DATE.dt.month == 6) & ((df.DATE.dt.day >= 10) & (df.DATE.dt.day <= 14)) # around 12 june
    | (df.DATE.dt.month == 11) & ((df.DATE.dt.day >= 2) & (df.DATE.dt.day <= 6)) # around 4 nov
)   
df = df[-condition_holiday]

# divide by std
df, stds = DivideBySTD(df)

# rolling mean
df['rolling_mean_7_days'] = df.CLIENT_OUT.rolling(7).mean()
df['rolling_mean_30_days'] = df.CLIENT_OUT.rolling(30).mean()

# convert to dummies
list_dummies = ['day_of_week', 
                'day_of_month',
                'day_of_year']
df = pd.get_dummies(df, columns=list_dummies)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In the end:

In [5]:
# # multiply back by std
# df = MultiplyBySTD(df, stds)

In [26]:
df.head(20)

Unnamed: 0,DATE,ATM_ID,CLIENT_OUT,rolling_mean_7_days,rolling_mean_30_days,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,...,day_of_year_357,day_of_year_358,day_of_year_359,day_of_year_360,day_of_year_361,day_of_year_362,day_of_year_363,day_of_year_364,day_of_year_365,day_of_year_366
8,2015-01-09,0,1.902846,,,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,2015-01-10,0,1.745464,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,2015-01-11,0,1.342988,,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,2015-01-12,0,3.990352,,,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,2015-01-13,0,4.247619,,,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,2015-01-14,0,2.256639,,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
14,2015-01-15,0,2.758581,2.606356,,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
15,2015-01-16,0,1.750081,2.584532,,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
16,2015-01-17,0,0.744519,2.44154,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,2015-01-18,0,0.795301,2.363299,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
