In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build every file path found under the Kaggle input directory, then print them one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

import matplotlib.pyplot as plt
from scipy.stats import skew
# pearsonr is exported by the public scipy.stats namespace; the scipy.stats.stats
# path is a deprecated private module and should not be imported from directly.
from scipy.stats import pearsonr
# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(10)

# Select the 'retina' figure format for the inline backend (the trailing note suggests 'png' as the alternative).
%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

In [1]:
# Load the ATM training and test sets from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/atm-data/ATM_training.csv', '../input/atm-data/ATM_test.csv')
)

# Exploratory Data Analysis

In [1]:
# Distribution (histogram + density) of the dependent variable, Withdraw.
sns.distplot(train_df.loc[:, 'Withdraw']);

In [1]:
# Scatter plot of Withdraw against Shops, with the y-axis clipped to 0-150.
var = 'Shops'
data = train_df[['Withdraw', var]]
data.plot.scatter(x=var, y='Withdraw', ylim=(0, 150));

In [1]:
# Box plot of Withdraw grouped by ATMs, y-axis limited to 0-150.
var = 'ATMs'
data = train_df[['Withdraw', var]]
f, ax = plt.subplots(figsize=(8, 6))
# `fig` is the Axes returned by seaborn (the name is kept from the original cell).
fig = sns.boxplot(x=var, y="Withdraw", data=data, ax=ax)
fig.set_ylim(0, 150);

In [1]:
# Box plot of Withdraw grouped by Weekday, y-axis limited to 0-150.
var = 'Weekday'
data = train_df[['Withdraw', var]]
f, ax = plt.subplots(figsize=(8, 6))
# `fig` is the Axes returned by seaborn (the name is kept from the original cell).
fig = sns.boxplot(x=var, y="Withdraw", data=data, ax=ax)
fig.set_ylim(0, 150);

In [1]:
# Report skewness and kurtosis of the target in one call, one statistic per line.
print(
    "Skewness: %f" % train_df['Withdraw'].skew(),
    "Kurtosis: %f" % train_df['Withdraw'].kurt(),
    sep="\n",
)

In [1]:
# Pairwise correlation matrix of the training columns, drawn as a square heatmap.
corrmat = train_df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax);

In [1]:
# Heatmap restricted to the k columns most correlated with Withdraw.
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'Withdraw')['Withdraw'].index
# rowvar=False treats each column as a variable (same as transposing first).
cm = np.corrcoef(train_df[cols].values, rowvar=False)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

In [1]:
# Pairwise scatter plots of the features and the target.
sns.set()
cols = ['Shops', 'ATMs', 'Downtown', 'Weekday', 'Center', 'High', 'Withdraw']
# `height` is the current name of the facet-size argument; `size` is the renamed, deprecated spelling.
sns.pairplot(train_df[cols], height=2.5)
plt.show();

In [1]:
# Per-column missing-value counts and fractions, largest first.
null_mask = train_df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [1]:
# Standardise the target and print its 10 lowest and 10 highest z-scores to spot outliers.
from sklearn.preprocessing import StandardScaler

# Select the column as a one-column DataFrame to get an (n, 1) array; indexing a
# Series with [:, np.newaxis] is not supported by current pandas.
saleprice_scaled = StandardScaler().fit_transform(train_df[['Withdraw']].values)
# Sort once and slice both tails from the same ordering.
sorted_scaled = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = sorted_scaled[:10]
high_range = sorted_scaled[-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

In [1]:
# Histogram with a fitted normal curve, followed by a normal probability (Q-Q) plot of the target.
# Only `norm` is taken from scipy.stats; the `stats` module itself comes from `scipy`
# (importing `stats` from scipy.stats would pull in the deprecated scipy.stats.stats module).
from scipy.stats import norm
from scipy import stats

sns.distplot(train_df['Withdraw'], fit=norm);
# New figure so the probability plot does not draw over the histogram.
fig = plt.figure()
res = stats.probplot(train_df['Withdraw'], plot=plt)

In [1]:
# Preview the first rows of the training frame.
train_df.head()

In [1]:
# Side-by-side histograms of the raw target and its log1p transform.
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
prices = train_df[["Withdraw"]].assign(
    **{"log(Withdraw + 1)": np.log1p(train_df["Withdraw"])}
)
prices.hist()

In [1]:
# Stack the feature columns (Shops through High) of train and test so they are transformed together.
all_data = pd.concat(
    [frame.loc[:, 'Shops':'High'] for frame in (train_df, test_df)]
)

In [1]:
# Log-transform the target in place.
# NOTE(review): this overwrites train_df["Withdraw"], so re-running the cell applies log1p again.
train_df["Withdraw"] = np.log1p(train_df["Withdraw"])

# Log-transform skewed numeric features (skewness measured on the training rows only).
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewness = train_df[numeric_feats].apply(lambda column: skew(column.dropna()))
# Keep only the features whose skewness exceeds 0.75.
skewed_feats = skewness[skewness > 0.75].index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [1]:
# One-hot encode any categorical columns of the combined feature frame.
all_data = pd.get_dummies(all_data)


In [1]:
# Preview the transformed feature frame.
all_data.head()

# Ridge Regression

In [1]:
# Split the combined feature frame back into train/test parts for sklearn.
n_train = train_df.shape[0]
X_train, X_test = all_data[:n_train], all_data[n_train:]
# Training target is the log1p-transformed Withdraw column of train_df.
y = train_df.Withdraw
# Test target is taken from the last column of the test frame.
y_test = test_df.iloc[:, -1]

In [1]:
# Import libraries for Logic-based algorithms
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error, explained_variance_score
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, Lasso
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
from sklearn.kernel_ridge import KernelRidge

def mse_cv(model):
    """Return the per-fold mean squared error of `model` under 5-fold cross-validation.

    The model is scored on the notebook-level `X_train` / `y`. sklearn reports
    negated MSE for this scorer, so the sign is flipped before returning.
    """
    neg_mse = cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv=5)
    return -neg_mse

In [1]:
# Cross-validate ridge regression over a grid of alphas so the mean MSE
# per hyperparameter can be plotted.
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [mse_cv(Ridge(alpha=ridge_alpha)).mean() for ridge_alpha in alphas]

In [1]:
# Plot the mean cross-validated MSE against each alpha.
cv_ridge = pd.Series(data=cv_ridge, index=alphas)
cv_ridge.plot(title="Hypertuning-Ridge Regression")
plt.xlabel(xlabel="alpha")
plt.ylabel(ylabel="MSE")

In [1]:
# Lowest mean cross-validated MSE over the ridge alpha grid.
cv_ridge.min()

In [1]:
# Fit RidgeCV, which picks alpha from the candidate grid by its built-in cross-validation.
ridge_alpha_grid = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
model_ridge = RidgeCV(alphas=ridge_alpha_grid).fit(X_train, y)


In [1]:
# predict on test set
# expm1 inverts the log1p applied to the training target, so predictions are back on the original Withdraw scale.
pred_ridge_all =  np.expm1(model_ridge.predict(X_test))


In [1]:
# Test-set MSE of the ridge predictions against the true values.
mse = MSE(y_true=y_test, y_pred=pred_ridge_all)
print("MSE : % f" % mse)

In [1]:
# Check differences between true value and predicted value
# Signed error: positive where the model under-predicts.
difference = y_test - pred_ridge_all

In [1]:
# Tabulate true values, ridge predictions and their signed error side by side.
data = {
    'y_test': y_test,
    'values': pred_ridge_all,
    'Error': difference,
}
df = pd.DataFrame(data)
df

In [1]:
# Smallest and largest absolute ridge error on the test set.
df['Error'].abs().min(), df['Error'].abs().max()

In [1]:
# Ridge coefficients labelled by feature name.
coef = pd.Series(model_ridge.coef_, index=X_train.columns)
n_nonzero = sum(coef != 0)
n_zero = sum(coef == 0)
print("Ridge picked " + str(n_nonzero) + " variables and eliminated the other " + str(n_zero) + " variables")
# The 10 smallest coefficients (all of them when there are fewer than 10 features).
imp_coef = coef.sort_values().head(10)
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot.barh()
plt.title("Coefficients in the Ridge Model")

# LASSO

In [1]:
# Cross-validate LASSO over a grid of alphas so the mean MSE
# per hyperparameter can be plotted.
alphas = [1, 0.1, 0.001, 0.0005]
cv_lasso = [mse_cv(Lasso(alpha=lasso_alpha)).mean() for lasso_alpha in alphas]

In [1]:
# Plot the mean cross-validated MSE against each alpha.
cv_lasso = pd.Series(data=cv_lasso, index=alphas)
cv_lasso.plot(title="Hypertuning-Lasso")
plt.xlabel(xlabel="alpha")
plt.ylabel(ylabel="MSE")

In [1]:
# Lowest mean cross-validated MSE over the LASSO alpha grid.
cv_lasso.min()

In [1]:
# Fit LassoCV, which selects alpha from the candidate grid by its built-in cross-validation.
lasso_alpha_grid = [1, 0.1, 0.001, 0.0005]
model_lasso = LassoCV(alphas=lasso_alpha_grid).fit(X_train, y)


In [1]:
# Mean 5-fold cross-validated MSE of the LassoCV estimator, on the log1p-transformed target.
mse_cv(model_lasso).mean()

In [1]:
# LASSO coefficients labelled by feature name.
coef = pd.Series(model_lasso.coef_, index = X_train.columns)

In [1]:
# Count the features LASSO kept (non-zero coefficient) and the ones it zeroed out.
n_kept = sum(coef != 0)
n_dropped = sum(coef == 0)
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " + str(n_dropped) + " variables")

In [1]:
# The 10 smallest LASSO coefficients (all of them when there are fewer than 10 features).
imp_coef = coef.sort_values().head(10)

In [1]:
# Horizontal bar chart of the selected LASSO coefficients.
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot.barh()
plt.title("Coefficients in the Lasso Model")

In [1]:
# Residuals of the LASSO fit on the training data (log1p target scale), plotted against predictions.
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)

preds = pd.DataFrame({"preds": model_lasso.predict(X_train), "true": y}).assign(
    residuals=lambda frame: frame["true"] - frame["preds"]
)
preds.plot.scatter(x="preds", y="residuals")

In [1]:
# Predict using LASSO on test set
# expm1 inverts the log1p applied to the training target, so predictions are back on the original Withdraw scale.
lasso_preds = np.expm1(model_lasso.predict(X_test))

In [1]:
# Test-set MSE of the LASSO predictions against the true values.
mse = MSE(y_true=y_test, y_pred=lasso_preds)
print("MSE : % f" % mse)

In [1]:
# Smallest and largest absolute LASSO error on the test set.
min(abs(y_test - lasso_preds)), max(abs(y_test - lasso_preds))

# ANN

In [1]:
# Import for ANN
from keras.layers import Dense
from keras.models import Sequential
from keras.regularizers import l1
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [1]:
# Rebuild the train/test matrices from the combined feature frame for the neural network.
n_train = train_df.shape[0]
X_train, X_test = all_data[:n_train], all_data[n_train:]
# Training target is the log1p-transformed Withdraw column of train_df.
y_train = train_df.Withdraw
# Test target is taken from the last column of the test frame.
y_test = test_df.iloc[:, -1]

In [1]:
# Standardise the features and the target for the neural network.
# The target is converted to a plain (n, 1) float NumPy array: sklearn scalers expect
# 2-D input, and reshaping a pandas extension array (pd.array) depends on the pandas version.
y_train_2d = np.asarray(y_train, dtype=float).reshape(-1, 1)
scalerX = StandardScaler().fit(X_train)
scalery = StandardScaler().fit(y_train_2d)
# NOTE(review): X_train / y_train are overwritten with their scaled versions, so
# re-running this cell would refit the scalers on already-scaled data.
X_train = scalerX.transform(X_train)
y_train = scalery.transform(y_train_2d)

In [1]:
# Hold out 20% of the scaled training data for validation.
# The split must use the standardised target `y_train` (not the unscaled `y`), because
# the network's predictions are later mapped back with `scalery.inverse_transform`.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state = 3, train_size = 0.8)

In [1]:
# Fully connected regression network: four 19-unit ReLU layers followed by a
# single linear output unit. The input width is not declared on the first layer.
model = Sequential([
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(1),
])

# Adam optimiser minimising mean squared error.
model.compile(optimizer='adam', loss='mse')

In [1]:
# Train the network, tracking loss on the held-out validation split.
# NOTE(review): no `epochs` / `batch_size` are passed, so Keras's defaults apply — confirm that is enough training.
hist = model.fit(X_train, y_train, validation_data = (X_val, y_val))

In [1]:
# Histogram of the raw network outputs on the test set.
# The network was trained on scalerX-standardised features, so the test features
# must go through the same scaler before prediction.
pd.Series(model.predict(scalerX.transform(X_test))[:,0]).hist()


In [1]:
# Predict on the test set. The network was trained on scalerX-standardised features,
# so the test features must go through the same scaler first.
predictions = pd.Series(model.predict(scalerX.transform(X_test))[:,0])


In [1]:
# Map network outputs back to the original Withdraw scale:
#   1. reshape to (n, 1) because sklearn scalers require 2-D input,
#   2. undo the target standardisation with scalery,
#   3. undo the log1p applied to the training target with expm1.
# The result keeps the (n, 1) column shape.
# NOTE(review): assumes the network was trained on the scalery-standardised target — confirm.
predictions = np.expm1(scalery.inverse_transform(np.asarray(predictions).reshape(-1, 1)))

In [1]:
# Error metrics of the network predictions on the test set.
ann_mse = MSE(y_test, predictions)

print('MAE: ', mean_absolute_error(y_test, predictions))
print('MSE: ', ann_mse)
print('RMSE: ', np.sqrt(ann_mse))
print('Variance Regression Score: ', explained_variance_score(y_test, predictions))

# NOTE(review): train_df['Withdraw'] was overwritten with its log1p transform earlier in
# the notebook, so these statistics are on the log scale.
print('\n\nDescriptive Statistics:\n', train_df['Withdraw'].describe())

In [1]:
# Left panel: distribution of prediction errors. Right panel: predictions against the perfect-fit line.
f, axes = plt.subplots(1, 2,figsize=(15,5))

# Flatten both sides so the subtraction is element-wise for any test-set size;
# subtracting a 1-D array from a hard-coded (20, 1) column would broadcast to a 20x20 matrix.
y_true = np.ravel(y_test)
y_pred = np.ravel(predictions)
errors = y_true - y_pred
sns.distplot(errors, ax=axes[0])

# Our model predictions
axes[1].scatter(y_true, y_pred)

# Perfect predictions
axes[1].plot(y_true, y_true, 'r')

sns.despine(left=True, bottom=True)
axes[0].set(xlabel='Error', ylabel='', title='Error Histogram')
axes[1].set(xlabel='Test True Y', ylabel='Model Predictions', title='Model Predictions vs Perfect Fit')