In [1]:
# Load CSV Data
import csv
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import svm


FILEPATH = 'history.csv'
# Column names, in file order; passed to DictReader so rows are keyed by these names.
HEADERS = ['member_id', 'venue_id', 'day_of_week', 'date', 'month', 'attendance']

# Feature column -> converter applied to the raw CSV string.
# 'day_of_week' is present in the file but is not used as a model input.
FEATURES = {'member_id': int, 'venue_id': int, 'date': int, 'month': int}
LABEL = 'attendance'

# X: one list of converted feature values per data row (in FEATURES order).
# Y: the raw label string per row; it may be blank and is converted to int later.
X, Y = [], []
# newline='' is what the csv module asks for so embedded newlines are parsed correctly.
with open(FILEPATH, 'r', newline='') as f:
    reader = csv.DictReader(f, fieldnames=HEADERS)
    # Field names are supplied explicitly, so the file's own header line is
    # just the first record; skip it (the default avoids StopIteration on an empty file).
    next(reader, None)
    for row in reader:
        X.append([typ(row[key]) for key, typ in FEATURES.items()])
        Y.append(row.get(LABEL))

print("FileRead Done!")
print(np.array(X).shape)

FileRead Done!
(33473, 4)


In [2]:
# Split the data: 70% train / 30% test.
# A fixed random_state makes the split (and everything downstream) reproducible
# across kernel restarts. No stratification here: Y still holds raw CSV strings
# at this point, including a blank label.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [3]:
def to_int(list):
    """Convert raw label values to ints, mapping missing labels to 0.

    The label column contains at least one blank entry, so a plain ``int()``
    over the values fails. A label counts as missing when it is ``None``
    (what ``csv.DictReader`` yields for a short row) or a string that is
    empty or whitespace-only.

    Args:
        list: iterable of label values (strings from the CSV, or ints when
            the cell is re-run on already-converted labels). The parameter
            name shadows the builtin but is kept so existing callers are
            unaffected.

    Returns:
        A new Python list of ints, one per input value, in input order.

    Raises:
        ValueError: if a non-missing value cannot be parsed by ``int()``.
    """
    converted = []
    for raw in list:
        is_missing = raw is None or (isinstance(raw, str) and raw.strip() == "")
        converted.append(0 if is_missing else int(raw))
    return converted
# Turn both label splits from raw CSV strings into ints (blank labels become 0).
Y_train, Y_test = to_int(Y_train), to_int(Y_test)

# Describing the important features
The data has 6 columns:
'member_id', 'venue_id', 'day_of_week', 'date', 'month', 'attendance'
(five feature columns plus the 'attendance' label). We might merge 'date' and 'month' into a single feature.

In [4]:
#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# scatter_matrix lives in pandas.plotting in current pandas; the old
# pandas.tools.plotting path only exists in legacy releases, so fall back to it
# only when the modern import is unavailable.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

In [5]:
%matplotlib inline

In [6]:
# Load data using Panda for easy visualization
import pandas as pd

# Read the same file as the csv-based loader at the top of the notebook;
# reuse FILEPATH so the two loads cannot drift apart.
raw_data = pd.read_csv(FILEPATH)

In [7]:
# Column dtypes and non-null counts (shows which columns have missing values).
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33473 entries, 0 to 33472
Data columns (total 6 columns):
member_id      33473 non-null int64
venue_id       33473 non-null int64
day_of_week    33473 non-null int64
date           33473 non-null int64
month          33473 non-null int64
attendance     33472 non-null float64
dtypes: float64(1), int64(5)
memory usage: 1.5 MB


In [8]:
# (rows, columns) of the loaded frame.
raw_data.shape

(33473, 6)

In [9]:
# Peek at 10 random rows; the fixed seed keeps the displayed sample stable across re-runs.
raw_data.sample(10, random_state=42)

Unnamed: 0,member_id,venue_id,day_of_week,date,month,attendance
29224,182780146,24130924,0,12,2,0.0
25659,173697972,24511188,0,29,5,0.0
29789,182944352,24511188,6,28,1,0.0
741,8893278,24511188,6,11,2,0.0
14055,125870292,24511188,6,12,8,0.0
26503,177738432,24511188,6,5,8,0.0
2140,12441344,24995655,6,29,7,0.0
2161,12674691,24130924,1,4,1,0.0
9391,82971042,24511188,0,29,5,0.0
31328,183337985,24511188,0,10,9,0.0


In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
raw_data.describe()

Unnamed: 0,member_id,venue_id,day_of_week,date,month,attendance
count,33473.0,33473.0,33473.0,33473.0,33473.0,33472.0
mean,118180700.0,24457060.0,2.299585,15.078481,6.383593,0.041079
std,57663960.0,333440.5,2.733479,8.307373,3.172411,0.198476
min,624285.0,23824660.0,0.0,1.0,1.0,0.0
25%,72744210.0,24130920.0,0.0,8.0,4.0,0.0
50%,135234400.0,24511190.0,0.0,16.0,6.0,0.0
75%,172698500.0,24511190.0,6.0,22.0,9.0,0.0
max,184674700.0,25292350.0,6.0,31.0,12.0,1.0


In [11]:
# Number of rows recorded for each member_id.
raw_data["member_id"].value_counts()

143318852    166
57775672     166
135982742    166
182490393    166
72744212     166
60341372     166
11866825     166
47655342     166
158214542    166
61991062     166
141400152    166
129386402    166
155833392    166
8893278      166
16898961     166
156147902    166
130764512    166
168500682    166
112295142    166
160177162    166
138505092    166
96222062     166
183194403    166
182438032    166
183168886    166
134116642    166
182930853    166
42870312     166
131304382    166
94006082     166
            ... 
135234432    166
66032482     166
182944352    166
12674691     166
113404702    166
123480982    166
142974532    166
182691175    166
184538349    166
35150972     166
79330232     166
133970832    166
80157172     166
183048532    166
182706093    166
160774032    166
10094358     166
37140032     166
17489101     166
105783602    166
131493592    166
182692576    166
56781112     166
88397742     166
136430412    166
74055732     166
183435212    166
155316142    1

In [12]:
# Number of rows recorded for each venue_id.
raw_data["venue_id"].value_counts()

24511188    13103
24130924     8888
24995655     3417
24132735     3030
25072251     2412
24446531      808
23824656      606
24389241      404
24130927      202
25292352      201
24546934      201
25187720      201
Name: venue_id, dtype: int64

### Feature Analysis

In [16]:
# Mean attendance for each member_id.
print(raw_data.groupby(['member_id'], as_index=False)['attendance'].mean())

     member_id  attendance
0       624285    0.012048
1      8562235    0.006024
2      8710693    0.012048
3      8731170    0.024096
4      8893278    0.006024
5      9793906    0.072289
6     10094358    0.006024
7     10746668    0.006024
8     10884001    0.018072
9     11258304    0.018072
10    11866825    0.036145
11    12161455    0.006024
12    12441344    0.012048
13    12674691    0.006024
14    13120285    0.512048
15    13438334    0.024096
16    13936335    0.006024
17    14136203    0.012048
18    14649181    0.006024
19    15442181    0.006024
20    16898961    0.036145
21    17489101    0.006024
22    17667091    0.012048
23    21751091    0.018072
24    24544272    0.018072
25    27685792    0.012048
26    32499972    0.006024
27    35150972    0.006024
28    36695872    0.295181
29    37140032    0.054217
..         ...         ...
172  182692576    0.012048
173  182695344    0.006024
174  182706093    0.012048
175  182741567    0.349398
176  182780146    0.006024
1

In [17]:
# Mean attendance for each venue_id.
print(raw_data.groupby(['venue_id'], as_index=False)['attendance'].mean())

    venue_id  attendance
0   23824656    0.031353
1   24130924    0.044892
2   24130927    0.009901
3   24132735    0.031023
4   24389241    0.047030
5   24446531    0.039604
6   24511188    0.046329
7   24546934    0.034826
8   24995655    0.031021
9   25072251    0.031095
10  25187720    0.004975
11  25292352    0.069652


In [18]:
# Mean attendance for each day_of_week value.
print(raw_data.groupby(['day_of_week'], as_index=False)['attendance'].mean())

   day_of_week  attendance
0            0    0.041671
1            1    0.038504
2            2    0.004950
3            3    0.033003
4            4    0.039075
5            5    0.050743
6            6    0.041281


In [None]:
# Mean attendance for each month, mirroring the per-member / per-venue /
# per-weekday breakdowns. This dataset has no 'SibSp', 'Parch' or 'Survived'
# columns and no `train` frame, and iterating a DataFrame yields column-name
# strings, so the family-size code that stood here could not run.
print (raw_data[['month', 'attendance']].groupby(['month'], as_index=False).mean())

# Cleaning the samples

In [13]:
# Per-column count of missing values.
print('Train columns with null values: \n', raw_data.isnull().sum())
print("-"*10)
# DataFrame.info() writes its report itself and returns None, so call it
# directly; wrapping it in print() emitted a stray "None" line.
raw_data.info()
print("-"*10)

Train columns with null values: 
 member_id      0
venue_id       0
day_of_week    0
date           0
month          0
attendance     1
dtype: int64
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33473 entries, 0 to 33472
Data columns (total 6 columns):
member_id      33473 non-null int64
venue_id       33473 non-null int64
day_of_week    33473 non-null int64
date           33473 non-null int64
month          33473 non-null int64
attendance     33472 non-null float64
dtypes: float64(1), int64(5)
memory usage: 1.5 MB
None
----------


In [5]:
from keras.utils import to_categorical

# attendance is a 0/1 label, hence two classes.
num_classes = 2

# One-hot encode both label splits with the same class count.
Y_train_categorical, Y_test_categorical = (
    to_categorical(labels, num_classes) for labels in (Y_train, Y_test)
)

Using TensorFlow backend.


In [6]:
# Sanity check: (number of training rows, number of features).
print(np.shape(X_train))

(23431, 4)


In [17]:
from keras.models import Sequential
from keras.layers import Dense

# create model
# One hidden layer of 60 ReLU units over the feature columns listed in FEATURES.
# NOTE(review): the inputs are fed in as raw integers (member_id / venue_id are
# ID-scale values, ~1e8 and ~2e7 per describe()); consider encoding or scaling
# them before training — the sample prediction further down saturates at [1., 0.].
model = Sequential()
model.add(Dense(60, input_dim=len(FEATURES), kernel_initializer='normal', activation='relu'))
# The targets are one-hot vectors over mutually exclusive classes, so use a
# softmax output with categorical cross-entropy: the outputs then form a
# probability distribution, which two independent sigmoids do not.
model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 60)                300       
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 122       
Total params: 422.0
Trainable params: 422
Non-trainable params: 0.0
_________________________________________________________________


In [18]:
from keras.utils import plot_model
# Save a diagram of the network, including layer shapes, to attendance_predictor.png.
plot_model(model, show_shapes=True, to_file="attendance_predictor.png")

In [19]:
# X_train / X_test are nested Python lists (one inner list per row), so convert
# them to 2-D arrays explicitly instead of relying on Keras to coerce a list
# (a plain list can also be read as "one array per model input").
# The held-out test split doubles as validation data here.
history = model.fit(
    np.asarray(X_train), Y_train_categorical,
    batch_size=32, epochs=10, verbose=1, callbacks=None,
    validation_data=(np.asarray(X_test), Y_test_categorical),
    shuffle=True,
)

Train on 23431 samples, validate on 10042 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# Single hand-built sample; column order follows FEATURES: member_id, venue_id, date, month.
# NOTE(review): venue_id 25072259 does not appear in the venue_id value_counts
# output earlier in the notebook (25072251 does) — confirm this is not a typo.
x = np.array([[624285, 25072259, 2, 7]])
# The result has one row per sample and one score per class.
model.predict(x, batch_size=32, verbose=0)

array([[ 1.,  0.]], dtype=float32)