In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import keras
import pandas as pd
from keras.layers import Dense, SimpleRNN
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.cross_validation import train_test_split
import keras.backend as K

Using TensorFlow backend.


In [2]:
# load train and test data and drop index column
df_train = pd.read_csv('/home/siamak/Projects/deep_learning/src/test_program/train_data/data.csv', index_col=0)
df_test = pd.read_csv('/home/siamak/Projects/deep_learning/src/test_program/test_data/test.csv', index_col=0)

In [3]:
print(df_train.shape)
print(df_test.shape)

(616656, 19)
(154165, 18)


# Preprocessing

In [4]:
# drop unnecessary columns in train and test data
df_train.drop(['Start_time', 'End_time', 'Name of show', 'Name of episode'], axis=1, inplace=True)
df_test.drop(['Start_time', 'End_time', 'Name of show', 'Name of episode'], axis=1, inplace=True)

In [5]:
# get the train label
train_label = df_train['Market Share_total']
df_train.drop(['Market Share_total'], axis=1, inplace=True)

In [6]:
print(df_train.head(1))
print('-------------------------------------------------')
print(df_test.head(1))

        Episode  Station     Channel Type Season  Year        Date  \
1  Vidéoclips V  V Total  General Channel   Fall  2016  2016-08-29   

  Day of week  Length              Genre First time or rerun  \
1      Monday       8  Music Video Clips                  No   

  # of episode in the season Movie? Game of the Canadiens during episode?  \
1                        Yes     No                                    No   

   Temperature in Montreal during episode  
1                                    20.4  
-------------------------------------------------
        Episode Station       Channel Type  Season  Year        Date  \
1  Mom V.F. (M)   VRAK+  Specialty Channel  Winter  2019  2019-01-22   

  Day of week  Length                            Genre First time or rerun  \
1     Tuesday       2  Ongoing Comedy Series (Sitcoms)                  No   

  # of episode in the season Movie? Game of the Canadiens during episode?  \
1                        Yes     No                       

In [7]:
# fill nan value with 0 in train and test data
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

In [8]:
# convert nominal features to numerical for train and test data
lb_make = LabelEncoder()
df_train["Episode"] = lb_make.fit_transform(df_train["Episode"])
df_train["Station"] = lb_make.fit_transform(df_train["Station"])
df_train["Channel Type"] = lb_make.fit_transform(df_train["Channel Type"])
df_train["Season"] = lb_make.fit_transform(df_train["Season"])
df_train["Year"] = lb_make.fit_transform(df_train["Year"])
df_train["Date"] = lb_make.fit_transform(df_train["Date"])
df_train["Day of week"] = lb_make.fit_transform(df_train["Day of week"])
df_train["Genre"] = lb_make.fit_transform(df_train["Genre"])
df_train["First time or rerun"] = lb_make.fit_transform(df_train["First time or rerun"])
df_train["# of episode in the season"] = lb_make.fit_transform(df_train["# of episode in the season"])
df_train["Movie?"] = lb_make.fit_transform(df_train["Movie?"])
df_train["Game of the Canadiens during episode?"] = lb_make.fit_transform(df_train["Game of the Canadiens during episode?"])

# df_train["Start_time"] = lb_make.fit_transform(df_train["Start_time"])
# df_train["End_time"] = lb_make.fit_transform(df_train["End_time"])
# df_train["Name of show"] = lb_make.fit_transform(df_train["Name of show"])
# df_train["Name of episode"] = lb_make.fit_transform(df_train["Name of episode"])


df_test["Episode"] = lb_make.fit_transform(df_test["Episode"])
df_test["Station"] = lb_make.fit_transform(df_test["Station"])
df_test["Channel Type"] = lb_make.fit_transform(df_test["Channel Type"])
df_test["Season"] = lb_make.fit_transform(df_test["Season"])
df_test["Year"] = lb_make.fit_transform(df_test["Year"])
df_test["Date"] = lb_make.fit_transform(df_test["Date"])
df_test["Day of week"] = lb_make.fit_transform(df_test["Day of week"])
df_test["Genre"] = lb_make.fit_transform(df_test["Genre"])
df_test["First time or rerun"] = lb_make.fit_transform(df_test["First time or rerun"])
df_test["# of episode in the season"] = lb_make.fit_transform(df_test["# of episode in the season"])
df_test["Movie?"] = lb_make.fit_transform(df_test["Movie?"])
df_test["Game of the Canadiens during episode?"] = lb_make.fit_transform(df_test["Game of the Canadiens during episode?"])

# df_test["Start_time"] = lb_make.fit_transform(df_test["Start_time"])
# df_test["End_time"] = lb_make.fit_transform(df_test["End_time"])
# df_test["Name of show"] = lb_make.fit_transform(df_test["Name of show"])
# df_test["Name of episode"] = lb_make.fit_transform(df_test["Name of episode"])


In [9]:
print(df_train.head(1))
print(df_test.head(1))

   Episode  Station  Channel Type  Season  Year  Date  Day of week  Length  \
1     6388       21             0       0     0     0            1       8   

   Genre  First time or rerun  # of episode in the season  Movie?  \
1     11                    0                           1       0   

   Game of the Canadiens during episode?  \
1                                      0   

   Temperature in Montreal during episode  
1                                    20.4  
   Episode  Station  Channel Type  Season  Year  Date  Day of week  Length  \
1     1487       20             1       2     0     0            5       2   

   Genre  First time or rerun  # of episode in the season  Movie?  \
1     15                    0                           1       0   

   Game of the Canadiens during episode?  \
1                                      0   

   Temperature in Montreal during episode  
1                                 -22.525  


In [10]:
print(df_train.shape)
print(df_test.shape)

(616656, 14)
(154165, 14)


In [11]:
# Normalize with min_max normalizer
min_max_normalizer = MinMaxScaler()
min_max_normalizer.fit(df_train)
#normalize train data
train_data = min_max_normalizer.transform(df_train)

#normalize test data
test_data = min_max_normalizer.transform(df_test)

In [12]:
print(type(train_data))

<class 'numpy.ndarray'>


In [13]:
print(train_data[1])
print(test_data[1])

[ 0.037242    0.91304348  0.          0.          0.          0.
  0.16666667  0.02173913  0.30769231  0.          1.          0.          0.
  0.74629259]
[ 0.00209393  0.86956522  1.          0.66666667  0.          0.
  0.83333333  0.02173913  0.57692308  0.          1.          0.          0.
  0.09298597]


# Make model

In [14]:
# dadehaye test chon label nadarand ta model ra behtar arzyabi konim, 20 hezar dade az train joda karde va serfan jahate test model az anha estefade shode
print(train_data.shape)
x_test = train_data[:20000]
y_test = train_label[:20000]
new_train_data = train_data[20000:]
new_train_label = train_label[20000:]
x_train, x_val, y_train, y_val = train_test_split(new_train_data, new_train_label, test_size=0.2, random_state=42)

(616656, 14)


In [15]:
def initializer(weight_matrix):
    return K.random_uniform(shape=weight_matrix, minval=-1.2, maxval=0.8, seed=(142))

In [None]:
model = Sequential()
model.add(Dense(64, activation='relu',
                input_shape=(train_data.shape[1],), kernel_initializer=initializer, bias_initializer='zeros'))
model.add(Dense(32, activation='relu', kernel_initializer=initializer, bias_initializer='zeros'))
model.add(Dense(16, activation='relu', kernel_initializer=initializer, bias_initializer='zeros'))
model.add(Dense(8, activation='relu', kernel_initializer=initializer, bias_initializer='zeros'))
model.add(Dense(4, activation='relu', kernel_initializer=initializer, bias_initializer='zeros'))
model.add(Dense(1, kernel_initializer=initializer, bias_initializer='zeros'))
model.compile(optimizer='adam', loss='mae', metrics=['mae'])

In [None]:
history = model.fit(x_train, y_train, epochs=500, batch_size=512, validation_data=(x_val, y_val))

In [None]:
mae_history = history.history['mean_absolute_error']

In [None]:
plt.plot(range(1, 501), mae_history, 'b', label='mean_absolute_error')
plt.xlabel('epochs')
plt.ylabel('mae_validation')
plt.title('Mean_absolute_error validation')
plt.legend()
plt.show()

In [None]:
test_mse_score, test_mae_score = model.evaluate(x_test, y_test)
print(test_mae_score)

In [None]:
prd = model.predict(x_test)
print(prd[:40])
print(y_test[:40])

In [27]:
model2=0
model2 = Sequential()
model2.add(Dense(64, activation='relu',
                input_shape=(train_data.shape[1],)))
model2.add(Dense(32, activation='relu'))
model2.add(Dense(16, activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(4, activation='relu'))
model2.add(Dense(1))
model2.compile(optimizer='adam', loss='mae', metrics=['mae'])
model2.summary()
model2.fit(x_train, y_train, epochs=50, batch_size=512, validation_data=(x_val, y_val), shuffle=False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 64)                960       
_________________________________________________________________
dense_14 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_15 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_16 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_17 (Dense)             (None, 4)                 36        
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 5         
Total params: 3,745
Trainable params: 3,745
Non-trainable params: 0
_________________________________________________________________
Train on

Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fad9703b160>

In [28]:
test_mse_score, test_mae_score = model2.evaluate(x_test, y_test)
print(test_mae_score)

1.51149394699


In [29]:
# chon test set label nadasht, 20000 record az train joda va baraye test avaliye kenar gozashte shod. in 20000 dade
# dar faze amoozesh hich tasiri nadashte
prd = model2.predict(x_test)
print(prd[:30])
print(y_test[:30])

[[  0.4643462 ]
 [  8.0877285 ]
 [  0.64489806]
 [  5.04695225]
 [ 12.75297356]
 [  7.40807676]
 [  4.60941219]
 [  6.62042141]
 [  7.45935726]
 [  7.48206234]
 [  9.80313492]
 [  2.91742945]
 [  2.89431429]
 [  5.22084808]
 [  7.28268099]
 [ 12.40941811]
 [  5.94057035]
 [  5.99242306]
 [  7.5569663 ]
 [  6.25339317]
 [  7.41461182]
 [  7.20410633]
 [  6.38549423]
 [  1.62750328]
 [  7.6813612 ]
 [  6.30175495]
 [  2.38631964]
 [  2.39670897]
 [  3.95491219]
 [  4.33710051]]
1      0.9
2      0.5
3      0.3
4      1.7
5      2.2
6      2.7
7      2.3
8      1.4
9      7.5
10    12.1
11    10.7
12     5.5
13     5.8
14    10.5
15     9.8
16    11.4
17     8.0
18     7.8
19     7.0
20     8.6
21     3.0
22     3.3
23     3.6
24     2.1
25     9.9
26     6.7
27     0.9
28     2.3
29     2.9
30     4.9
Name: Market Share_total, dtype: float64


In [30]:
# dade haye test, tavasote modele train shode predict shode va natije dar zir aamade
prd_test = model2.predict(test_data)

In [31]:
print(prd_test[:20])

[[-0.00404403]
 [-0.00148746]
 [-0.00265792]
 [ 0.58935928]
 [ 0.00258479]
 [ 0.00326023]
 [ 0.00320402]
 [-0.00303212]
 [-0.00272837]
 [-0.00305256]
 [ 0.63427794]
 [ 0.63084519]
 [ 0.25627068]
 [ 0.63358068]
 [ 0.67063653]
 [-0.00330427]
 [-0.00330254]
 [ 0.00322559]
 [ 1.24228406]
 [ 1.40937984]]
