In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn import preprocessing


from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
import warnings 
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Upload the training CSV when running on Google Colab.
# Outside Colab the `google.colab` package is not installed, so the upload is
# skipped and "training-1.csv" is expected to already be in the working directory.
try:
  from google.colab import files
except ImportError:
  files = None

if files is not None:
  uploaded = files.upload()

Saving training-1.csv to training-1.csv


In [0]:
# Load the training data from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="training-1.csv")

# **Data preprocessing**

In [0]:
# Encode the gender columns as integers (male -> 1, female -> 0).
# The result is assigned back to the frame rather than calling
# `replace(..., inplace=True)` on `df.<column>`: that chained in-place form acts
# on an intermediate Series and leaves `df` unchanged when pandas Copy-on-Write
# is enabled.
gender_cols = ['poster_gender', 'participant1_gender', 'participant2_gender', 'participant3_gender']
for gender_col in gender_cols:
  df[gender_col] = df[gender_col].replace(['male', 'female'], [1, 0])

In [0]:
# Rescale the selected numeric columns with (x - min) / (max - min),
# computed independently for every column.
cols_to_norm = ['age', 'number_of_likes', 'number_of_comments']
col_min = df[cols_to_norm].min()
col_max = df[cols_to_norm].max()
df[cols_to_norm] = (df[cols_to_norm] - col_min) / (col_max - col_min)

In [0]:
# Merge the 'poll' and 'social' app types into a single 'other' category.
# Assigned back explicitly: a chained `df.app_type.replace(..., inplace=True)`
# does not modify `df` when pandas Copy-on-Write is enabled.
df['app_type'] = df['app_type'].replace(['poll', 'social'], 'other')

In [0]:
# Fold the 'none' poster focus into the 'other' category.
# Assigned back explicitly: a chained `df.poster_focus.replace(..., inplace=True)`
# does not modify `df` when pandas Copy-on-Write is enabled.
df['poster_focus'] = df['poster_focus'].replace('none', 'other')

In [0]:
# Columns excluded from the baseline model: age, identifiers and gender flags.
dropped_columns = [
    'age', 'id', 'poster_id', 'poster_gender',
    'participant1_id', 'participant2_id', 'participant3_id',
    'participant1_gender', 'participant2_gender', 'participant3_gender',
]
df = df.drop(columns=dropped_columns)

In [0]:
# One-hot encode every remaining categorical (string) column.
# `dtype=float` keeps the indicator columns numeric: recent pandas versions
# default to bool dummies, and a frame mixing bool and float columns converts
# to an object array that Keras cannot turn into a tensor.
df = pd.get_dummies(df, dtype=float)

In [0]:
# Hold out 20% of the rows as a test set; 'rank' is the regression target and
# every other column is a feature. The split is seeded for reproducibility.
target = df['rank']
features = df.loc[:, df.columns != 'rank']
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

In [0]:
# Display (number of training rows, number of feature columns).
x_train.shape

(7534, 41)

# **Model building and fitting**

In [0]:
# Fully connected regression network: a 128-unit first layer sized to the
# feature count, three 256-unit hidden layers and a single linear output unit.
model = Sequential([
    # The Input Layer :
    Dense(128, kernel_initializer='normal', input_dim=x_train.shape[1], activation='relu'),
    # The Hidden Layers :
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    # The Output Layer :
    Dense(1, kernel_initializer='normal', activation='linear'),
])

# Compile the network : mean absolute error is both the loss and the reported metric.
model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=['mean_absolute_error'],
)
model.summary()





Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               5376      
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 170,241
Trainable params: 170,241
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Checkpoint callback: writes the model whenever the monitored validation loss
# improves; the epoch number and val_loss are embedded in the file name.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

In [0]:
# Train for 500 epochs in batches of 32, holding out 20% of the training data
# for validation and checkpointing through the callbacks defined above.
model.fit(
    x=x_train,
    y=y_train,
    epochs=500,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks_list,
)




Train on 6027 samples, validate on 1507 samples
Epoch 1/500






Epoch 00001: val_loss improved from inf to 0.04142, saving model to Weights-001--0.04142.hdf5
Epoch 2/500

Epoch 00002: val_loss improved from 0.04142 to 0.03027, saving model to Weights-002--0.03027.hdf5
Epoch 3/500

Epoch 00003: val_loss improved from 0.03027 to 0.02032, saving model to Weights-003--0.02032.hdf5
Epoch 4/500

Epoch 00004: val_loss improved from 0.02032 to 0.01554, saving model to Weights-004--0.01554.hdf5
Epoch 5/500

Epoch 00005: val_loss did not improve from 0.01554
Epoch 6/500

Epoch 00006: val_loss did not improve from 0.01554
Epoch 7/500

Epoch 00007: val_loss improved from 0.01554 to 0.01497, saving model to Weights-007--0.01497.hdf5
Epoch 8/500

Epoch 00008: val_loss did not improve from 0.01497
Epoch 9/500

Epoch 00009: val_loss did not improve from 0.01497
Epoch 10/500

Epoch 00010: val_loss did not improve from 0.01497
Epoch 11/500

Epoch 00011: val_loss did not improve from 0.01497
Epoch 1

<keras.callbacks.History at 0x7f706f744a20>

# **Model evaluation**

In [0]:
# Evaluate on the held-out test split. The model is compiled with mean absolute
# error as both loss and metric, so both reported numbers are MAE values
# (this is a regression model; there is no accuracy).
results = model.evaluate(x_test, y_test, batch_size=32)
print('test loss (MAE), test MAE:', results)

test loss, test acc: [0.007364540203670218, 0.007364540203670218]


# **Improved**

In [0]:
import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers

In [0]:
# Start the improved pipeline from the raw CSV again: the baseline section above
# overwrote `df` (dropped the id/gender/age columns and one-hot encoded the
# rest), so the columns used below no longer exist on a fresh top-to-bottom run.
df = pd.read_csv("training-1.csv")

# replace male/female values in 'gender' columns to 1/0
# (assigned back explicitly; a chained `df.col.replace(..., inplace=True)` does
# not modify `df` when pandas Copy-on-Write is enabled)
for gender_col in ('poster_gender', 'participant1_gender', 'participant2_gender', 'participant3_gender'):
  df[gender_col] = df[gender_col].replace(['male', 'female'], [1, 0])

In [0]:
# Columns fed to the model as plain numeric features.
numeric_cols = [
    'owner_influence', 'is_commented_by_connections', 'is_liked_by_me', 'is_liked_by_connections',
    'poster_gender', 'poster_influence',
    'participant1_gender', 'participant1_influence',
    'participant2_gender', 'participant2_influence',
    'participant3_gender', 'participant3_influence',
]

# Start the feature-column list with one numeric column per header above.
feature_columns = [feature_column.numeric_column(name) for name in numeric_cols]


In [0]:
# bucketized cols

#------------------------------------------------------------------------------------------------------------
# this could be used with real-time data, but here we need another approach, as the minimum value of age is 1100 hours
#df.age /= 1000
#one_min = 60
#one_hour = 60 * one_min
#one_day = 24 * one_hour

#age = feature_column.numeric_column("age")
#age_buckets = feature_column.bucketized_column(age, boundaries=[30 * one_min, one_hour, 2 * one_hour, 6 * one_hour, 12 * one_hour, one_day, 7 * one_day])
#feature_columns.append(age_buckets)

#------------------------------------------------------------------------------------------------------------

# Equal-frequency bucketing of `age`: take the values found at every 1/8th of
# the sorted ages as bucket boundaries.
sorted_ages = sorted(df.age)
step = len(sorted_ages) // 8
# bucketized_column requires strictly increasing boundaries, so cut points that
# coincide (ties in the data, or fewer than 8 rows) are collapsed into one.
age_boundaries = sorted({float(sorted_ages[i * step]) for i in range(1, 8)})

age = feature_column.numeric_column("age")
age_buckets = feature_column.bucketized_column(age, boundaries=age_boundaries)
feature_columns.append(age_buckets)

In [0]:
# Bucketize the like count using fixed boundaries.
likes_boundaries = [2, 5, 10, 20, 50, 100]
likes_num = feature_column.numeric_column("number_of_likes")
likes_num_buckets = feature_column.bucketized_column(
    likes_num, boundaries=likes_boundaries)
feature_columns += [likes_num_buckets]

In [0]:
# Bucketize the comment count using fixed boundaries.
comments_boundaries = [1, 2, 5, 10, 20, 50, 100]
comments_num = feature_column.numeric_column("number_of_comments")
comments_num_buckets = feature_column.bucketized_column(
    comments_num, boundaries=comments_boundaries)
feature_columns += [comments_num_buckets]

In [0]:
# join 'none' category to 'other'
# (assigned back explicitly; a chained `df.poster_focus.replace(..., inplace=True)`
# does not modify `df` when pandas Copy-on-Write is enabled)
df['poster_focus'] = df['poster_focus'].replace('none', 'other')

In [0]:
# Categorical columns: app_type and owner_type take their vocabulary from the
# values present in the data, poster_focus uses a fixed vocabulary.
app_type = feature_column.categorical_column_with_vocabulary_list(
    key='app_type', vocabulary_list=df.app_type.unique())
owner_type = feature_column.categorical_column_with_vocabulary_list(
    key='owner_type', vocabulary_list=df.owner_type.unique())
poster_focus = feature_column.categorical_column_with_vocabulary_list(
    key='poster_focus',
    vocabulary_list=['engineering', 'sales', 'marketing', 'management', 'financial', 'other'])

# Wrap each categorical column in an indicator (one-hot) column and register
# them in the same order: app_type, owner_type, poster_focus.
app_type_1hot = feature_column.indicator_column(app_type)
owner_type_1hot = feature_column.indicator_column(owner_type)
poster_focus_1hot = feature_column.indicator_column(poster_focus)
feature_columns.extend([app_type_1hot, owner_type_1hot, poster_focus_1hot])

# functions to reduce code duplication
def participant_action(part_action):
  """Return an indicator (one-hot) column for a participant action feature.

  Args:
    part_action: name of the column holding the participant's action.

  Returns:
    An indicator column over the vocabulary 'commented', 'liked', 'viewed'.
  """
  action_vocabulary = ['commented', 'liked', 'viewed']
  action_column = feature_column.categorical_column_with_vocabulary_list(
    key=part_action, vocabulary_list=action_vocabulary)
  return feature_column.indicator_column(action_column)

def participant_focus(part_f):
  """Return an indicator (one-hot) column for a participant focus feature.

  Args:
    part_f: name of the column holding the participant's focus.

  Returns:
    An indicator column over the fixed focus vocabulary, which, unlike the
    poster focus vocabulary, keeps 'none' as its own category.
  """
  focus_vocabulary = ['engineering', 'sales', 'marketing', 'management', 'financial', 'other', 'none']
  focus_column = feature_column.categorical_column_with_vocabulary_list(
    key=part_f, vocabulary_list=focus_vocabulary)
  return feature_column.indicator_column(focus_column)


# Register the action and focus indicator columns for participants 1..3,
# action first and focus second for each participant.
for participant_no in (1, 2, 3):
  feature_columns.append(participant_action("participant{}_action".format(participant_no)))
  feature_columns.append(participant_focus("participant{}_focus".format(participant_no)))

In [0]:
# Input layer that turns the raw feature dict into one dense tensor according
# to the feature columns collected above.
feature_layer = layers.DenseFeatures(feature_columns=feature_columns)

In [0]:
# Identifier columns are not used as model inputs.
id_columns = ['id', 'poster_id', 'participant1_id', 'participant2_id', 'participant3_id']
df = df.drop(columns=id_columns)

In [15]:
# Preview the first rows of the frame that feeds the improved model.
df.head()

Unnamed: 0,age,app_type,owner_type,owner_influence,number_of_likes,number_of_comments,is_commented_by_connections,is_liked_by_me,is_liked_by_connections,poster_gender,poster_focus,poster_influence,participant1_action,participant1_gender,participant1_focus,participant1_influence,participant2_action,participant2_gender,participant2_focus,participant2_influence,participant3_action,participant3_gender,participant3_focus,participant3_influence,rank
0,63152420000.0,other,space,0.99973,1,0,0,0,1,1,marketing,0.99962,liked,1,marketing,0.99899,viewed,1,other,0.23077,viewed,1,marketing,0.23077,0.73677
1,63092780000.0,forum,space,0.99977,0,2,1,0,0,1,marketing,0.99899,commented,1,other,0.99756,commented,1,marketing,0.99899,viewed,1,other,0.23077,0.7824
2,63090490000.0,forum,space,0.99977,0,3,1,0,0,1,marketing,0.99899,commented,1,other,0.99756,commented,1,marketing,0.99895,commented,1,marketing,0.99899,0.82125
3,63013240000.0,other,space,0.99949,0,0,0,0,0,1,marketing,0.99886,viewed,1,other,0.23077,viewed,1,marketing,0.23077,viewed,1,other,0.23077,0.61522
4,63013230000.0,other,space,0.99949,0,0,0,0,0,1,marketing,0.99886,viewed,1,other,0.23077,viewed,1,marketing,0.23077,viewed,1,other,0.23077,0.61522


In [0]:
# Split into train / validation / test: 20% of all rows are held out for
# testing, then 20% of the remainder for validation.
# Both splits are seeded (same seed as the baseline split) so the partitions,
# and therefore the reported metrics, are reproducible across runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

In [0]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Build a batched tf.data dataset of (features dict, label) pairs.

  Args:
    dataframe: frame containing the feature columns and a 'rank' label column.
      It is copied first, so the caller's frame is left untouched.
    shuffle: when true, shuffle with a buffer as large as the frame.
    batch_size: number of rows per batch.

  Returns:
    The dataset produced by `from_tensor_slices`, optionally shuffled, batched.
  """
  features = dataframe.copy()
  target = features.pop('rank')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
  if not shuffle:
    return dataset.batch(batch_size)
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [0]:
# Wrap each split in a tf.data dataset; only the training split is shuffled.
batch_size = 32
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [0]:
# Regression network on top of the feature layer: 128 -> 256 -> 512 -> 256
# ReLU units followed by a single output unit with no activation.
model = tf.keras.Sequential()
model.add(feature_layer)
for units in (128, 256, 512, 256):
  model.add(layers.Dense(units, activation='relu'))
model.add(layers.Dense(1))

# Mean absolute error is used both as the loss and as the reported metric.
model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=['mean_absolute_error'],
)

In [42]:
# Train for 500 epochs on the training dataset, validating on val_ds each epoch.
model.fit(x=train_ds, validation_data=val_ds, epochs=500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fed9ab47668>

In [43]:
# Evaluate on the held-out test dataset. `test_ds` is already batched by
# df_to_dataset, and Keras does not accept `batch_size` together with a
# tf.data.Dataset input, so it is not passed here.
# Both reported numbers are mean absolute error (loss and metric); this is a
# regression model, so there is no accuracy.
results = model.evaluate(test_ds)
print('test loss (MAE), test MAE:', results)

test loss, test acc: [0.009424938820302486, 0.009420962072908878]
