## Imports and constants

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gc
import time
import numpy as np
import tensorflow as tf
import pandas as pd

## Load data

In [2]:
data_path = '../input/'

In [3]:
df_train = pd.read_csv(data_path + 'train.csv')
df_test = pd.read_csv(data_path + 'test.csv')
df_songs = pd.read_csv(data_path + 'songs.csv')
df_songs_extra = pd.read_csv(data_path + 'song_extra_info.csv')
df_members = pd.read_csv(data_path + 'members.csv',
                         parse_dates=['registration_init_time', 'expiration_date'])

---

# Parse data

Fill those `song_length` are nan to 20000

In [4]:
df_songs['song_length'].fillna(20000, inplace=True)
df_songs['song_length'] = df_songs['song_length'].astype(np.uint32)

#df_songs.head()

Convert `isrc` to `song_year`

In [5]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

In [6]:
df_songs_extra['song_year'] = df_songs_extra['isrc'].apply(isrc_to_year)
df_songs_extra.drop(['isrc'], axis = 1, inplace = True)

#df_songs_extra.head()

In [7]:
df_members['membership_days'] = df_members['expiration_date'].subtract(df_members['registration_init_time']).dt.days.astype(int)

df_members['registration_year'] = df_members['registration_init_time'].dt.year
df_members['registration_month'] = df_members['registration_init_time'].dt.month
df_members['registration_day'] = df_members['registration_init_time'].dt.day
df_members = df_members.drop(['registration_init_time'], axis=1)

df_members['expiration_year'] = df_members['expiration_date'].dt.year
df_members['expiration_month'] = df_members['expiration_date'].dt.month
df_members['expiration_day'] = df_members['expiration_date'].dt.day
df_members = df_members.drop(['expiration_date'], axis=1)

#df_members.head()

---

# Merge data

In [8]:
df_songs = df_songs.merge(df_songs_extra, on='song_id', how='left')

del df_songs_extra;
gc.collect();

#df_songs.head()

In [9]:
df_train = df_train.merge(df_members, on='msno', how='left')
df_test = df_test.merge(df_members, on='msno', how='left')

df_train = df_train.merge(df_songs, on='song_id', how='left')
df_test = df_test.merge(df_songs, on='song_id', how='left')

del df_members, df_songs;
gc.collect();

#df_train.head()

In [10]:
df_train['song_length'].fillna(0, inplace=True)
df_train['language'].fillna(0, inplace=True)
df_train['song_year'].fillna(0, inplace=True)
df_train['song_length'] = df_train['song_length'].astype(np.int64)
df_train['language'] = df_train['language'].astype(np.int64)
df_train['song_year'] = df_train['song_year'].astype(np.int64)

df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 25 columns):
msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
target                int64
city                  int64
bd                    int64
gender                object
registered_via        int64
membership_days       int64
registration_year     int64
registration_month    int64
registration_day      int64
expiration_year       int64
expiration_month      int64
expiration_day        int64
song_length           int64
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              int64
name                  object
song_year             int64
dtypes: int64(14), object(11)
memory usage: 1.4+ GB


In [11]:
for col in df_train.columns:
    if df_train[col].isnull().any():
        if df_train[col].dtype == object:
            df_test[col].fillna('null',inplace=True)
            df_train[col].fillna('null',inplace=True)
        else:
            df_train[col].fillna(0,inplace=True)
            df_test[col].fillna(0,inplace=True)
#df_train['source_system_tab'].dtype

In [12]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2556790 entries, 0 to 2556789
Data columns (total 25 columns):
id                    int64
msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
city                  int64
bd                    int64
gender                object
registered_via        int64
membership_days       int64
registration_year     int64
registration_month    int64
registration_day      int64
expiration_year       int64
expiration_month      int64
expiration_day        int64
song_length           float64
genre_ids             object
artist_name           object
composer              object
lyricist              object
language              float64
name                  object
song_year             float64
dtypes: float64(3), int64(11), object(11)
memory usage: 507.2+ MB


In [13]:
df_train.isnull().any()

msno                  False
song_id               False
source_system_tab     False
source_screen_name    False
source_type           False
target                False
city                  False
bd                    False
gender                False
registered_via        False
membership_days       False
registration_year     False
registration_month    False
registration_day      False
expiration_year       False
expiration_month      False
expiration_day        False
song_length           False
genre_ids             False
artist_name           False
composer              False
lyricist              False
language              False
name                  False
song_year             False
dtype: bool

In [14]:
tf.logging.set_verbosity(tf.logging.INFO)
# Set to INFO for tracking training, default is WARN 

print("Using TensorFlow version %s" % (tf.__version__))
CATEGORICAL_COLUMNS = ["msno", "song_id", "source_system_tab", "source_screen_name", 
                       "source_type", "gender", "genre_ids", "artist_name",
                       "composer", "lyricist", "name"]

CONTINUOUS_COLUMNS = ['city','bd','registered_via','membership_days','registration_year','registration_month',
                      'registration_day','expiration_year','expiration_month','expiration_day','song_length',
                      'language','song_year']

TARGET_COLUMN = 'target'
df_train['ans'] = df_train['target'].values
df_train = df_train.drop(['target'],axis=1)

Using TensorFlow version 1.4.1


In [15]:
BATCH_SIZE = 400
temp_labels = df_train['ans']
def generate_input_fn(df_train, num_epochs=None, shuffle=True, batch_size=BATCH_SIZE):
    df_train['ans'] = df_train['ans'].apply(lambda x: x == 1).astype(int)
    #del df_train['target']
    
    return tf.estimator.inputs.pandas_input_fn(
            x = df_train,
            y = df_train['ans'],
            batch_size = batch_size,
            num_epochs=num_epochs,
            shuffle=shuffle)

print('input function configured')

input function configured


## Create Feature Columns

### Sparse Columns

First we build the sparse columns.<br>
Use sparse_column_with_keys() for columns that we know all possible values for.<br>
Use sparse_column_with_hash_bucket() for columns that we want the the library to automatically map values for us.

In [16]:
# The layers module contains many utilities for creating feature columns.

# Categorical base columns.
gender = tf.feature_column.categorical_column_with_vocabulary_list(key="gender", 
                                                                   vocabulary_list=["female", "male"])
#number of unique in df_train's categorical data
########################
#msno 30755            #
#song_id 359966        #
#source_system_tab 9   #
#source_screen_name 20 #
#source_type 12        #
#gender 2              #
#genre_ids 572         #
#artist_name 40582     #
#composer 76064        #
#lyricist 33889        #
#name 234111           #
########################

msno = tf.feature_column.categorical_column_with_hash_bucket(
  "msno", hash_bucket_size=30755)
song_id = tf.feature_column.categorical_column_with_hash_bucket(
  "song_id", hash_bucket_size=359966)
source_system_tab = tf.feature_column.categorical_column_with_hash_bucket(
  "source_system_tab", hash_bucket_size=9)
source_screen_name = tf.feature_column.categorical_column_with_hash_bucket(
  "source_screen_name", hash_bucket_size=20)
source_type = tf.feature_column.categorical_column_with_hash_bucket(
  "source_type", hash_bucket_size=12)
genre_ids = tf.feature_column.categorical_column_with_hash_bucket(
  "genre_ids", hash_bucket_size=572)
artist_name = tf.feature_column.categorical_column_with_hash_bucket(
  "artist_name", hash_bucket_size=40582)
composer = tf.feature_column.categorical_column_with_hash_bucket(
  "composer", hash_bucket_size=76064)
lyricist = tf.feature_column.categorical_column_with_hash_bucket(
  "lyricist", hash_bucket_size=33889)
name = tf.feature_column.categorical_column_with_hash_bucket(
  "name", hash_bucket_size=234111)

print('Categorical columns configured')

Categorical columns configured


### Continuous Columns

Second, configure the real-valued columns using real_valued_column().

In [17]:
# Continuous base columns.
################################################################################################################
#CONTINUOUS_COLUMNS = ['city','bd','registered_via','membership_days','registration_month','registration_year',#
#                      'registration_day','expiration_year','expiration_month','expiration_day','song_length', #
#                      'language','song_year']                                                                 #
################################################################################################################
city = tf.feature_column.numeric_column("city")
bd = tf.feature_column.numeric_column("bd")
registered_via = tf.feature_column.numeric_column("registered_via")
membership_days  = tf.feature_column.numeric_column("membership_days")
registration_year = tf.feature_column.numeric_column("registration_year")
registration_month = tf.feature_column.numeric_column("registration_month")
registration_day = tf.feature_column.numeric_column("registration_day")
expiration_year = tf.feature_column.numeric_column("expiration_year")
expiration_month = tf.feature_column.numeric_column("expiration_month")
expiration_day  = tf.feature_column.numeric_column("expiration_day")
song_length = tf.feature_column.numeric_column("song_length")
language  = tf.feature_column.numeric_column("language")
song_year = tf.feature_column.numeric_column("song_year")

print('Continuous columns configured')

Continuous columns configured


In [18]:
# Transformations.
bd_buckets = tf.feature_column.bucketized_column(
    bd, boundaries=[5, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65 ])

msno_song_id = tf.feature_column.crossed_column(
    ["msno", "song_id"], hash_bucket_size=int(1e6))

bd_msno_song_id = tf.feature_column.crossed_column(
    [bd_buckets, "msno", "song_id"], hash_bucket_size=int(1e7))

country_occupation = tf.feature_column.crossed_column(
    ["native_country", "occupation"], hash_bucket_size=int(1e4))

print('Transformations complete')

Transformations complete


### Group feature columns into 2 objects

The wide columns are the sparse, categorical columns that we specified, as well as our hashed, bucket, and feature crossed columns.<br><br>
The deep columns are composed of embedded categorical columns along with the continuous real-valued columns. Column embeddings transform a sparse, categorical tensor into a low-dimensional and dense real-valued vector. The embedding values are also trained along with the rest of the model. For more information about embeddings, see the TensorFlow tutorial on Vector Representations Words, or Word Embedding on Wikipedia.<br><br>
The higher the dimension of the embedding is, the more degrees of freedom the model will have to learn the representations of the features. We are starting with an 8-dimension embedding for simplicity, but later you can come back and increase the dimensionality if you wish.

In [19]:
# Wide columns and deep columns.
##CATEGORICAL_COLUMNS = ["msno", "song_id", "source_system_tab", "source_screen_name", 
##                       "source_type", "gender", "genre_ids", "artist_name",
##                       "composer", "lyricist", "name"]

## Crossed_COLUMNS
## msno_song_id, bd_msno_song_id


################################################################################################################
#CONTINUOUS_COLUMNS = ['city','bd','registered_via','membership_days','registration_month','registration_year',#
#                      'registration_day','expiration_year','expiration_month','expiration_day','song_length', #
#                      'language','song_year']                                                                 #
################################################################################################################
wide_columns = [msno, song_id, source_system_tab,
                 source_screen_name, source_type, gender,
                 genre_ids, artist_name,
                 composer, lyricist,
                 name, msno_song_id,bd_msno_song_id]

#wide_columns = [msno, song_id, source_system_tab,
#      source_screen_name, source_type, gender,
#      genre_ids,
#      name, msno_song_id,bd_msno_song_id]

deep_columns = [
    # Multi-hot indicator columns for columns with fewer possibilities
    tf.feature_column.indicator_column(source_system_tab),
    tf.feature_column.indicator_column(source_screen_name),
    tf.feature_column.indicator_column(source_type),
    tf.feature_column.indicator_column(gender),
    # Embeddings for categories with more possibilities
    tf.feature_column.embedding_column(msno, dimension=100),
    tf.feature_column.embedding_column(song_id, dimension=100),
    tf.feature_column.embedding_column(genre_ids, dimension=8),
    #tf.feature_column.embedding_column(artist_name, dimension=8),
    #tf.feature_column.embedding_column(composer, dimension=8),
    #tf.feature_column.embedding_column(lyricist, dimension=8),
    tf.feature_column.embedding_column(name, dimension=8),
    # Numerical columns
    city,
    bd,
    registered_via,
    membership_days,
    registration_month,
    registration_year,
    registration_day,
    expiration_year,
    expiration_month,
    expiration_day,
    song_length,
    language,
    song_year
]

print('wide and deep columns configured')

wide and deep columns configured


## Create the model

In [20]:
def create_model_dir(model_type):
    return 'models/model_' + model_type

# If new_model=False, pass in the desired model_dir 
def get_model(model_type, new_model=False, model_dir=None):
    if new_model or model_dir is None:
        model_dir = create_model_dir(model_type) # Comment out this line to continue training a existing model
    print("Model directory = %s" % model_dir)
    
    m = None
    
    # Linear Classifier
    if model_type == 'WIDE':
        m = tf.estimator.LinearClassifier(
            model_dir=model_dir, 
            feature_columns=wide_columns)

    # Deep Neural Net Classifier
    if model_type == 'DEEP':
        m = tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=[100, 50])

    # Combined Linear and Deep Classifier
    if model_type == 'WIDE_AND_DEEP':
        m = tf.estimator.DNNLinearCombinedClassifier(
                model_dir=model_dir,
                linear_feature_columns=wide_columns,
                dnn_feature_columns=deep_columns,
                dnn_hidden_units=[128, 64, 32, 32])
        
    print('estimator built')
    
    return m, model_dir
    
MODEL_TYPE = 'WIDE_AND_DEEP'
model_dir = create_model_dir(model_type=MODEL_TYPE)
m, model_dir = get_model(model_type = MODEL_TYPE, model_dir=model_dir)

Model directory = models/model_WIDE_AND_DEEP
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbe451841d0>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'models/model_WIDE_AND_DEEP', '_save_summary_steps': 100}
estimator built


## Fit the model (train it)

In [21]:
%%time 

#train_file = str("adult.data.csv") 
# "gs://cloudml-public/census/data/adult.data.csv"
# storage.googleapis.com/cloudml-public/census/data/adult.data.csv
#print(df_train)
m.train(input_fn=generate_input_fn(df_train[:-1000]), 
        steps=5000)

print('training done')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from models/model_WIDE_AND_DEEP/model.ckpt-5000
INFO:tensorflow:Saving checkpoints for 5001 into models/model_WIDE_AND_DEEP/model.ckpt.
INFO:tensorflow:loss = 735.848, step = 5001
INFO:tensorflow:global_step/sec: 68.9668
INFO:tensorflow:loss = 1187.5, step = 5101 (1.450 sec)
INFO:tensorflow:global_step/sec: 87.2534
INFO:tensorflow:loss = 2120.15, step = 5201 (1.146 sec)
INFO:tensorflow:global_step/sec: 85.9654
INFO:tensorflow:loss = 6639.23, step = 5301 (1.163 sec)
INFO:tensorflow:global_step/sec: 85.9558
INFO:tensorflow:loss = 570.86, step = 5401 (1.163 sec)
INFO:tensorflow:global_step/sec: 85.5207
INFO:tensorflow:loss = 7585.72, step = 5501 (1.170 sec)
INFO:tensorflow:global_step/sec: 85.7806
INFO:tensorflow:loss = 686.797, step = 5601 (1.165 sec)
INFO:tensorflow:global_step/sec: 86.7813
INFO:tensorflow:loss = 1231.41, step = 5701 (1.152 sec)
INFO:tensorflow:global_step/sec: 87.0735
INFO:tensorflow:loss 

In [22]:
### test with last 1000 train_data
results = m.evaluate(input_fn=generate_input_fn(df_train), 
                     steps=50)
print('evaluate done')

print('Accuracy: %s' % results['accuracy'])
print(results)

INFO:tensorflow:Starting evaluation at 2017-12-10-15:33:05
INFO:tensorflow:Restoring parameters from models/model_WIDE_AND_DEEP/model.ckpt-10000
INFO:tensorflow:Evaluation [1/50]
INFO:tensorflow:Evaluation [2/50]
INFO:tensorflow:Evaluation [3/50]
INFO:tensorflow:Evaluation [4/50]
INFO:tensorflow:Evaluation [5/50]
INFO:tensorflow:Evaluation [6/50]
INFO:tensorflow:Evaluation [7/50]
INFO:tensorflow:Evaluation [8/50]
INFO:tensorflow:Evaluation [9/50]
INFO:tensorflow:Evaluation [10/50]
INFO:tensorflow:Evaluation [11/50]
INFO:tensorflow:Evaluation [12/50]
INFO:tensorflow:Evaluation [13/50]
INFO:tensorflow:Evaluation [14/50]
INFO:tensorflow:Evaluation [15/50]
INFO:tensorflow:Evaluation [16/50]
INFO:tensorflow:Evaluation [17/50]
INFO:tensorflow:Evaluation [18/50]
INFO:tensorflow:Evaluation [19/50]
INFO:tensorflow:Evaluation [20/50]
INFO:tensorflow:Evaluation [21/50]
INFO:tensorflow:Evaluation [22/50]
INFO:tensorflow:Evaluation [23/50]
INFO:tensorflow:Evaluation [24/50]
INFO:tensorflow:Evaluati

## Make Prediction

In [23]:
id = df_test['id'].values
#del df_test['id']

In [None]:
%%time
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=df_test,
        batch_size=1,
        num_epochs=1,
        shuffle=False)
#  def predict(self, x=None, input_fn=None, batch_size=None, outputs=None,
#              as_iterable=True):
predictions = m.predict(input_fn=predict_input_fn)
container = list(predictions)
#for prediction in predictions[:100]:
#    print(prediction['probabilities'])

INFO:tensorflow:Restoring parameters from models/model_WIDE_AND_DEEP/model.ckpt-10000


In [None]:
%%time
count = 0
for prediction in predictions:
    count += 1
    container.append(prediction['probabilities'][1])
    if count % 10000 == 0 :
        print(count)

INFO:tensorflow:Restoring parameters from models/model_WIDE_AND_DEEP/model.ckpt-5000
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000


---

### Save to submission.csv

In [None]:
cur_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
file_name = 'submission_' + cur_time + '.csv'

subm = pd.DataFrame()
subm['id'] = ids
subm['target'] = p_test_1

subm.to_csv(data_path + file_name, index=False, float_format = '%.5f')
print('saved as ' + data_path + file_name)