# This notebook is for self-learning
## Copyright 2020 Sang Wook Kim
## Data source: https://www.kaggle.com/nikdavis/steam-store-games?select=steam_media_data.csv

In [1]:
# Import relevant modules
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
print("Imported modules.")

Imported modules.


In [2]:
# Import database. Since this is practice, I will look a whole dataset first.
df_steam = pd.read_csv("/home/sang/UVM/database/Steam/steam.csv")

In [3]:
df_steam.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [4]:
df_steam = df_steam.drop(labels =['appid','name'], axis = 1)
df_steam.head()

Unnamed: 0,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [5]:
# Quick view
df_steam.describe()

Unnamed: 0,english,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,price
count,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0
mean,0.981127,0.354903,45.248864,1000.559,211.027147,149.804949,146.05603,6.078193
std,0.136081,2.406044,352.670281,18988.72,4284.938531,1827.038141,2353.88008,7.874922
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,6.0,2.0,0.0,0.0,1.69
50%,1.0,0.0,7.0,24.0,9.0,0.0,0.0,3.99
75%,1.0,0.0,23.0,126.0,42.0,0.0,0.0,7.19
max,1.0,18.0,9821.0,2644404.0,487076.0,190625.0,190625.0,421.99


# 1. Insights and data preperation
## label : bucketized number of owners
## features
highly informative: 'categories', 'genres', 'steamspy_tags', 'positive_ratings', 'negative_ratings', 'price'

less informative: 'english', 'platforms', 'required_age','achievements'

bias risk: 'release_date', 'developer', 'publisher'

questions

1. "Does the release_date has strong correlation with # of owners?"

2. "Does developer or publisher independently affect # of owners?" - Credit? PR?

In [6]:
bucketized_label = sorted(list(pd.unique(df_steam.owners)))
bucketized_label

['0-20000',
 '100000-200000',
 '1000000-2000000',
 '10000000-20000000',
 '100000000-200000000',
 '20000-50000',
 '200000-500000',
 '2000000-5000000',
 '20000000-50000000',
 '50000-100000',
 '500000-1000000',
 '5000000-10000000',
 '50000000-100000000']

In [7]:
# transfer labels to its boundary
df_owners = df_steam['owners'].apply(lambda x: float(x.split('-')[1]))
df_owners_log = df_steam['owners'].apply(lambda x: np.log(float(x.split('-')[1])))

In [8]:
df_owners_log.unique()

array([16.81124283, 16.11809565, 15.42494847, 17.72753356, 19.11382792,
       18.42068074, 10.81977828, 13.81551056, 12.20607265, 11.51292546,
       14.50865774, 13.12236338,  9.90348755])

In [9]:
df_owners_buckets = df_owners_log.apply(lambda x: round(x)-9)
df_owners_buckets.unique()

array([ 8,  7,  6,  9, 10,  2,  5,  3,  4,  1])

## To answer the first question, let's find the mean value of the dates
("Does the release_date has strong correlation with # of owners?")

In [10]:
from datetime import datetime

In [11]:
question1 = pd.concat(
    [df_owners,df_steam.release_date.apply(lambda x: datetime.fromisoformat(x).timestamp())],
    axis = 1,)

In [12]:
mean_date = question1.groupby('owners').mean()
mean_date['release_date_format'] = mean_date.release_date.apply(
    lambda x: datetime.fromtimestamp(int(x)).isoformat()
    )
mean_date

Unnamed: 0_level_0,release_date,release_date_format
owners,Unnamed: 1_level_1,Unnamed: 2_level_1
20000.0,1505201000.0,2017-09-12T03:23:00
50000.0,1457225000.0,2016-03-05T19:40:01
100000.0,1444877000.0,2015-10-14T22:45:07
200000.0,1432374000.0,2015-05-23T05:36:47
500000.0,1414970000.0,2014-11-02T18:18:37
1000000.0,1392958000.0,2014-02-20T23:51:06
2000000.0,1382017000.0,2013-10-17T09:29:10
5000000.0,1379642000.0,2013-09-19T21:57:49
10000000.0,1325347000.0,2011-12-31T10:50:52
20000000.0,1296316000.0,2011-01-29T10:51:25


## The release_date may be related to the target label, but it's not very useful.

# 2. data selection

In [13]:
# Our label again,
df_owners_buckets.describe()

count    27075.000000
mean         1.663564
std          1.188275
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         10.000000
Name: owners, dtype: float64

In [14]:
# quick view of categorical features
df_categs = df_steam[['platforms','genres','steamspy_tags']]

In [15]:
def semicolon_spliter(x):
    return np.array(x.split(';'))

def voca_list(series):
    voca = []
    for items in series:
        for item in items:
            if item not in voca:
                voca.append(item)
    return voca

In [16]:
# 'platforms','genres',and 'steamspy_tags' have one or more values
split_platforms = df_categs['platforms'].apply(semicolon_spliter)
split_genres = df_categs['genres'].apply(semicolon_spliter)
split_steamspy_tags = df_categs['steamspy_tags'].apply(semicolon_spliter)

In [17]:
split_platforms

0        [windows, mac, linux]
1        [windows, mac, linux]
2        [windows, mac, linux]
3        [windows, mac, linux]
4        [windows, mac, linux]
                 ...          
27070                [windows]
27071                [windows]
27072                [windows]
27073           [windows, mac]
27074           [windows, mac]
Name: platforms, Length: 27075, dtype: object

In [18]:
list_platforms = voca_list(split_platforms)
list_genres = voca_list(split_genres)
list_steamspy_tags = voca_list(split_steamspy_tags)
print('length of voca list:')
print('platforms :', len(list_platforms))
print('genres :', len(list_genres))
print('steamspy tag :', len(list_steamspy_tags))
print('show voca list if its length is less than 5')
for lst in (list_platforms,list_genres,list_steamspy_tags):
    if len(lst) < 5:
        print(lst)

length of voca list:
platforms : 3
genres : 29
steamspy tag : 339
show voca list if its length is less than 5
['windows', 'mac', 'linux']


### platform and tag are not single-value features. For simplicity, I will drop 'steamspy tag'

In [19]:
# One way to encode this feature is generating new columns for each items in the list
def indicator_df_from_series(series, item_list):
    frame = {}
    for item in tuple(item_list):
        frame[item] = series.apply(lambda x: int(item in x))
    result = pd.DataFrame(frame)
    return result

# Another way to encode platform feature
def platforms_code(x):
    if 'windows' in x:
        if len(x) > 1:
            return 'w+'
        else:
            return 'onlywin'
    else:
        return 'else'
    
# regularize dimension of data
def regularize_data(x, voca_list):
    counter = len(voca_list) - len(list(x))
    x = list(x)
    if counter < 0:
        print('Invalid voca_list')
        return x
    else:
        for i in range(counter):
            x.append(0)
        return x

In [20]:
# 1. Use platforms_code and just select the first component of 'genres'
df_platforms = split_platforms.apply(platforms_code)
df_genres = split_genres.apply(lambda x: x[0])

### Take home question: Is there any better configuration for these features?

In [21]:
df_target = df_steam[['price','positive_ratings','negative_ratings',]]
df_target = df_target.join(df_platforms)
df_target = df_target.join(df_genres)
df_target = df_target.join(df_owners_buckets)

In [22]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_target, test_size=0.1)
train, val = train_test_split(train, test_size=0.2)
print(
    "We have %d samples for training, %d for validation, and %d for test"
    % (len(train), len(val), len(test))
)

We have 19493 samples for training, 4874 for validation, and 2708 for test


In [23]:
def dataframe_to_dataset(dataframe):
    dataframe = dataframe.copy()
    labels = dataframe.pop("owners")
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    ds = ds.shuffle(buffer_size=len(dataframe))
    return ds


train_ds = dataframe_to_dataset(train)
val_ds = dataframe_to_dataset(val)

In [24]:
train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)

In [25]:
from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import StringLookup


def encode_numerical_feature(feature, name, dataset):
    # Create a Normalization layer for our feature
    normalizer = Normalization()

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the statistics of the data
    normalizer.adapt(feature_ds)

    # Normalize the input feature
    encoded_feature = normalizer(feature)
    return encoded_feature


def encode_categorical_feature(feature, name, dataset, is_string):
    lookup_class = StringLookup if is_string else IntegerLookup
    # Create a lookup layer which will turn strings into integer indices
    lookup = lookup_class(output_mode="binary")

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the set of possible string values and assign them a fixed integer index
    lookup.adapt(feature_ds)

    # Turn the string input into integer indices
    encoded_feature = lookup(feature)
    return encoded_feature

In [26]:
print(train.columns)
print(len(train.columns))

Index(['price', 'positive_ratings', 'negative_ratings', 'platforms', 'genres',
       'owners'],
      dtype='object')
6


In [27]:
# Categorical features encoded as integers
platforms = keras.Input(shape=(1,), name="platforms", dtype="string")
genres = keras.Input(shape=(1,), name="genres", dtype="string")

# Numerical features
price = keras.Input(shape=(1,), name="price")
positive_ratings = keras.Input(shape=(1,), name="positive_ratings")
negative_ratings = keras.Input(shape=(1,), name="negative_ratings")

all_inputs = [
    platforms,
    genres,
    price,
    positive_ratings,
    negative_ratings,
]

# Integer categorical features
platforms_encoded = encode_categorical_feature(platforms, "platforms", train_ds, True)

# String categorical features
genres_encoded = encode_categorical_feature(genres, "genres", train_ds, True)

# Numerical features
price_encoded = encode_numerical_feature(price, "price", train_ds)
positive_ratings_encoded = encode_numerical_feature(positive_ratings, "positive_ratings", train_ds)
negative_ratings_encoded = encode_numerical_feature(negative_ratings, "negative_ratings", train_ds)

all_features = layers.concatenate(
    [
        platforms_encoded,
    genres_encoded,
    price_encoded,
    positive_ratings_encoded,
    negative_ratings_encoded,
    ]
)

In [28]:
all_features

<KerasTensor: shape=(None, 33) dtype=float32 (created by layer 'concatenate')>

In [29]:
x = None
x = layers.Dense(8, activation="relu")(all_features)
x = layers.Dropout(0.5)(x)
x = layers.Dense(8, activation="relu")(x)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1)(x)
model = None
model = keras.Model(all_inputs, output)

In [30]:
LEARNING_RATE = 0.005 #@param
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = LEARNING_RATE),  
              loss="mean_squared_error",
              metrics=[tf.keras.metrics.MeanSquaredError()]
             )

In [31]:
# keras.utils.plot_model(model, show_shapes=True)

In [32]:
model.fit(train_ds, epochs=50, validation_data=val_ds)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f8920199c70>

# This is bad model
# How can I improve it?

In [33]:
import scipy.stats as stats

In [34]:
vfunc = np.vectorize(lambda x : 1/(1 + np.exp(-x)))

In [35]:
df_price = vfunc(stats.zscore(df_steam['price']))
df_pr = vfunc(stats.zscore(df_steam['positive_ratings']))
df_nr = vfunc(stats.zscore(df_steam['negative_ratings']))
df_price = pd.Series(data=df_price, name='price', dtype=float,)
df_pr = pd.Series(data=df_pr, name='positive_ratings', dtype=float,)
df_nr = pd.Series(data=df_nr, name='negative_ratings', dtype=float,)

In [36]:
df_platforms
df_genres = split_genres.apply(lambda x: regularize_data(x, list_genres))
df_genres_1 = df_genres.apply(lambda x: str(x[0]))
df_genres_2 = df_genres.apply(lambda x: str(x[1]))
df_genres_3 = df_genres.apply(lambda x: str(x[2]))
df_genres_1 = df_genres_1.rename("genre_1")
df_genres_2 = df_genres_2.rename("genre_2")
df_genres_3 = df_genres_3.rename("genre_3")

In [37]:
labels = df_owners_buckets

In [38]:
# df_target = pd.concat(
#     [df_price,df_pr,df_nr,df_platforms,df_genres,labels,],
#     axis = 1)

df_target = pd.concat(
    [df_price,df_pr,df_nr,df_platforms,df_genres_1,df_genres_2,df_genres_3,labels,],
    axis = 1)

In [39]:
df_target

Unnamed: 0,price,positive_ratings,negative_ratings,platforms,genre_1,genre_2,genre_3,owners
0,0.535238,0.998507,0.674807,w+,Action,0,0,8
1,0.434092,0.530474,0.524600,w+,Action,0,0,7
2,0.434092,0.531759,0.510907,w+,Action,0,0,7
3,0.434092,0.503587,0.503266,w+,Action,0,0,7
4,0.434092,0.555716,0.504491,w+,Action,0,0,7
...,...,...,...,...,...,...,...,...
27070,0.376026,0.486869,0.487690,onlywin,Adventure,Casual,Indie,1
27071,0.364185,0.486935,0.487748,onlywin,Action,Adventure,Indie,1
27072,0.434092,0.486830,0.487748,onlywin,Action,Casual,Indie,1
27073,0.471832,0.486856,0.487690,w+,Adventure,Casual,Indie,1


In [40]:
df_target['genre_2'].iloc[0]

'0'

In [41]:
train, test = train_test_split(df_target, test_size=0.1)
train, val = train_test_split(train, test_size=0.2)
print(
    "We have %d samples for training, %d for validation, and %d for test"
    % (len(train), len(val), len(test))
)

We have 19493 samples for training, 4874 for validation, and 2708 for test


In [42]:
train_ds = dataframe_to_dataset(train)
val_ds = dataframe_to_dataset(val)
train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)

In [43]:
train_ds

<BatchDataset shapes: ({price: (None,), positive_ratings: (None,), negative_ratings: (None,), platforms: (None,), genre_1: (None,), genre_2: (None,), genre_3: (None,)}, (None,)), types: ({price: tf.float64, positive_ratings: tf.float64, negative_ratings: tf.float64, platforms: tf.string, genre_1: tf.string, genre_2: tf.string, genre_3: tf.string}, tf.int64)>

In [44]:
len(df_genres[0])

29

In [45]:
# Categorical features encoded as integers
platforms = keras.Input(shape=(1,), name="platforms", dtype="string")
# genres = keras.Input(shape=(1,), name="genres")
genre_1 = keras.Input(shape=(1,), name="genre_1", dtype="string")
genre_2 = keras.Input(shape=(1,), name="genre_2", dtype="string")
genre_3 = keras.Input(shape=(1,), name="genre_3", dtype="string")

# Numerical features
price = keras.Input(shape=(1,), name="price")
positive_ratings = keras.Input(shape=(1,), name="positive_ratings")
negative_ratings = keras.Input(shape=(1,), name="negative_ratings")

all_inputs = [
    platforms,
    genre_1,
    genre_2,
    genre_3,
    price,
    positive_ratings,
    negative_ratings,
]

# # String categorical features
# platforms_col = tf.feature_column.categorical_column_with_vocabulary_list(
#       "platforms", ['w+', 'onlywin', 'else'])

# platforms_encoded = tf.feature_column.indicator_column(platforms_col)

# # String categorical features
# genres_col = tf.feature_column.categorical_column_with_vocabulary_list(
#       "genres", list_genres)

# genres_encoded = tf.feature_column.indicator_column(genres_col)

# # Numerical features
# price_encoded = tf.feature_column.numeric_column("price")
# positive_ratings_encoded = tf.feature_column.numeric_column("positive_ratings")
# negative_ratings_encoded = tf.feature_column.numeric_column("negative_ratings")

# all_features = layers.DenseFeatures(
#     [
#         platforms_encoded,
#     genres_encoded,
#     price_encoded,
#     positive_ratings_encoded,
#     negative_ratings_encoded,
#     ]
# )

# Integer categorical features
platforms_encoded = encode_categorical_feature(platforms, "platforms", train_ds, True)

# String categorical features
genre_encoded_1 = encode_categorical_feature(genre_1, "genre_1", train_ds, True)
genre_encoded_2 = encode_categorical_feature(genre_2, "genre_2", train_ds, True)
genre_encoded_3 = encode_categorical_feature(genre_3, "genre_3", train_ds, True)


# Numerical features
price_encoded = encode_numerical_feature(price, "price", train_ds)
positive_ratings_encoded = encode_numerical_feature(positive_ratings, "positive_ratings", train_ds)
negative_ratings_encoded = encode_numerical_feature(negative_ratings, "negative_ratings", train_ds)

all_features = layers.concatenate(
    [
        platforms_encoded,
        genre_encoded_1,
        genre_encoded_2,
        genre_encoded_3,
        price_encoded,
        positive_ratings_encoded,
        negative_ratings_encoded,
    ]
)

In [46]:
all_features

<KerasTensor: shape=(None, 86) dtype=float32 (created by layer 'concatenate_1')>

In [47]:
x = all_features
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(16, activation="relu")(x)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1)(x)
model = None
model = keras.Model(all_inputs, output)

In [48]:
LEARNING_RATE = 0.005 #@param
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = LEARNING_RATE),  
              loss="mean_squared_error",
              metrics=[tf.keras.metrics.MeanSquaredError()]
             )

In [49]:
model.fit(train_ds, epochs=50, validation_data=val_ds)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f8920254b50>

# Somewhat better

# Let us evaluate our model

In [51]:
test_ds = dataframe_to_dataset(test)
test_ds = test_ds.batch(32)

In [54]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(test_ds)
print("test loss, test mean_squared_error:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 3 samples")
predictions = model.predict(test_ds)
print("predictions shape:", predictions.shape)

Evaluate on test data
test loss, test mean_squared_error: [0.29332077503204346, 0.29332077503204346]
Generate predictions for 3 samples
predictions shape: (2708, 1)


In [67]:
print("label          predicted")
print("value          value")
print("in original   in original")
print("--------------------------------------")

for i in range(30):
    print ("%f %f" % ( test['owners'].iloc[i], predictions[i] ))

label          predicted
value          value
in original   in original
--------------------------------------
3.000000 2.992354
1.000000 1.115811
1.000000 1.192207
1.000000 2.616232
1.000000 1.220362
1.000000 1.092408
1.000000 1.189553
3.000000 1.108123
1.000000 3.548675
1.000000 1.117940
4.000000 1.177535
2.000000 4.128572
1.000000 4.449903
1.000000 1.146051
1.000000 1.193996
1.000000 2.397537
1.000000 3.128413
3.000000 1.278032
1.000000 3.065299
1.000000 1.075129
1.000000 1.146791
2.000000 1.094634
4.000000 0.998720
1.000000 3.037426
1.000000 2.760513
3.000000 1.203378
2.000000 2.312987
1.000000 1.309136
1.000000 1.142322
3.000000 1.818723


## I used bucketized label and designed regression model, not classification model. It's because the numer of owner is actually not a 'class' but 'scalar'. As a result, the prediction gives 'float' number. This is not perfect model, which is due to lack of good information. Also, positive and negative rate cannot be collected before they are released. Thus, this model is not good for prediction model.