In [1]:
import math
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neural_network import MLPClassifier
%matplotlib inline

In [2]:
df_train = pd.read_csv('../Kaggle_Datasets/Facebook/train.csv')
df_test = pd.read_csv('../Kaggle_Datasets/Facebook/test.csv')

In [3]:
mini_df = df_train[(df_train.x > 7.0) & (df_train.x < 7.5) & (df_train.y > 3.0) & (df_train.y < 3.25)]

In [4]:
df_train = None
df_test = None

In [5]:
def apply_accu_class(x):
    rise_ranges = ((50, 90), (130, 160), (220, 310), (380, 600))
    fall_ranges = ((0, 40), (110, 120), (190, 210), (320, 370), )
    peak_ranges = ((90, 110), (160, 190), (310, 320), )
    trough_ranges = ((40, 50), (120, 130), (210, 220), (370, 380))
    class_dict = {rise_ranges: 1, fall_ranges: -1, peak_ranges: 2, trough_ranges: -2}
    for rang in [rise_ranges, fall_ranges, peak_ranges, trough_ranges]:
        for a, b in rang:
            if a <= x < b:
                return class_dict[rang]

In [6]:
def mod_df(mini_df):
    mini_df.loc[:, 'hour'] = mini_df.time // 60 % 24 + 1
    mini_df.loc[:, 'day'] = mini_df.time // (60*24) % 7 + 1
    mini_df.loc[:, 'week'] = mini_df.time // (60*24*7) % 52 + 1
    mini_df.loc[:, 'month'] = mini_df.time // (60*24*30) % 12 + 1
    mini_df.loc[:, 'year'] = mini_df.time // (60*24*365) + 1

    mini_df.loc[:, 'hours'] = mini_df.time // 60 + 1
    mini_df.loc[:, 'days'] = mini_df.time // (60*24) + 1
    mini_df.loc[:, 'weeks'] = mini_df.time // (60*24*7) + 1
    mini_df.loc[:, 'months'] = mini_df.time // (60*24*30) + 1
    mini_df.loc[:, 'years'] = mini_df.time // (60*24*365) + 1
    
    mini_df.loc[:, 'hour_float'] = mini_df.time / 60 % 24 + 1
    mini_df.loc[:, 'day_float'] = mini_df.time / (60*24) % 7 + 1
    mini_df.loc[:, 'week_float'] = mini_df.time / (60*24*7) % 52 + 1
    mini_df.loc[:, 'month_float'] = mini_df.time / (60*24*30) % 12 + 1
    mini_df.loc[:, 'year_float'] = mini_df.time / (60*24*365) + 1

    mini_df.loc[:, 'accu_class'] = mini_df.days.apply(apply_accu_class)
    mini_df.loc[:, 'log2_accuracy'] = np.log2(mini_df.accuracy) * 10
    mini_df.loc[:, 'log10_accuracy'] = np.log10(mini_df.accuracy) * 10
    mini_df.loc[:, 'log2_accuracy_int'] = (np.log2(mini_df.accuracy) * 10).astype(int)
    mini_df.loc[:, 'log10_accuracy_int'] = (np.log10(mini_df.accuracy) * 10).astype(int)
    
    return mini_df

In [7]:
mini_df = mod_df(mini_df)
mini_df.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour,day,week,month,...,hour_float,day_float,week_float,month_float,year_float,accu_class,log2_accuracy,log10_accuracy,log2_accuracy_int,log10_accuracy_int
301,301,7.3815,3.2452,12,467320,4836326665,13,3,47,11,...,13.666667,3.527778,47.361111,11.817593,1.889117,-1,35.849625,10.791812,35,10
2339,2339,7.3378,3.0791,8,215857,6676464189,22,3,22,5,...,22.616667,3.900694,22.414385,5.99669,1.410687,1,30.0,9.0309,30,9
2984,2984,7.2308,3.0956,61,588233,8100169726,12,3,7,2,...,12.883333,3.495139,7.356448,2.616505,2.119165,1,59.307373,17.853298,59,17
2993,2993,7.1892,3.2495,13,47081,7896926684,17,5,5,2,...,17.683333,5.695139,5.670734,2.089838,1.089576,-1,37.004397,11.139434,37,11
3336,3336,7.374,3.0347,76,649934,6144260946,9,4,13,4,...,9.233333,4.343056,13.477579,4.044769,2.236556,1,62.479275,18.808136,62,18


In [7]:
mini_df.columns

Index([u'row_id', u'x', u'y', u'accuracy', u'time', u'place_id', u'hour',
       u'day', u'week', u'month', u'year', u'hours', u'days', u'weeks',
       u'months', u'years', u'hour_float', u'day_float', u'week_float',
       u'month_float', u'year_float', u'accu_class', u'log2_accuracy',
       u'log10_accuracy', u'log2_accuracy_int', u'log10_accuracy_int'],
      dtype='object')

In [8]:
len(mini_df.place_id.unique())

1347

In [8]:
train, test = train_test_split(mini_df, random_state=88)

In [10]:
attributes = [u'x', u'y', u'accuracy', u'time', u'hour', u'day', u'week', u'month',
       u'year', u'log2_accuracy', u'log10_accuracy',]

In [11]:
model = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [12]:
model.fit(train.loc[:, attributes], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [13]:
model.score(test.loc[:, attributes], test.place_id)

0.53841488071168619

In [14]:
# Using hours, days... instead of hour, day...
attributes_2 = [u'x', u'y', u'accuracy', u'time', u'hours', u'days', u'weeks', u'months',
       u'year', u'log2_accuracy', u'log10_accuracy',]

In [15]:
model_2 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [16]:
model_2.fit(train.loc[:, attributes_2], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [17]:
model_2.score(test.loc[:, attributes_2], test.place_id)

0.44541043267286695

In [18]:
# Removing log2 and log10 accuracy
attributes_3 = [u'x', u'y', u'time', u'hour', u'day', u'week', u'month',
       u'year', u'accuracy',]

In [19]:
model_3 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [20]:
model_3.fit(train.loc[:, attributes_3], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [21]:
model_3.score(test.loc[:, attributes_3], test.place_id)

0.55388192478770726

In [22]:
# Using only log10 instead of accuracy
attributes_4 = [u'x', u'y', u'time', u'hour', u'day', u'week', u'month',
       u'year', u'log10_accuracy',]

In [23]:
model_4 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [24]:
model_4.fit(train.loc[:, attributes_4], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [25]:
model_4.score(test.loc[:, attributes_4], test.place_id)

0.55448847553578651

In [26]:
# Using only log2 instead of accuracy
attributes_4a = [u'x', u'y', u'time', u'hour', u'day', u'week', u'month',
       u'year', u'log2_accuracy',]

In [27]:
model_4a = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [28]:
model_4a.fit(train.loc[:, attributes_4a], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [29]:
model_4a.score(test.loc[:, attributes_4a], test.place_id)

0.55448847553578651

In [33]:
# Removing time
attributes_5 = [u'x', u'y', u'hour', u'day', u'week', u'month',
       u'year', u'log10_accuracy',]

In [34]:
model_5 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [35]:
model_5.fit(train.loc[:, attributes_5], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [36]:
model_5.score(test.loc[:, attributes_5], test.place_id)

0.53983016579053777

In [15]:
# Using float day, week and month instead of rounded ints
attributes_6 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', u'log2_accuracy',]

In [16]:
model_6 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [17]:
model_6.fit(train.loc[:, attributes_6], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [18]:
model_6.score(test.loc[:, attributes_6], test.place_id)

0.55913869793772741

In [19]:
# Using int accuracy
attributes_7 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', u'log2_accuracy_int',]

In [20]:
model_7 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [21]:
model_7.fit(train.loc[:, attributes_7], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [22]:
model_7.score(test.loc[:, attributes_7], test.place_id)

0.5612616255560049

In [23]:
# Using all three accuracies
attributes_8 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', u'accuracy', 'log2_accuracy', 'log10_accuracy']

In [24]:
model_8 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [25]:
model_8.fit(train.loc[:, attributes_8], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [26]:
model_8.score(test.loc[:, attributes_8], test.place_id)

0.54963606955115241

In [27]:
# Using float and integer values for day, hour, time e.t.c
attributes_9 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', 'log2_accuracy', 'hour', 'day', 'week', 'month']

In [28]:
model_9 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [29]:
model_9.fit(train.loc[:, attributes_9], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [30]:
model_9.score(test.loc[:, attributes_9], test.place_id)

0.54741205014152849

In [31]:
# Adding accu_class
attributes_10 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'month_float',
       u'year_float', u'log2_accuracy', 'accu_class']

In [32]:
model_10 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)

In [33]:
model_10.fit(train.loc[:, attributes_10], train.place_id)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
            oob_score=False, random_state=3, verbose=0, warm_start=True)

In [34]:
model_10.score(test.loc[:, attributes_10], test.place_id)

0.55883542256368779

In [9]:
# Keeping only hour, day and week floats. Removing rest
attributes_11 = [u'x', u'y', u'time', u'hour_float', u'day_float', u'week_float', u'log2_accuracy',]
model_11 = RandomForestClassifier(n_estimators=20, warm_start=True, random_state=3, n_jobs=-1)
model_11.fit(train.loc[:, attributes_11], train.place_id)
model_11.score(test.loc[:, attributes_11], test.place_id)

0.55064698746461782