# Feature Selection using global search

In this notebook, we search for the optimal combination of date columns. We can't guarantee that every column extracted from **Dates** will have a positive influence on our model. In this experiment, we test every possible combination of the date columns and select the features that give the best score (the lowest log loss).

In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
# Read the training set from its relative location and preview the first rows.
train_csv_path = "../data/train.csv"
train = pd.read_csv(train_csv_path)
train.head(3)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414


## Feature Engineering

### Convert the **Dates** column to numerical columns

In [3]:
from datetime import datetime  # not used by this cell any more; kept so the name stays in the namespace

# Split the "Dates" timestamp strings into numeric calendar columns.
#
# The whole column is parsed in one vectorized call rather than a Python-level
# loop over `Series.iteritems()` (removed in pandas 2.0) with a per-row
# `datetime.strptime`. The explicit format keeps parsing strict: a timestamp
# that does not match "%Y-%m-%d %H:%M:%S" still raises instead of being guessed.
parsed_dates = pd.to_datetime(train["Dates"], format="%Y-%m-%d %H:%M:%S")

dates_columns = ["Dates-Year", "Dates-Month", "Dates-Day", "Dates-Hour", "Dates-Minute", "Dates-Second"]

# Built on train's own index so the concat below lines up row for row.
dates_dataframe = pd.DataFrame(
    {
        "Dates-Year": parsed_dates.dt.year,
        "Dates-Month": parsed_dates.dt.month,
        "Dates-Day": parsed_dates.dt.day,
        "Dates-Hour": parsed_dates.dt.hour,
        "Dates-Minute": parsed_dates.dt.minute,
        "Dates-Second": parsed_dates.dt.second,
    },
    index=train.index,
)
dates_dataframe = dates_dataframe[dates_columns].astype("int32")

# Every "Dates-Second" value is zero (see the printed list), so the column
# carries no information and is removed.
second_list = dates_dataframe["Dates-Second"].unique()
print("list of seconds = {0}".format(second_list))

dates_dataframe = dates_dataframe.drop("Dates-Second", axis=1)

# Discard any Dates-* columns left over from an earlier run of this cell first,
# so that re-running it does not append duplicate columns to `train`.
train = pd.concat(
    [train.drop(columns=dates_dataframe.columns, errors="ignore"), dates_dataframe],
    axis=1,
)

train.head(3)

processing... 100000/878049
processing... 200000/878049
processing... 300000/878049
processing... 400000/878049
processing... 500000/878049
processing... 600000/878049
processing... 700000/878049
processing... 800000/878049
list of seconds = [0]


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Dates-Year,Dates-Month,Dates-Day,Dates-Hour,Dates-Minute
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23,53
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23,53
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,2015,5,13,23,33


## Score

In [12]:
from itertools import combinations
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.cross_validation import cross_val_score

base_score = 2.620481

default_feature_names = ["X", "Y"]
default_feature_names = default_feature_names

column_length = dates_dataframe.shape[1]

combination_result = []

for i in range(column_length):
    for column_cases in combinations(dates_dataframe.columns, i+1):
        feature_names = default_feature_names + list(column_cases)
        label_name = "Category"

        train_X = train[feature_names]
        train_y = train[label_name]

        model = BernoulliNB()
        %time score = cross_val_score(model, train_X, train_y, scoring='log_loss', cv=5).mean()
        score = -1.0 * score
        score_difference = score - base_score

        combination_text = ", ".join(column_cases)

        print("Score using \"{0}\" columns".format(combination_text))
        print("BernoulliNB = {0:.6f} ({1:+.6f})".format(score, score_difference))
        
        combination_result.append({
            'combination': combination_text,
            'model': "BernoulliNB",
            "score": score,
        })
        
combination_result_dataframe = pd.DataFrame.from_dict(combination_result)
combination_result_dataframe = combination_result_dataframe.set_index("combination")

combination_result_dataframe = combination_result_dataframe.sort("score")
combination_result_dataframe.head(5)

Score using "Dates-Year" columns
BernoulliNB = 2.680326 (+0.059845)
Score using "Dates-Month" columns
BernoulliNB = 2.680326 (+0.059845)
Score using "Dates-Day" columns
BernoulliNB = 2.680326 (+0.059845)
Score using "Dates-Hour" columns
BernoulliNB = 2.676709 (+0.056228)
Score using "Dates-Minute" columns
BernoulliNB = 2.623992 (+0.003511)
Score using "Dates-Year, Dates-Month" columns
BernoulliNB = 2.680327 (+0.059846)
Score using "Dates-Year, Dates-Day" columns
BernoulliNB = 2.680327 (+0.059846)
Score using "Dates-Year, Dates-Hour" columns
BernoulliNB = 2.676710 (+0.056229)
Score using "Dates-Year, Dates-Minute" columns
BernoulliNB = 2.623993 (+0.003512)
Score using "Dates-Month, Dates-Day" columns
BernoulliNB = 2.680327 (+0.059846)
Score using "Dates-Month, Dates-Hour" columns
BernoulliNB = 2.676710 (+0.056229)
Score using "Dates-Month, Dates-Minute" columns
BernoulliNB = 2.623993 (+0.003512)
Score using "Dates-Day, Dates-Hour" columns
BernoulliNB = 2.676710 (+0.056229)
Score using "



Unnamed: 0_level_0,model,score
combination,Unnamed: 1_level_1,Unnamed: 2_level_1
"Dates-Hour, Dates-Minute",BernoulliNB,2.620478
"Dates-Month, Dates-Hour, Dates-Minute",BernoulliNB,2.620479
"Dates-Year, Dates-Hour, Dates-Minute",BernoulliNB,2.620479
"Dates-Day, Dates-Hour, Dates-Minute",BernoulliNB,2.620479
"Dates-Year, Dates-Day, Dates-Hour, Dates-Minute",BernoulliNB,2.62048


## Result
  * default = 2.620481


  * Select only Dates-Hour and Dates-Minute = **2.620478** (-0.000003)
  * Select only Dates-Month, Dates-Hour and Dates-Minute = 2.620479 (-0.000002)
  * Select only Dates-Year, Dates-Hour and Dates-Minute = 2.620479 (-0.000002)
  * Select only Dates-Day, Dates-Hour and Dates-Minute = 2.620479 (-0.000002)
  * Select only Dates-Year, Dates-Day, Dates-Hour and Dates-Minute = 2.620480 (-0.000001)