**There are several ways to transform a timestamp into model features:**
1. timestamp -> number (example: Unix epoch time)
2. timestamp -> parsed timestamp (year, month, day, etc.) -> one-hot encoding
3. timestamp -> parsed timestamp (year, month, day, etc.) -> circular variable cos/sin projection

Which one is better?


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

timestamp -> number (Unix epoch time)

In [None]:
# Load the raw advertising dataset; later cells pick the 'Timestamp' and
# 'Clicked on Ad' columns out of this frame.
df = pd.read_csv(filepath_or_buffer="../input/advertising.csv")

In [None]:
# Keep only the timestamp and the label. The explicit .copy() makes this an
# independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
df = df[['Timestamp', 'Clicked on Ad']].copy()
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Convert to whole seconds since the Unix epoch. Subtracting the epoch and
# floor-dividing by a one-second Timedelta does not depend on the storage
# resolution of the datetime column, whereas `astype(np.int64) // 10**9` is
# only correct when the values are stored as nanoseconds.
df['Timestamp'] = (df['Timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
df.head()

In [None]:
# Label vector: the 'Clicked on Ad' column. It is reused as `target` by the
# later cells as well.
target = df['Clicked on Ad']

# scikit-learn expects a 2-D feature matrix, so turn the single epoch column
# into an array of shape (n_samples, 1).
epoch_values = df['Timestamp'].values
train = epoch_values[:, np.newaxis]

In [None]:
# Imported in this cell so that it works without touching the import cell.
from sklearn.pipeline import make_pipeline

# Plain logistic regression; this estimator object is reused by the later cells.
logreg = LogisticRegression(solver='lbfgs')

# Epoch seconds for present-day dates are on the order of 1e9, which makes the
# lbfgs fit badly conditioned, and the cos/sin variant further down is
# standardized, so an unscaled epoch feature is not a fair comparison.
# Standardizing inside a pipeline fits the scaler on the training part of each
# fold only. cross_val_score clones the estimator, so `logreg` itself stays
# unfitted.
epoch_model = make_pipeline(StandardScaler(), logreg)
cross_val_score(epoch_model, train, target, cv=10, scoring='accuracy')

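Interpretation: with a single time feature, logistic regression can only learn one cut-off datetime: "click" before it and "ignore" after it (or vice versa), so the accuracy is about 50/50.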

timestamp -> One Hot Encoding


In [None]:
# Reload the data and keep only the timestamp and the label. The explicit
# .copy() makes this an independent frame, so adding columns below does not
# trigger SettingWithCopyWarning.
df = pd.read_csv("../input/advertising.csv")
df = df[['Timestamp', 'Clicked on Ad']].copy()
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Split the timestamp into calendar components, one integer column each.
# Each pair is (new column name, attribute of the pandas `.dt` accessor).
for column, attr in [
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
    ('Minute', 'minute'),
    ('Second', 'second'),
    ('Weekday', 'dayofweek'),
]:
    df[column] = getattr(df['Timestamp'].dt, attr)

df.head()


In [None]:
# One-hot encode every calendar component. drop_first=True drops one dummy per
# component, since it is implied by the remaining ones.
train = pd.get_dummies(df, columns = ['Year', 'Month' ,'Day', 'Hour', 'Minute', 'Second', 'Weekday'], drop_first=True)
# The label and the raw timestamp are not features.
train.drop(['Clicked on Ad', 'Timestamp'], axis = 1, inplace=True)

# Recursive feature elimination with 10-fold cross-validation, removing one
# feature per step. `logreg` and `target` come from earlier cells.
rfecv = RFECV(estimator=logreg, step=1, cv=10, scoring='accuracy')
rfecv.fit(train, target)

print("Optimal number of features: %d" % rfecv.n_features_)
print('Best features:', ", ".join(list(train.columns[rfecv.support_])))

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# per-subset mean CV score now lives in `cv_results_`. Fall back to the old
# attribute only when running on a release that predates `cv_results_`.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
# With step=1 and the default minimum of one feature, entry i of the scores
# corresponds to i + 1 selected features.
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

# Keep only the columns RFECV selected.
train = train[list(train.columns[rfecv.support_])]

In [None]:
# Mean 10-fold accuracy on the one-hot features kept by RFECV.
# NOTE(review): the features were selected using all rows and the same labels,
# so this estimate is likely optimistic.
onehot_scores = cross_val_score(logreg, train, target, cv=10, scoring='accuracy')
onehot_scores.mean()

timestamp -> cos/sin projection

In [None]:
# Reload the data and keep only the timestamp and the label. The explicit
# .copy() makes this an independent frame, so adding columns below does not
# trigger SettingWithCopyWarning.
df = pd.read_csv("../input/advertising.csv")
df = df[['Timestamp', 'Clicked on Ad']].copy()
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Split the timestamp into calendar components, one integer column each.
# Each pair is (new column name, attribute of the pandas `.dt` accessor).
for column, attr in [
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
    ('Minute', 'minute'),
    ('Second', 'second'),
    ('Weekday', 'dayofweek'),
]:
    df[column] = getattr(df['Timestamp'].dt, attr)

def _add_cyclic(frame, name, position, period):
    """Add `<name>_sin` and `<name>_cos` columns to `frame` in place.

    `position` is the zero-based position inside the cycle and `period` is the
    cycle length (a scalar, or a Series when the length varies per row). The
    pair (sin, cos) places each value on the unit circle, so the last position
    of a cycle ends up next to the first one.
    """
    angle = 2 * np.pi * position / period
    frame[name + '_sin'] = np.sin(angle)
    frame[name + '_cos'] = np.cos(angle)


# Months are 1..12, so shift to 0..11 before projecting.
_add_cyclic(df, 'Month', df['Month'] - 1, 12)

# Days are 1..28/29/30/31 depending on the month. A fixed period of 30 would
# map day 31 to the angle 2*pi, i.e. exactly onto day 1, so use the real
# length of each row's month as the period.
_add_cyclic(df, 'Day', df['Day'] - 1, df['Timestamp'].dt.days_in_month)

# Hours, minutes, seconds and weekdays (Monday=0) already start at zero.
_add_cyclic(df, 'Hour', df['Hour'], 24)
_add_cyclic(df, 'Minute', df['Minute'], 60)
_add_cyclic(df, 'Second', df['Second'], 60)
_add_cyclic(df, 'Weekday', df['Weekday'], 7)

df.head()

In [None]:
# Keep only the sin/cos projections as features: drop the label, the raw
# timestamp and the integer calendar components they were derived from.
train = df.drop(['Clicked on Ad', 'Timestamp', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second', 'Weekday'], axis = 1)

# Derive the column list from the frame instead of hard-coding it twice; the
# order is the order in which the projections were created.
cyclic_cols = [col for col in train.columns if col.endswith(('_sin', '_cos'))]

# Standardize every projection to zero mean and unit variance. The scaled frame
# reuses train's index so that the assignment lines up row by row even if the
# index is not the default 0..n-1 range.
std = StandardScaler()
scaled = pd.DataFrame(std.fit_transform(train[cyclic_cols]), columns=cyclic_cols, index=train.index)
train[cyclic_cols] = scaled

train.describe()

In [None]:
# Recursive feature elimination with 10-fold cross-validation, removing one
# feature per step. `logreg` and `target` come from earlier cells.
rfecv = RFECV(estimator=logreg, step=1, cv=10, scoring='accuracy')
rfecv.fit(train, target)

print("Optimal number of features: %d" % rfecv.n_features_)
print('Best features:', ", ".join(list(train.columns[rfecv.support_])))

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# per-subset mean CV score now lives in `cv_results_`. Fall back to the old
# attribute only when running on a release that predates `cv_results_`.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
# With step=1 and the default minimum of one feature, entry i of the scores
# corresponds to i + 1 selected features.
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

# Keep only the columns RFECV selected.
train = train[list(train.columns[rfecv.support_])]

In [None]:
# Mean 10-fold accuracy on the cos/sin features kept by RFECV.
# NOTE(review): the features were selected using all rows and the same labels,
# so this estimate is likely optimistic.
cyclic_scores = cross_val_score(logreg, train, target, cv=10, scoring='accuracy')
cyclic_scores.mean()

Conclusion: the circular cos/sin projection is a strong approach, but one-hot encoding works better for this task (a holiday calendar might improve the features further).