In [50]:
import pandas as pd
from datetime import datetime
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [51]:
# Load the raw advertising dataset from the notebook's working directory.
raw_csv_path = 'advertising.csv'
df = pd.read_csv(raw_csv_path)

In [52]:
def convert_datetime(date):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a ``datetime`` object.

    Parameters
    ----------
    date : str
        Timestamp text in exactly the ``%Y-%m-%d %H:%M:%S`` layout.

    Returns
    -------
    datetime.datetime
        The parsed (timezone-naive) timestamp.

    Raises
    ------
    ValueError
        If ``date`` does not match the expected layout.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(date, timestamp_format)
    return parsed

# Parse the whole 'Timestamp' column in one vectorised call instead of a
# Python-level apply per row. The explicit format is the same one
# convert_datetime() uses, so a malformed timestamp still raises rather than
# being silently guessed.
df['Datetime'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S')

In [53]:
# Index the frame by the parsed timestamp and drop the raw string column it
# was derived from. Chained, non-inplace calls keep this as one expression.
df = df.set_index('Datetime').drop(columns='Timestamp')
df.head()

Unnamed: 0_level_0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Clicked on Ad
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-03-27 00:53:11,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,0
2016-04-04 01:39:02,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,0
2016-03-13 20:35:42,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,0
2016-01-10 02:31:19,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,0
2016-06-03 03:36:18,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,0


# Statistically Significant Train/Test File Creation

In [114]:
# Select the four numeric feature columns, in the order used for the column
# labels further down. 'Datetime' is the index at this point, not a column, so
# it is not listed; DataFrame.filter silently ignored it. Explicit indexing
# raises KeyError on a misspelled column instead of dropping it quietly.
orig_stat_significant = df[['Daily Time Spent on Site', 'Area Income', 'Daily Internet Usage', 'Age']]

In [115]:
# Standardise every selected column. fit_transform hands back a NumPy array,
# so the column labels are lost from here until the DataFrame is rebuilt.
# NOTE(review): the scaler is fit on every row; if a train/test split is made
# later from the saved file, test-set statistics leak into the scaling -
# confirm this is intended.
feature_scaler = StandardScaler()
orig_stat_significant = feature_scaler.fit_transform(orig_stat_significant)

In [121]:
# Target labels as a 1-D NumPy array.
y = df['Clicked on Ad'].to_numpy()

In [124]:
# Append the label as the final column. np.c_ adds it after whatever feature
# columns are present, so no column index has to be hard-coded.
orig_stat_significant = np.c_[orig_stat_significant, y]

In [125]:
# Inspect the array: the scaled feature columns followed by the click label.
orig_stat_significant

array([[ 0.24926659,  0.50969109,  1.73403   , -0.11490498,  0.        ],
       [ 0.96113227,  1.00253021,  0.31380538, -0.57042523,  0.        ],
       [ 0.28208309,  0.35694859,  1.28758905, -1.13982553,  0.        ],
       ...,
       [-0.84377541, -0.93857029, -1.35892388,  1.707176  ,  1.        ],
       [-0.59638946, -0.97548353,  0.18117208, -1.93698596,  0.        ],
       [-1.26155474, -1.87383208, -0.0376045 , -1.13982553,  1.        ]])

In [126]:
# Re-attach column labels to the scaled array; the order must match the
# feature selection above, with the label last.
stat_sig_columns = [
    'Daily Time Spent on Site',
    'Area Income',
    'Daily Internet Usage',
    'Age',
    'Clicked on Ad',
]
orig_stat_significant = pd.DataFrame(orig_stat_significant, columns=stat_sig_columns)

In [129]:
# Preview the first rows of the labelled frame before it is written to disk.
orig_stat_significant.head()

Unnamed: 0,Daily Time Spent on Site,Area Income,Daily Internet Usage,Age,Clicked on Ad
0,0.249267,0.509691,1.73403,-0.114905,0.0
1,0.961132,1.00253,0.313805,-0.570425,0.0
2,0.282083,0.356949,1.287589,-1.139826,0.0
3,0.577432,-0.014456,1.50158,-0.798185,0.0
4,0.212664,1.408868,1.038731,-0.114905,0.0


In [130]:
# Write the scaled features plus label to CSV, omitting the positional index.
stat_sig_csv_path = 'orig_stat_sig.csv'
orig_stat_significant.to_csv(stat_sig_csv_path, index=False)

# New + Statistically Significant Column File Creation

In [131]:
class DateFrameSelector(TransformerMixin, BaseEstimator):
    """Pipeline step that pulls chosen columns out of a DataFrame as a NumPy array.

    Parameters
    ----------
    attribs : column label or list of column labels
        Passed straight to ``X[...]`` in :meth:`transform`.
    """

    def __init__(self, attribs):
        self.attribs = attribs

    def fit(self, X, y=None):
        """Stateless: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.attribs]`` converted with ``to_numpy()``."""
        selected = X[self.attribs]
        return selected.to_numpy()

In [132]:
class string_info_creation(TransformerMixin, BaseEstimator):
    """Replace a text column with two numeric columns: character length and word count.

    Parameters
    ----------
    ad_topic_line_idx : int, default=4
        Position of the text column inside the 2-D array given to
        :meth:`transform`. The default matches the previously hard-coded value.
    """

    def __init__(self, ad_topic_line_idx=4):
        # Stored under the parameter's own name so BaseEstimator.get_params()
        # and clone() can see it.
        self.ad_topic_line_idx = ad_topic_line_idx

    def fit(self, X, y=None):
        """Stateless: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Drop the text column and append its length and word count.

        Parameters
        ----------
        X : 2-D array whose column ``ad_topic_line_idx`` holds strings.

        Returns
        -------
        numpy.ndarray
            ``X`` without the text column, followed by a ``lengths`` column
            and a ``words`` column (in that order).
        """
        # Slice the text column once and reuse it for both derived features.
        topic_lines = X[:, self.ad_topic_line_idx]
        lengths = np.array([len(line) for line in topic_lines])
        words = np.array([self.number_of_words(line) for line in topic_lines])

        remaining = np.delete(X, self.ad_topic_line_idx, 1)
        return np.c_[remaining, lengths, words]

    def number_of_words(self, string):
        """Count whitespace-separated words in ``string``.

        ``str.split()`` with no argument collapses runs of whitespace and
        ignores leading/trailing whitespace, so ``'a  b '`` counts as 2 and an
        empty string counts as 0 (counting spaces and adding one gave 4 and 1).
        """
        return len(string.split())

In [133]:
class new_column_creation(TransformerMixin, BaseEstimator):
    """Append six ratio features and drop the helper length / word-count columns.

    The input array is read by position, using the column indices set in
    ``__init__``: site time, age, area income, internet time, text length,
    word count.
    """

    def __init__(self):
        self.site_time_idx = 0
        self.age_idx = 1
        self.area_income_idx = 2
        self.internet_time_idx = 3
        self.length_idx = 4
        self.words_idx = 5

    def fit(self, X, y=None):
        """Stateless: nothing is learned, the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` minus the length/words columns, plus the ratio columns.

        Output column order: the remaining input columns, then
        site/internet, age/site, income/site, length/site, length/age,
        income/length.

        NOTE(review): a zero in site time, internet time, age or length makes
        these divisions fail or produce inf/nan depending on the array dtype -
        confirm the inputs are strictly positive.
        """
        site_time = X[:, self.site_time_idx]
        internet_time = X[:, self.internet_time_idx]
        age = X[:, self.age_idx]
        area_income = X[:, self.area_income_idx]
        length = X[:, self.length_idx]

        site_internet_time_ratio = site_time / internet_time
        age_divided_site_time = age / site_time
        income_divided_site_time = area_income / site_time
        length_divided_site_time = length / site_time
        length_divided_age = length / age
        income_divided_length = area_income / length

        # Remove both helper columns in a single call. Deleting them one after
        # the other needs index-shift arithmetic (words_idx - 1) that is only
        # right while words_idx comes after length_idx.
        X = np.delete(X, [self.length_idx, self.words_idx], axis=1)

        return np.c_[
            X,
            site_internet_time_ratio,
            age_divided_site_time,
            income_divided_site_time,
            length_divided_site_time,
            length_divided_age,
            income_divided_length,
        ]

In [134]:
# Columns fed to the feature branch. 'Ad Topic Line' is text; it is included
# because the string_info step turns it into length / word-count columns.
num_cols = [
    'Daily Time Spent on Site',
    'Age',
    'Area Income',
    'Daily Internet Usage',
    'Ad Topic Line',
]

# Feature branch: select -> derive text stats -> add ratio columns -> scale.
num_steps = [
    ('dateframe_selector', DateFrameSelector(num_cols)),
    ('string_info', string_info_creation()),
    ('column_creation', new_column_creation()),
    ('standard scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Label branch: passes the target column through unchanged as a NumPy column.
label_steps = [
    ('dateframe_selector', DateFrameSelector(['Clicked on Ad'])),
]
label_pipeline = Pipeline(steps=label_steps)

In [135]:
# Combine the two branches side by side: feature columns first, label last.
union_branches = [
    ('num_pipeline', num_pipeline),
    ('label pipeline', label_pipeline),
]
full_pipeline = FeatureUnion(transformer_list=union_branches)

In [136]:
# Fit and apply both branches of the union to the full frame; the result holds
# the feature branch's columns followed by the label column.
data = full_pipeline.fit_transform(df)

In [138]:
# Labels for the pipeline output, in the order it emits them: the four
# original numeric columns, the six ratio columns, then the label.
cols = [
    'Daily Time Spent on Site',
    'Age',
    'Area Income',
    'Daily Internet Usage',
    'Site/Internet',
    'Age/Site',
    'Income/Site',
    'Length/Site',
    'Length/Age',
    'Income/Length',
    'Clicked on Ad',
]

feature_engineered = pd.DataFrame(data=data, columns=cols)
feature_engineered.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Site/Internet,Age/Site,Income/Site,Length/Site,Length/Age,Income/Length,Clicked on Ad
0,0.249267,-0.114905,0.509691,1.73403,-1.056207,-0.371712,0.029738,-0.303311,-0.044433,0.244288,0.0
1,0.961132,-0.570425,1.00253,0.313805,0.414323,-0.81595,-0.125672,-0.646126,0.370412,0.625646,0.0
2,0.282083,-1.139826,0.356949,1.287589,-0.807398,-0.860389,-0.098906,-0.463918,0.813883,0.341744,0.0
3,0.577432,-0.798185,-0.014456,1.50158,-0.72803,-0.798689,-0.530684,-0.274248,0.963123,-0.417752,0.0
4,0.212664,-0.114905,1.408868,1.038731,-0.712515,-0.355932,0.683558,-0.644242,-0.517234,1.675299,0.0


In [140]:
# Write the engineered feature set plus label to CSV, omitting the positional index.
engineered_csv_path = 'feature_engineered_cols.csv'
feature_engineered.to_csv(engineered_csv_path, index=False)