In [491]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime

In [492]:
ALPHA = 0.05

In [493]:
df = pd.read_csv('advertising.csv')
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


## Quick Feature Additions

In [494]:
# Add the # of words in each ad topic line.

def word_count(string):
    return string.count(" ") + 1

df['Line_Words'] = df['Ad Topic Line'].apply(lambda row: word_count(row))

In [495]:
# Add the length of each ad topic line.

def line_length(string):
    return len(string)

df['Line_Length'] = df['Ad Topic Line'].apply(lambda row: line_length(row))

In [496]:
# Change the Male category to 'Male' and 'Female'

def gender_label_encode(label):
    if label == 0:
        return 'Female'
    else: 
        return 'Male'
    
df['Gender Labeled'] = df['Male'].apply(lambda row: gender_label_encode(row))

In [497]:
# Add a column for the datetime object version of the timestamp

def convert_datetime(time):
    return datetime.strptime(time, '%Y-%m-%d %H:%M:%S')

df['Datetime'] = df['Timestamp'].apply(lambda row: convert_datetime(row))

In [498]:
# Add the month column.

def extract_month(time): # There are only 7 months in this dataset.
    return time.strftime('%B')

df['Month'] = df['Datetime'].apply(lambda row: extract_month(row))

In [499]:
# Add the AM/PM column.

def extract_am_pm(time):
    return time.strftime('%p')

df['AM/PM'] = df['Datetime'].apply(lambda row: extract_am_pm(row))

In [500]:
# Add the hour column.

def extract_hour(time):
    return time.strftime('%H')

df['Hour'] = df['Datetime'].apply(lambda row: extract_hour(row))

In [501]:
# Add the day time column.

def day_time(hour):
    hour = int(hour)
    
    if hour > 5 and hour <= 12:
        return 'morning'
    elif hour > 12 and hour <= 16:
        return 'afternoon'
    elif hour > 16 and hour <= 18:
        return 'evening'
    else:
        return 'night'

df['Day Time'] = df['Hour'].apply(lambda row: day_time(row))

In [502]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,Line_Words,Line_Length,Gender Labeled,Datetime,Month,AM/PM,Hour,Day Time
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0,3,34,Female,2016-03-27 00:53:11,March,AM,0,night
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0,3,34,Male,2016-04-04 01:39:02,April,AM,1,night
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0,3,32,Female,2016-03-13 20:35:42,March,PM,20,night
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0,3,37,Male,2016-01-10 02:31:19,January,AM,2,night
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0,3,29,Female,2016-06-03 03:36:18,June,AM,3,night


In [503]:
df.nunique()

Daily Time Spent on Site     900
Age                           43
Area Income                 1000
Daily Internet Usage         966
Ad Topic Line               1000
City                         969
Male                           2
Country                      237
Timestamp                   1000
Clicked on Ad                  2
Line_Words                     4
Line_Length                   35
Gender Labeled                 2
Datetime                    1000
Month                          7
AM/PM                          2
Hour                          24
Day Time                       4
dtype: int64

In [504]:
cats = list(df.select_dtypes(include=['object']).columns)
nums = list(df.select_dtypes(exclude=['object']).columns)

nums.remove('Male') # Male fits better as a categorical variable.
nums.remove('Clicked on Ad') # This is the label.
nums.remove('Line_Words') # This is the label.
nums.remove('Line_Length') # This is the label.
nums.remove('Age') # This is a label.
nums.remove('Datetime') # This itself won't give any useful information, which is why we extracted certain information.

cats.remove('Ad Topic Line') # This doesn't really give insights in terms of data science or data visualization.
cats.remove('Timestamp') # This is a unique value so this wouldn't give any insights either.
cats.remove('City') # This has a unique value for almost every row, so it won't give any new information.
cats.extend(['Line_Words', 'Line_Length', 'Age'])

In [505]:
print(f'nums: {nums}')
print(f'cats: {cats}')

nums: ['Daily Time Spent on Site', 'Area Income', 'Daily Internet Usage']
cats: ['Country', 'Gender Labeled', 'Month', 'AM/PM', 'Hour', 'Day Time', 'Line_Words', 'Line_Length', 'Age']


In [506]:
feature_analysis_cols = cats + nums

X = df[feature_analysis_cols]
y = df['Clicked on Ad']

In [507]:
X.head()

Unnamed: 0,Country,Gender Labeled,Month,AM/PM,Hour,Day Time,Line_Words,Line_Length,Age,Daily Time Spent on Site,Area Income,Daily Internet Usage
0,Tunisia,Female,March,AM,0,night,3,34,35,68.95,61833.9,256.09
1,Nauru,Male,April,AM,1,night,3,34,31,80.23,68441.85,193.77
2,San Marino,Female,March,PM,20,night,3,32,26,69.47,59785.94,236.5
3,Italy,Male,January,AM,2,night,3,37,29,74.15,54806.18,245.89
4,Iceland,Female,June,AM,3,night,3,29,35,68.37,73889.99,225.58


In [508]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Clicked on Ad, dtype: int64

#### * The label, "Clicked on Ad", is a categorical variable, which is something we need to note.

# Comparing Numerical Features with Label

### ANOVA 

In [518]:
data = []
cols = ['Features', 'F', 'P', 'Rejected']

In [519]:
for feature in nums:
    f_statistic, p = stats.f_oneway(X[feature], y)

    row = [feature, round(f_statistic, 4), round(p, 4), p < ALPHA]
    data.append(row)

In [520]:
chi_table = pd.DataFrame(data=data, columns=cols)
chi_table

Unnamed: 0,Features,F,P,Rejected
0,Daily Time Spent on Site,16536.1092,0.0,True
1,Area Income,16809.6947,0.0,True
2,Daily Internet Usage,16714.6652,0.0,True


# Comparing Categorical Features with Label

### Chi Squared Test

In [521]:
data = []
cols = ['Features', 'P', 'DOF', 'Chi2', 'Rejected']

In [522]:
for feature in cats: 
    cross_tab = pd.crosstab(df[feature], y).values
    chi2, p, dof, expected_values = stats.chi2_contingency(cross_tab)
    row = [feature, round(p, 4), dof, round(chi2, 4), p < ALPHA]
    data.append(row)

In [523]:
chi_table = pd.DataFrame(data=data, columns=cols)
chi_table

Unnamed: 0,Features,P,DOF,Chi2,Rejected
0,Country,0.7738,236,219.4127,False
1,Gender Labeled,0.2546,1,1.2979,False
2,Month,0.9173,6,2.0261,False
3,AM/PM,0.1641,1,1.9361,False
4,Hour,0.8869,23,15.2104,False
5,Day Time,0.3001,3,3.664,False
6,Line_Words,0.3054,3,3.6208,False
7,Line_Length,0.5386,34,32.5514,False
8,Age,0.0,42,276.5378,True
