In [760]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime

In [761]:
# Significance level: the ANOVA and chi-squared cells below flag a feature as
# "Rejected" when its p-value is smaller than this threshold.
ALPHA = 0.05

In [762]:
# Load the advertising dataset (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('advertising.csv')
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [763]:
df.nunique()

Daily Time Spent on Site     900
Age                           43
Area Income                 1000
Daily Internet Usage         966
Ad Topic Line               1000
City                         969
Male                           2
Country                      237
Timestamp                   1000
Clicked on Ad                  2
dtype: int64

# First Moment Feature Additions

In [764]:
# Add the number of words in each ad topic line.

def word_count(string):
    """Return the number of whitespace-separated words in ``string``.

    Splitting on whitespace (instead of counting single spaces) keeps the
    count correct when words are separated by several spaces or tabs, or when
    the text has leading/trailing whitespace. An empty or whitespace-only
    string contains 0 words.
    """
    return len(string.split())

# Pass the function directly; wrapping it in a lambda adds nothing.
df['Line_Words'] = df['Ad Topic Line'].apply(word_count)

In [765]:
# Add the length of each ad topic line.

def line_length(string):
    """Return how many characters ``string`` contains (spaces included)."""
    n_chars = len(string)
    return n_chars

# Pass the function directly; wrapping it in a lambda adds nothing.
df['Line_Length'] = df['Ad Topic Line'].apply(line_length)

In [766]:
# Change the Male category to 'Male' and 'Female'

def gender_label_encode(label):
    """Map the binary ``Male`` flag to a readable label.

    A value equal to 0 becomes 'Female'; every other value becomes 'Male'.
    """
    return 'Female' if label == 0 else 'Male'
    
# Pass the function directly; wrapping it in a lambda adds nothing.
df['Gender Labeled'] = df['Male'].apply(gender_label_encode)

In [767]:
# Add a column for the datetime object version of the timestamp

def convert_datetime(time):
    """Parse a 'YYYY-MM-DD HH:MM:SS' timestamp string into a ``datetime``."""
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    return datetime.strptime(time, timestamp_format)

# Pass the function directly; wrapping it in a lambda adds nothing.
df['Datetime'] = df['Timestamp'].apply(convert_datetime)

In [768]:
# Add the month column.

def extract_month(time):
    """Return the full month name (strftime ``%B``) of a datetime-like value."""
    # There are only 7 months in this dataset.
    month_name = time.strftime('%B')
    return month_name

# Pass the function directly; wrapping it in a lambda adds nothing.
df['Month'] = df['Datetime'].apply(extract_month)

In [769]:
# Add the AM/PM column.

def extract_am_pm(time):
    """Return the AM/PM marker (strftime ``%p``) of a datetime-like value."""
    meridiem = time.strftime('%p')
    return meridiem

# Pass the function directly; wrapping it in a lambda adds nothing.
df['AM/PM'] = df['Datetime'].apply(extract_am_pm)

In [770]:
# Add the hour column.

def extract_hour(time):
    """Return the hour of a datetime-like value as a zero-padded string.

    The result is text in 24-hour form (strftime ``%H``, e.g. '00' or '20'),
    not an integer.
    """
    hour_text = time.strftime('%H')
    return hour_text

# Pass the function directly; wrapping it in a lambda adds nothing.
df['Hour'] = df['Datetime'].apply(extract_hour)

In [771]:
# Add the day time column.

def day_time(hour):
    """Bucket an hour of the day into a named part of the day.

    ``hour`` may be an int or a numeric string; it is converted with ``int``.
    Buckets: 6-12 'morning', 13-16 'afternoon', 17-18 'evening', and every
    other value 'night'.
    """
    hour = int(hour)

    if 5 < hour <= 12:
        return 'morning'
    if 12 < hour <= 16:
        return 'afternoon'
    if 16 < hour <= 18:
        return 'evening'
    return 'night'

# Pass the function directly; wrapping it in a lambda adds nothing.
df['Day Time'] = df['Hour'].apply(day_time)

# Additional Feature Additions

In [772]:
# Ratio features, each an element-wise division of two existing columns.

# Ad-topic-line length relative to site time, age and word count.
df['Length/Site Time'] = df['Line_Length'].div(df['Daily Time Spent on Site'])
df['Length/Age'] = df['Line_Length'].div(df['Age'])
df['Length/Word'] = df['Line_Length'].div(df['Line_Words'])

# Share of daily internet usage that is spent on this site.
df['Site Time/Internet Time'] = df['Daily Time Spent on Site'].div(df['Daily Internet Usage'])

# Age relative to site time.
df['Age/Site Time'] = df['Age'].div(df['Daily Time Spent on Site'])

# Area income relative to site time and to ad-topic-line length.
df['Income/Site Time'] = df['Area Income'].div(df['Daily Time Spent on Site'])
df['Income/Length'] = df['Area Income'].div(df['Line_Length'])

<br>

<li>Length / Site Time - This feature is the length of the ad topic line relative to how long the customer spends on the website. It could reveal a helpful pattern: for example, the longer a customer is on the site, the shorter (or the longer) the ad topic line that yields better results for clicking the ad.
<li>Length / Age - I think this feature could be of use because there may be a relationship between the length of the ad topic line and the age of the consumer. The model may see that more characters per year of age tend to result in more successes (or vice versa) for the ad being clicked; either way, we can test whether this is statistically significant with respect to whether or not the ad is clicked.

<li>Length / Word - This feature is the average number of characters per word in the ad topic line. It might be useful because the model could see a pattern where fewer characters per word yield better results, or vice versa.

<li>Site Time / Internet Time - This feature may be useful because the model may see a pattern between a customer spending more time on the website when they are on the internet and whether or not they click on the ad. Possibly, the higher that ratio is, the more likely one is to click on the ad or vice versa, perhaps if they are curious about what this site is about when they first see an ad.
    
<li>Age / Site Time - This feature may see a pattern between age and the amount of time on the site. We did see in the visualization analysis that the older population spent less time on the internet and website and tends to click on the ad more than the younger population so this feature of dividing the age by the amount of time spent on the site may bring out a pattern that the model may notice.

<li>Income / Site Time - This feature may bring out a pattern between income and time spent on the site, which could yield better results in the model. Possibly, higher incomes go with longer site times, which may lead to more people clicking on the ad, or possibly the opposite. Since we don't know the product, we can't make a final conclusion, but this feature could add another aspect to the training data.

<li>Income/Length - This feature may be helpful because it may bring out a pattern between income and how long the ad topic line is in characters. Possibly, people with higher incomes click on ads with fewer characters, or vice versa. These features could be useful.

<br>

In [773]:
df.columns

Index(['Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage', 'Ad Topic Line', 'City', 'Male', 'Country',
       'Timestamp', 'Clicked on Ad', 'Line_Words', 'Line_Length',
       'Gender Labeled', 'Datetime', 'Month', 'AM/PM', 'Hour', 'Day Time',
       'Length/Site Time', 'Length/Age', 'Length/Word',
       'Site Time/Internet Time', 'Age/Site Time', 'Income/Site Time',
       'Income/Length'],
      dtype='object')

In [774]:
df.nunique()

Daily Time Spent on Site     900
Age                           43
Area Income                 1000
Daily Internet Usage         966
Ad Topic Line               1000
City                         969
Male                           2
Country                      237
Timestamp                   1000
Clicked on Ad                  2
Line_Words                     4
Line_Length                   35
Gender Labeled                 2
Datetime                    1000
Month                          7
AM/PM                          2
Hour                          24
Day Time                       4
Length/Site Time             991
Length/Age                   463
Length/Word                   59
Site Time/Internet Time     1000
Age/Site Time                997
Income/Site Time            1000
Income/Length               1000
dtype: int64

In [775]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,...,AM/PM,Hour,Day Time,Length/Site Time,Length/Age,Length/Word,Site Time/Internet Time,Age/Site Time,Income/Site Time,Income/Length
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0,...,AM,0,night,0.493111,0.971429,11.333333,0.269241,0.507614,896.793328,1818.644118
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0,...,AM,1,night,0.423782,1.096774,11.333333,0.414048,0.386389,853.070547,2012.995588
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0,...,PM,20,night,0.46063,1.230769,10.666667,0.293742,0.374262,860.600835,1868.310625
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0,...,AM,2,night,0.498989,1.275862,12.333333,0.301558,0.391099,739.125826,1481.248108
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0,...,AM,3,night,0.424163,0.828571,9.666667,0.303085,0.51192,1080.737019,2547.93069


In [776]:
# Split the columns by dtype: object (string) columns start out as categorical
# candidates, everything else as numerical candidates. The lists are then
# adjusted by hand below. Each remove() raises ValueError if the column is
# missing, so a renamed/dropped column is detected here.
cats = list(df.select_dtypes(include=['object']).columns)
nums = list(df.select_dtypes(exclude=['object']).columns)

nums.remove('Male') # Binary flag; its string-labelled copy 'Gender Labeled' is already among the object columns.
nums.remove('Clicked on Ad') # This is the label.
nums.remove('Line_Words') # Treated as categorical: re-added to cats below.
nums.remove('Line_Length') # Treated as categorical: re-added to cats below.
nums.remove('Age') # Treated as categorical: re-added to cats below.
nums.remove('Datetime') # This itself won't give any useful information, which is why we extracted certain information.

cats.remove('Ad Topic Line') # This doesn't really give insights in terms of data science or data visualization.
cats.remove('Timestamp') # This is a unique value so this wouldn't give any insights either.
cats.remove('City') # This has a unique value for almost every row, so it won't give any new information.
# Integer-valued columns that are analysed as categories (chi-squared test) rather than as numbers.
cats.extend(['Line_Words', 'Line_Length', 'Age'])

In [777]:
# Show both feature lists, one per line.
print(f'nums: {nums}', f'cats: {cats}', sep='\n')

nums: ['Daily Time Spent on Site', 'Area Income', 'Daily Internet Usage', 'Length/Site Time', 'Length/Age', 'Length/Word', 'Site Time/Internet Time', 'Age/Site Time', 'Income/Site Time', 'Income/Length']
cats: ['Country', 'Gender Labeled', 'Month', 'AM/PM', 'Hour', 'Day Time', 'Line_Words', 'Line_Length', 'Age']


In [778]:
# Candidate features: categorical columns first, then numerical ones.
feature_analysis_cols = [*cats, *nums]

# y is the binary label; X holds every candidate feature column.
y = df['Clicked on Ad']
X = df[feature_analysis_cols]

In [779]:
X.head()

Unnamed: 0,Country,Gender Labeled,Month,AM/PM,Hour,Day Time,Line_Words,Line_Length,Age,Daily Time Spent on Site,Area Income,Daily Internet Usage,Length/Site Time,Length/Age,Length/Word,Site Time/Internet Time,Age/Site Time,Income/Site Time,Income/Length
0,Tunisia,Female,March,AM,0,night,3,34,35,68.95,61833.9,256.09,0.493111,0.971429,11.333333,0.269241,0.507614,896.793328,1818.644118
1,Nauru,Male,April,AM,1,night,3,34,31,80.23,68441.85,193.77,0.423782,1.096774,11.333333,0.414048,0.386389,853.070547,2012.995588
2,San Marino,Female,March,PM,20,night,3,32,26,69.47,59785.94,236.5,0.46063,1.230769,10.666667,0.293742,0.374262,860.600835,1868.310625
3,Italy,Male,January,AM,2,night,3,37,29,74.15,54806.18,245.89,0.498989,1.275862,12.333333,0.301558,0.391099,739.125826,1481.248108
4,Iceland,Female,June,AM,3,night,3,29,35,68.37,73889.99,225.58,0.424163,0.828571,9.666667,0.303085,0.51192,1080.737019,2547.93069


In [780]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Clicked on Ad, dtype: int64

#### * The label, "Clicked on Ad", is a categorical variable, which is something we need to note.

# Comparing Numerical Features with Label

### ANOVA 

In [781]:
# Column headers and (initially empty) row accumulator for the ANOVA results.
cols = ['Features', 'F', 'P', 'Rejected']
data = []

In [782]:
def anova_helper(df, num_col, cat_col='Clicked on Ad'):
    """Run a one-way ANOVA of ``num_col`` across the categories of ``cat_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    num_col : str
        Name of the numerical column whose group means are compared.
    cat_col : str, default 'Clicked on Ad'
        Name of the column that defines the groups.

    Returns
    -------
    tuple
        ``(f, p)``: the F statistic and p-value from ``scipy.stats.f_oneway``.

    Notes
    -----
    Rows whose ``cat_col`` value is missing are left out (groupby drops NaN
    keys by default) instead of producing an empty sample.
    """
    # A single groupby pass replaces one boolean scan per category;
    # sort=False keeps the groups in order of first appearance.
    samples = [
        values.to_numpy()
        for _, values in df.groupby(cat_col, sort=False)[num_col]
    ]

    f, p = stats.f_oneway(*samples)
    return f, p

In [783]:
# Start from an empty list so that re-running this cell rebuilds the rows
# instead of appending duplicates to whatever `data` already holds.
data = []
for feature in nums:
    f_statistic, p = anova_helper(df, feature)
    # One row per numerical feature: name, F, p-value, and whether p < ALPHA.
    data.append([feature, f_statistic, p, p < ALPHA])

In [784]:
# Collect the per-feature ANOVA rows into a table and display it.
anova_table = pd.DataFrame(data, columns=cols)
anova_table

Unnamed: 0,Features,F,P,Rejected
0,Daily Time Spent on Site,1268.525161,5.877384e-180,True
1,Area Income,292.770617,9.449583e-58,True
2,Daily Internet Usage,1618.976456,3.893234e-211,True
3,Length/Site Time,520.966607,4.049322e-93,True
4,Length/Age,137.086299,9.223705e-30,True
5,Length/Word,2.237916,0.1349785,False
6,Site Time/Internet Time,9.4433,0.002176732,True
7,Age/Site Time,1028.235673,1.1939759999999999e-155,True
8,Income/Site Time,82.385634,5.8389869999999995e-19,True
9,Income/Length,205.1101,1.897126e-42,True


# Comparing Categorical Features with Label

### Chi Squared Test

In [785]:
# Column headers and (initially empty) row accumulator for the chi-squared results.
cols = ['Features', 'P', 'DOF', 'Chi2', 'Rejected']
data = []

In [786]:
# Start from an empty list so that re-running this cell rebuilds the rows
# instead of appending duplicates to whatever `data` already holds.
data = []
for feature in cats:
    # Contingency table of feature category (rows) against the label (columns).
    cross_tab = pd.crosstab(df[feature], y).values
    # NOTE(review): features with many categories (e.g. Country) give sparse
    # tables with small expected counts; read those p-values with caution.
    chi2, p, dof, expected_values = stats.chi2_contingency(cross_tab)
    data.append([feature, p, dof, chi2, p < ALPHA])

In [787]:
# Collect the per-feature chi-squared rows into a table and display it.
chi_table = pd.DataFrame(data, columns=cols)
chi_table

Unnamed: 0,Features,P,DOF,Chi2,Rejected
0,Country,0.7737565,236,219.412698,False
1,Gender Labeled,0.2546019,1,1.297874,False
2,Month,0.9172817,6,2.02611,False
3,AM/PM,0.1640959,1,1.93607,False
4,Hour,0.8869351,23,15.210383,False
5,Day Time,0.3001117,3,3.663957,False
6,Line_Words,0.3054354,3,3.620752,False
7,Line_Length,0.5386298,34,32.551395,False
8,Age,2.7937559999999998e-36,42,276.537811,True


<h5>These are the original statistically significant features that will be in one dataset.</h5>
<ul>   
    <li>Daily Time Spent on Site
    <li>Area Income	
    <li>Daily Internet Usage
    <li>Age            
</ul>   

<h5>These are the statistically significant features, including the additional engineered features, that will be saved to a separate CSV data file.</h5>
<ul>   
    <li>Daily Time Spent on Site
    <li>Area Income	
    <li>Daily Internet Usage
    <li>Age    
    <li>Length/Site Time
    <li>Length/Age
	<li>Site Time/Internet Time
	<li>Age/Site Time	
	<li>Income/Site Time
    <li>Income/Length	
</ul>    

<h4>* With these two datasets, I am going to run each Machine Learning model on both of them, compare the results using z/t tests and other measurements, and in the end find the most accurate model.</h4>