In [189]:
import pandas as pd
from scipy import stats

In [190]:
ALPHA = 0.05

In [191]:
df = pd.read_csv('advertising.csv')
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


## Quick Feature Additions

In [192]:
# Add the # of words in each ad topic line.

In [193]:
def word_count(string):
    return string.count(" ") + 1

In [194]:
df['Line_Words'] = df['Ad Topic Line'].apply(lambda row: word_count(row))

In [195]:
# Add the length of each ad topic line.

In [196]:
def line_length(string):
    return len(string)

In [197]:
df['Line_Length'] = df['Ad Topic Line'].apply(lambda row: line_length(row))

In [198]:
# Change the Male category to 'Male' and 'Female'

In [199]:
def gender_label_encode(label):
    if label == 0:
        return 'Female'
    else: 
        return 'Male'

In [200]:
df['Gender Labeled'] = df['Male'].apply(lambda row: gender_label_encode(row))

In [201]:
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad,Line_Words,Line_Length,Gender Labeled
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0,3,34,Female
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0,3,34,Male
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0,3,32,Female
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0,3,37,Male
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0,3,29,Female


In [202]:
df.nunique()

Daily Time Spent on Site     900
Age                           43
Area Income                 1000
Daily Internet Usage         966
Ad Topic Line               1000
City                         969
Male                           2
Country                      237
Timestamp                   1000
Clicked on Ad                  2
Line_Words                     4
Line_Length                   35
Gender Labeled                 2
dtype: int64

In [203]:
cats = list(df.select_dtypes(include=['object']).columns)
nums = list(df.select_dtypes(exclude=['object']).columns)

nums.remove('Male') # Male fits better as a categorical variable.
nums.remove('Clicked on Ad') # This is the label.
nums.remove('Line_Words') # This is the label.
nums.remove('Line_Length') # This is the label.

cats.remove('Ad Topic Line') # This doesn't really give insights in terms of data science or data visualization.
cats.remove('Timestamp') # This is a unique value so this wouldn't give any insights either.
cats.remove('City') # This has a unique value for almost every row, so it won't give any new information.
cats.extend(['Line_Words', 'Line_Length'])

In [204]:
print(f'nums: {nums}')
print(f'cats: {cats}')

nums: ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage']
cats: ['Country', 'Gender Labeled', 'Line_Words', 'Line_Length']


In [205]:
feature_analysis_cols = cats + nums

X = df[feature_analysis_cols]
y = df['Clicked on Ad']

In [206]:
X.head()

Unnamed: 0,Country,Gender Labeled,Line_Words,Line_Length,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage
0,Tunisia,Female,3,34,68.95,35,61833.9,256.09
1,Nauru,Male,3,34,80.23,31,68441.85,193.77
2,San Marino,Female,3,32,69.47,26,59785.94,236.5
3,Italy,Male,3,37,74.15,29,54806.18,245.89
4,Iceland,Female,3,29,68.37,35,73889.99,225.58


In [207]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Clicked on Ad, dtype: int64

#### * The label, "Clicked on Ad", is a categorical variable, which is something we need to note.

# Comparing Numerical Features with Label

### ANOVA 

In [208]:
for feature in nums:
    print(f'feature: {feature}')
    f_statistic, p = stats.f_oneway(X[feature], y)
    print(f'f statistic: {f_statistic}, p value: {p}')
    
    if p > ALPHA:
        print(f'Fail to reject the null hypothesis that {feature} and "clicked on ad" are independent.')
    else:
        print(f'Reject the null hypothesis that {feature} and "clicked on ad" are independent.')
    
    print('')

feature: Daily Time Spent on Site
f statistic: 16536.10915247728, p value: 0.0
Reject the null hypothesis that Daily Time Spent on Site and "clicked on ad" are independent.

feature: Age
f statistic: 16282.908398952704, p value: 0.0
Reject the null hypothesis that Age and "clicked on ad" are independent.

feature: Area Income
f statistic: 16809.694701464217, p value: 0.0
Reject the null hypothesis that Area Income and "clicked on ad" are independent.

feature: Daily Internet Usage
f statistic: 16714.66520118457, p value: 0.0
Reject the null hypothesis that Daily Internet Usage and "clicked on ad" are independent.



# Comparing Categorical Features with Label

### Chi Squared Test

In [209]:
for feature in cats: 
    cross_tab = pd.crosstab(df[feature], y).values
    chi2, p, dof, expected_values = stats.chi2_contingency(cross_tab)

    print(f'{feature}, chi2: {chi2}, p: {p}, dof: {dof}')
    
    if p > ALPHA:
        print(f'Fail to reject the null hypothesis that {feature} and "clicked on ad" are independent.')
    else:
        print(f'Reject the null hypothesis that {feature} and "clicked on ad" are independent.')
    
    print('')

Country, chi2: 219.41269841269843, p: 0.7737565395501932, dof: 236
Fail to reject the null hypothesis that Country and "clicked on ad" are independent.

Gender Labeled, chi2: 1.2978741302440726, p: 0.2546019035670961, dof: 1
Fail to reject the null hypothesis that Gender Labeled and "clicked on ad" are independent.

Line_Words, chi2: 3.620752010974621, p: 0.3054353647080947, dof: 3
Fail to reject the null hypothesis that Line_Words and "clicked on ad" are independent.

Line_Length, chi2: 32.55139536578924, p: 0.5386298375271157, dof: 34
Fail to reject the null hypothesis that Line_Length and "clicked on ad" are independent.



# Comparing Numerical Features with Each Other

### Correlation

In [210]:
X = df[nums]
X.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage
0,68.95,35,61833.9,256.09
1,80.23,31,68441.85,193.77
2,69.47,26,59785.94,236.5
3,74.15,29,54806.18,245.89
4,68.37,35,73889.99,225.58


In [211]:
X.corr()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage
Daily Time Spent on Site,1.0,-0.331513,0.310954,0.518658
Age,-0.331513,1.0,-0.182605,-0.367209
Area Income,0.310954,-0.182605,1.0,0.337496
Daily Internet Usage,0.518658,-0.367209,0.337496,1.0
