In [8]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
import math

In [9]:
df = pd.read_csv('../data/train.csv')

## 95% Confidence Interval for Mean CPM of the population 

In [10]:
np.random.seed(100)

sample_size = 10000
sample = np.random.choice(df['CPM'], size = sample_size)
sample_mean = sample.mean()

#t_critical = stats.t.ppf(q = 0.975, df = 24)
z_critical = stats.norm.ppf(q = 0.975)
print("z_critical: {}".format(z_critical))
sample_stddev = sample.std(ddof = 1)
std_err = sample_stddev/math.sqrt(sample_size)
margin_of_error = z_critical * std_err

confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)
print("confidence interval:{} ".format(confidence_interval))

z_critical: 1.959963984540054
confidence interval:(2.015394037678109, 2.2634866987696025) 


In [11]:
# confidence intval with z distribution
stats.norm.interval(alpha = 0.95, loc = sample_mean, scale = std_err)

(2.015394037678109, 2.2634866987696025)

In [12]:
df['CPM'].mean()

2.1001980853085906

In [13]:
df.head()

Unnamed: 0,Line Item ID,Date,App/URL ID,ISP or Carrier ID,Device Type,Exchange ID,Operating System,Browser,Creative Size,Advertiser Currency,Impressions,IO_ID,CPM
0,2,17-08-2020,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105
1,2,17-08-2020,1362605575.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125
2,2,17-08-2020,20303819748.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02
3,2,17-08-2020,20303819748.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035
4,2,17-08-2020,20303819748.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022


## Are the mean Browser CPMs different from one another?
    H0: The means of the distribution of CPM of each browser are equal
    Ha: The means are different

#### Compare the mean CPM of "Opera" and "Chrome"

In [26]:
# Testing to prove that the the mean CPM of one browser is not equal mean CPM of another browser
# Take random samples of any 2 Browsers and perform Independent Sample Test.
from scipy.stats import ttest_ind_from_stats
np.random.seed(100)

browser_1 = 'Chrome'
browser_2 = 'Opera'

size_1 = 100
sample_1 = np.random.choice(df[df['Browser']==browser_1]['CPM'], size_1)
stddev_1 = sample_1.std()
x1_bar = sample_1.mean()

size_2 = 100
sample_2 = np.random.choice(df[df['Browser']==browser_2]['CPM'], size_2)
stddev_2 = sample_2.std()
x2_bar = sample_2.mean()


z_statistic, p_value = ttest_ind_from_stats(x1_bar, stddev_1, size_1, 
                                            x2_bar, stddev_2, size_2, equal_var=False)

print("z_statistic: {:.3f}".format(z_statistic))
print("pvalue: {:.3f}".format(p_value))

alpha = 0.05
if(p_value <= alpha):
    print("Reject H0")
else:
    print("Cannot Reject H0")

z_statistic: 0.240
pvalue: 0.811
Cannot Reject H0


    The mean CPM of "Chrome" and "Opera" are statistically similar

#### Compare the mean CPM of "Firefox" and "Chrome"

In [27]:
# Testing to prove that the the mean CPM of one browser is not equal mean CPM of another browser
# Take random samples of any 2 Browsers and perform Independent Sample Test.
from scipy.stats import ttest_ind_from_stats
np.random.seed(100)

browser_1 = 'Chrome'
browser_2 = 'Firefox'

size_1 = 100
sample_1 = np.random.choice(df[df['Browser']==browser_1]['CPM'], size_1)
stddev_1 = sample_1.std()
x1_bar = sample_1.mean()

size_2 = 100
sample_2 = np.random.choice(df[df['Browser']==browser_2]['CPM'], size_2)
stddev_2 = sample_2.std()
x2_bar = sample_2.mean()


z_statistic, p_value = ttest_ind_from_stats(x1_bar, stddev_1, size_1, 
                                            x2_bar, stddev_2, size_2, equal_var=False)

print("z_statistic: {:.3f}".format(z_statistic))
print("pvalue: {:.3f}".format(p_value))

alpha = 0.05
if(p_value < alpha):
    print("Reject H0")
else:
    print("Cannot Reject H0")

z_statistic: 1.556
pvalue: 0.122
Cannot Reject H0


    The mean CPM of "Firefox" and "Chrome" are statistically similar

#### Compare the mean CPM of "Chrome" and "Microsoft Edge"

In [28]:
# Testing to prove that the the mean CPM of one browser is not equal mean CPM of another browser
# Take random samples of any 2 Browsers and perform Independent Sample Test.
from scipy.stats import ttest_ind_from_stats
np.random.seed(100)

browser_1 = 'Chrome'
browser_2 = 'Microsoft Edge'

size_1 = 100
sample_1 = np.random.choice(df[df['Browser']==browser_1]['CPM'], size_1)
stddev_1 = sample_1.std()
x1_bar = sample_1.mean()

size_2 = 100
sample_2 = np.random.choice(df[df['Browser']==browser_2]['CPM'], size_2)
stddev_2 = sample_2.std()
x2_bar = sample_2.mean()


z_statistic, p_value = ttest_ind_from_stats(x1_bar, stddev_1, size_1, 
                                            x2_bar, stddev_2, size_2, equal_var=False)

print("z_statistic: {:.3f}".format(z_statistic))
print("pvalue: {:.3f}".format(p_value))

alpha = 0.05
if(p_value < alpha):
    print("Reject H0")
else:
    print("Cannot Reject H0")

z_statistic: 3.771
pvalue: 0.000
Reject H0


    The mean CPM of Microsoft Edge and Chrome are significantly different

## Analysing mean CPM of various device types

    H0: Mean CPM of TV is equal to mean CPM of Tablet.
    Ha: Mean CPM of TV is not different than mean CPM of Tablet.

    scipy always gives the two tail statistic as signed. This means that given p and t values 
    from a two-tailed test, you would 
    - Reject H0 of a lower tail test when p/2 < alpha and t > 0, 
    - Reject H0 of of Upper tail test when p/2 < alpha and t < 0.

In [38]:
df.groupby('Device Type', as_index = False).mean()[['Device Type','CPM']].sort_values('CPM',ascending = False)

Unnamed: 0,Device Type,CPM
2,Smart Phone,2.569455
3,Tablet,1.916848
1,Desktop,1.367791
0,Connected TV,0.776602


In [41]:
# Testing to see if the Mean CPM of Smart Phone and TV are different
np.random.seed(100)

device_1 = 'Connected TV'
device_2 = 'Smart Phone'

size_1 = 30
sample_1 = np.random.choice(df[df['Device Type'] == device_1]['CPM'], size_1)
xbar_1 = sample_1.mean()
stddev_1 = sample_1.std()

size_2 = 40
sample_1 = np.random.choice(df[df['Device Type'] == device_2]['CPM'], size_2)
xbar_2 = sample_2.mean()
stddev_2 = sample_2.std()

z_statistic, p_value = ttest_ind_from_stats(xbar_1, stddev_1, size_1,
                                            xbar_2, stddev_2, size_2,
                                           equal_var=False)

print("z_statistic: {}".format(z_statistic))
print("p_value: {}".format(p_value))

alpha = 0.05
if(p_value/2 < alpha):
    print("Reject H0")
else:
    print("Cannot Reject H0")

z_statistic: -1.533695656846883
p_value: 0.13186836937351673
Cannot Reject H0


    We cannot Reject H0: This means that statistically speaking the Mean CPM of SmartPhone is same as the mean CPM of TV.

## CHI-squared goodness of fit test

### Does the distribution of browsers in Desktops differ from the rest of the population
    H0: Distribution of App/URL ID in Desktops is the same as the rest of the population
    Ha: Distribution of App/URL ID in Desktops is NOT the same as the rest of the population

In [72]:
# Get expected and Observed distribution
import numpy as np
import pandas as pd
import scipy.stats as stats

population = df['App/URL ID']
desktop = df[df['Device Type'] == 'Desktop']['App/URL ID']

desktop_table = pd.DataFrame(desktop.value_counts())
population_table = pd.DataFrame(population.value_counts())
population_table = population_table.loc[desktop_table.index]

observed = desktop_table
population_ratios = pd.DataFrame(population_table/population_table.shape[0])
expected = population_ratios * len(desktop)


In [64]:
# chi-statistic formula
chi_squared_stat = (((observed-expected)**2)/expected).sum()
print(chi_squared_stat)

App/URL ID    3.196655e+07
dtype: float64


In [71]:
# get critical chi value and Reject Hypothesis
chi_crit = stats.chi2.ppf(q = 0.95, df = expected.shape[0] - 1)
print("Critical Value: {}".format(crit))

chi_statistic = stats.chisquare(f_obs=observed, f_exp=expected).statistic[0]
if(chi_statistic > chi_crit):
    print("Reject H0")
else:
    print("Cannot Reject H0")


Critical Value: 9618.566264974741
Reject H0


    Since the Chi-statistic is greated than the critical Chi value we can reject the H0 hypothesis that the observed and expected distribution of App/IDa are same

### Does the distribution of OS in Smartphones differ from the rest of the population
    H0: Distribution of Operating System in Smart Phones is the same as the rest of the population
    Ha: Distribution of Operating System in Smart Phones is NOT the same as the rest of the population

In [76]:
# Get expected and Observed distribution
import numpy as np
import pandas as pd
import scipy.stats as stats

population = df['Operating System']
phones = df[df['Device Type'] == 'Smart Phone']['Operating System']

phones_table = pd.DataFrame(phones.value_counts())
population_table = pd.DataFrame(population.value_counts())
population_table = population_table.loc[phones_table.index]

observed = phones_table
population_ratios = pd.DataFrame(population_table/population_table.shape[0])
expected = population_ratios * len(phones)


In [77]:
# chi-statistic formula
chi_squared_stat = (((observed-expected)**2)/expected).sum()
print(chi_squared_stat)

Operating System    1.277273e+10
dtype: float64


In [80]:
# get critical chi value and Reject Hypothesis
chi_crit = stats.chi2.ppf(q = 0.95, df = expected.shape[0] - 1)
print("Critical Value: {}".format(crit))

chi_statistic = stats.chisquare(f_obs=observed, f_exp=expected).statistic[0]
print("Chi-Statistic: {}".format(chi_statistic))

if(chi_statistic > chi_crit):
    print("Reject H0")
else:
    print("Cannot Reject H0")


Critical Value: 9618.566264974741
Chi-Statistic: 12772733650.000673
Reject H0


    We Reject Null hypothesis that the distribution of OSes in phones is same as the population

In [73]:
df.head()

Unnamed: 0,Line Item ID,Date,App/URL ID,ISP or Carrier ID,Device Type,Exchange ID,Operating System,Browser,Creative Size,Advertiser Currency,Impressions,IO_ID,CPM
0,2,17-08-2020,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105
1,2,17-08-2020,1362605575.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125
2,2,17-08-2020,20303819748.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02
3,2,17-08-2020,20303819748.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035
4,2,17-08-2020,20303819748.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022
