### Import Data

In [1]:
pip install termcolor

Note: you may need to restart the kernel to use updated packages.


In [2]:
from termcolor import colored, cprint
import itertools
import numpy as np, statsmodels.stats.api as sms
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import matplotlib.ticker as ticker
from sklearn import preprocessing
from numpy import random

In [3]:
# Load the experiment data; the relative path means 'exp_data.csv' must sit in
# the kernel's working directory.
df = pd.read_csv('exp_data.csv')

In [4]:
# Preview the first rows (head() defaults to five) to sanity-check the load.
df.head()

Unnamed: 0,Uin,gender,age,device,has_interest_online,interestss_TVShows,interests_Travel,interests_Society,interests_Pets,interests_Natural,...,interests_Fashion,interests_Tech,interests_Entertainment,interests_Health,interests_Cartoon,interests_Finance,interests_Realestate,interests_Videogames,interests_Art,Group
0,137f94de7d47cb5bbb2bf3265558f5b8,2,41,2,1,0.796494,0.0,4.110483,0.0,0.0,...,0.0,0.0,0.0,2.261562,0.0,0.0,0.0,0.0,0.0,0
1,337d0ac1a7b86b8acfb107490cfcc37b,2,27,14,1,38.714942,0.0,369.54839,0.0,0.0,...,62.609615,120.487872,189.237181,291.606592,0.0,1125.921014,0.0,0.0,0.0,0
2,e7065ba7a831950c4711dd3fd3904ffe,2,37,2,1,0.0,0.0,6.930192,0.0,0.0,...,0.0,0.0,1.448855,5.62025,0.0,1.617699,0.0,0.0,1.293002,1
3,36498084c61db712ad18e0c6def8579e,1,33,1,1,3.179579,0.0,455.546589,3.614626,0.0,...,0.0,0.0,0.0,158.125969,0.0,0.0,0.0,0.0,0.0,2
4,400dac9d50c6ca15e26d67590a758426,2,62,0,0,0.0,0.0,5.914961,0.0,0.0,...,0.0,0.0,0.0,1.678389,0.0,0.0,1.146208,0.0,0.492551,0


In [5]:
# List the column names; the loops below pick columns from this index by position.
df.columns

Index(['Uin', 'gender', 'age', 'device', 'has_interest_online',
       'interestss_TVShows', 'interests_Travel', 'interests_Society',
       'interests_Pets', 'interests_Natural', 'interests_Cars',
       'interests_Foods', 'interests_Music', 'interests_Digital',
       'interests_Life', 'interests_Sports', 'interests_Reading',
       'interests_Childproducts', 'interests_Fashion', 'interests_Tech',
       'interests_Entertainment', 'interests_Health', 'interests_Cartoon',
       'interests_Finance', 'interests_Realestate', 'interests_Videogames',
       'interests_Art', 'Group'],
      dtype='object')

In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(10000, 28)

### Compare Age: Group 0 vs. Group 1

In [6]:
# Ages of the users assigned to Group 0 and Group 1, selected in a single
# .loc step (row mask + column label) instead of chained indexing.
age_0 = df.loc[df['Group'] == 0, 'age']
age_1 = df.loc[df['Group'] == 1, 'age']

In [7]:
# Two-sided, pooled-variance two-sample t-test of age between Group 0 and Group 1.
cm_age = sms.CompareMeans(
    sms.DescrStatsW(age_0),
    sms.DescrStatsW(age_1),
)
age_ttest_result = cm_age.ttest_ind(alternative='two-sided', usevar='pooled')
print(age_ttest_result)

(-0.2258780702654209, 0.8213030931691397, 6750.0)


### Multiple Testing with t-tests

In [8]:
def multi_cm(x, groups=(0, 1, 2), data=None):
    """Print pairwise pooled-variance t-tests of column ``x`` between groups.

    Parameters
    ----------
    x : str
        Name of the column whose group means are compared.
    groups : sequence, optional
        Values of the ``Group`` column to compare pairwise. The default
        ``(0, 1, 2)`` yields the pairs 0-vs-1, 0-vs-2 and 1-vs-2, in that order.
    data : pandas.DataFrame, optional
        Frame containing ``x`` and a ``Group`` column. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    None
        Output is printed: first the column name (red on yellow), then one
        line per pair holding whatever ``CompareMeans.ttest_ind`` returns.

    Notes
    -----
    Each test is run on its own; no adjustment for multiple comparisons is
    applied. A later cell in this notebook defines another function with the
    same name, which replaces this one once that cell has run.
    """
    frame = df if data is None else data
    # Summarise each group once; the same summary is reused by every pair
    # that group takes part in.
    stats_by_group = {
        group: sms.DescrStatsW(frame.loc[frame['Group'] == group, x])
        for group in groups
    }
    cprint(x, 'red', 'on_yellow')
    # combinations() preserves the order of `groups`, so the default prints
    # 0-vs-1, 0-vs-2, 1-vs-2.
    for first, second in itertools.combinations(groups, 2):
        comparison = sms.CompareMeans(stats_by_group[first], stats_by_group[second])
        print(comparison.ttest_ind(alternative='two-sided', usevar='pooled'))

In [9]:
var = df.columns
# Run the comparison for column positions 1-14; position 0 ('Uin') is skipped.
for position in range(1, 15):
    multi_cm(var[position])

[43m[31mgender[0m
(1.338773515763321, 0.18068950041259244, 6750.0)
(0.9053695111401618, 0.36530258109856395, 6626.0)
(-0.41833581824238186, 0.6757151790218867, 6618.0)
[43m[31mage[0m
(-0.2258780702654209, 0.8213030931691397, 6750.0)
(0.9497330944934246, 0.3422825277496915, 6626.0)
(1.1687121493699624, 0.24256172111339383, 6618.0)
[43m[31mdevice[0m
(0.10889071847020237, 0.9132924051645153, 6750.0)
(0.5134641434648958, 0.6076438065780314, 6626.0)
(0.41388040743233595, 0.6789751051271262, 6618.0)
[43m[31mhas_interest_online[0m
(-1.2845094396946113, 0.19900784380497932, 6750.0)
(-1.6808531965006905, 0.0928385650413649, 6626.0)
(-0.40792621277474284, 0.6833410435536809, 6618.0)
[43m[31minterestss_TVShows[0m
(-0.5234272052816967, 0.6006941789213049, 6750.0)
(0.23350129033532566, 0.8153793808681323, 6626.0)
(0.6736860794427328, 0.5005344408716177, 6618.0)
[43m[31minterests_Travel[0m
(0.06482087845276627, 0.9483185373224183, 6750.0)
(-0.5013152001110853, 0.6161659636359065, 6

### Multiple Testing with Confidence Intervals

In [10]:
def multi_cm(x, groups=(0, 1, 2), data=None, alpha=0.05):
    """Print pairwise z confidence intervals for the difference in means of ``x``.

    Parameters
    ----------
    x : str
        Name of the column whose group means are compared.
    groups : sequence, optional
        Values of the ``Group`` column to compare pairwise. The default
        ``(0, 1, 2)`` yields the pairs 0-vs-1, 0-vs-2 and 1-vs-2, in that order.
    data : pandas.DataFrame, optional
        Frame containing ``x`` and a ``Group`` column. When omitted, the
        notebook-level ``df`` is used.
    alpha : float, optional
        Significance level passed to ``zconfint_diff`` for each interval.
        Defaults to 0.05.

    Returns
    -------
    None
        Output is printed: first the column name (red on yellow), then one
        line per pair holding whatever ``CompareMeans.zconfint_diff`` returns.

    Notes
    -----
    ``alpha`` applies to each interval separately; no adjustment for multiple
    comparisons is made. This definition reuses the name of the t-test helper
    defined earlier in the notebook and replaces it once this cell has run.
    """
    frame = df if data is None else data
    # Summarise each group once; the same summary is reused by every pair
    # that group takes part in.
    stats_by_group = {
        group: sms.DescrStatsW(frame.loc[frame['Group'] == group, x])
        for group in groups
    }
    cprint(x, 'red', 'on_yellow')
    # combinations() preserves the order of `groups`, so the default prints
    # 0-vs-1, 0-vs-2, 1-vs-2.
    for first, second in itertools.combinations(groups, 2):
        comparison = sms.CompareMeans(stats_by_group[first], stats_by_group[second])
        print(comparison.zconfint_diff(alpha=alpha, alternative='two-sided', usevar='pooled'))

In [11]:
# Column positions 1-14 of `var`, i.e. the same columns as the t-test loop.
for position in range(1, 15):
    multi_cm(var[position])

[43m[31mgender[0m
(-0.00755761848672714, 0.04013358344703349)
(-0.012993350685755072, 0.03530293765051445)
(-0.029182923978018634, 0.018916545982471663)
[43m[31mage[0m
(-0.7642663066007759, 0.6063125348501205)
(-0.3598776790635133, 1.0365304029973976)
(-0.28252633538176, 1.1171328310662996)
[43m[31mdevice[0m
(-0.0779825235473426, 0.08715727980668687)
(-0.06048222514523573, 0.1034209839991136)
(-0.06306407475614212, 0.09682807735067572)
[43m[31mhas_interest_online[0m
(-0.03812695208476372, 0.007937504703966848)
(-0.04324119211221372, 0.0033149379949782252)
(-0.028259631892169006, 0.018522825155730387)
[43m[31minterestss_TVShows[0m
(-2.2203402873774665, 1.2843729567695452)
(-1.4700744002895791, 1.867724775833508)
(-1.2731471241684726, 2.606764830320323)
[43m[31minterests_Travel[0m
(-0.9902440329339546, 1.0579840235354163)
(-1.4302725552067663, 0.8476345702936463)
(-1.5066922381941543, 0.8563142626795726)
[43m[31minterests_Society[0m
(-18.002333285345074, 8.6799743278