In [1]:
import pandas as pd
import scipy.stats as stats
import numpy as np

In [2]:
df = pd.read_csv("clean_dataset.csv")

In [3]:
#H0: The mean of the average down latency == The mean of the derived average down latency  (Meaning the derived column is acceptable data)
#H1: The mean of the average down latency != The mean of the derived average down latency  (The derived data is not accurate enough to be used)

x = df['avg_lat_down_ms']
y = df['derived_lat_down']


z_stat = (x.mean() - y.mean()) / np.sqrt((x.std()**2 / len(x)) + (y.std()**2 / len(y)))

p_val = 2 * (1 - stats.norm.cdf(abs(z_stat)))

print(f'Z statistic: %.5f' %z_stat)
print(f'P value: %.50f' %p_val)

if p_val > 0.05:
    print('Accept Null Hypothesis')
else:
    print('Reject Null Hypothesis')

Z statistic: -1.25885
P value: 0.20808422624147770996216877392726019024848937988281
Accept Null Hypothesis


In [4]:
df = df.reindex(columns = df.columns.tolist() + ['cat_d_kbps']) #categorical down kbps

for i in df.index:

    if df.loc[i, 'avg_d_kbps'] < df['avg_d_kbps'].quantile(1/3):
        df.loc[i, 'cat_d_kbps'] = 'low'

    elif df.loc[i, 'avg_d_kbps'] < df['avg_d_kbps'].quantile(2/3):
        df.loc[i, 'cat_d_kbps'] = 'medium'
    
    else:
        df.loc[i, 'cat_d_kbps'] = 'High'

df['cat_d_kbps'] = pd.Categorical(df['cat_d_kbps'])

  df.loc[i, 'cat_d_kbps'] = 'low'


In [5]:
df.head(10)

Unnamed: 0,ID,avg_d_kbps,avg_u_kbps,avg_lat_ms,avg_lat_down_ms,avg_lat_up_ms,net_type,derived_lat_down,derived_lat_up,cat_d_kbps
0,0,9711,1489,21,208.0,,mobile,208.0,656.0,low
1,1,50073,18199,40,475.0,1954.0,mobile,475.0,1954.0,medium
2,2,21784,745,47,1493.0,2252.0,mobile,1493.0,2252.0,low
3,3,18159,1662,21,244.0,2067.0,mobile,244.0,2067.0,low
4,5,13498,3525,37,598.0,1023.0,mobile,598.0,1023.0,low
5,6,181395,3546,19,489.0,873.0,mobile,489.0,873.0,High
6,7,214,117,18,22.0,32.0,mobile,22.0,32.0,low
7,8,31785,8471,50,,,mobile,22.0,32.0,medium
8,9,12451,1797,30,,,mobile,22.0,32.0,low
9,10,22508,12590,29,469.0,1710.0,mobile,469.0,1710.0,low


In [6]:
#H0: There is no association between net_type and cat_d_kbps
#H1: There is an association between net_type and cat_d_kbps

contingency = pd.crosstab(df['net_type'], df['cat_d_kbps'])

chi2, p, _, _ = stats.chi2_contingency(contingency)

print(f"Chi-Square Statistic: %.5f" %chi2)
print(f"P-value: %.25f" %p)

if p > 0.05:
    print('Accept Null Hypothesis')
else:
    print('Reject Null Hypothesis')

Chi-Square Statistic: 2239.51803
P-value: 0.0000000000000000000000000
Reject Null Hypothesis
