# Lab | Inferential statistics - T-test & P-value


### 1

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
file_path = 'machine_new.txt'

# Open the file in read mode and read line by line
with open(file_path, 'r') as file:
    for line in file:
        # Process each line here
        print(line.strip())  # Using strip() to remove leading/trailing whitespaces


New machine	    Old machine
42.1	        42.7
41	            43.6
41.3	        43.8
41.8	        43.3
42.4	        42.5
42.8	        43.5
43.2	        43.1
42.3	        41.7
41.8	        44
42.7	        44.1


In [3]:


file_path = 'machine_new.txt'

# Read the data from the text file into a DataFrame
df = pd.read_csv(file_path, sep='\t', header=0, names=['New machine', 'Old machine'])

# Display the DataFrame
display(df)


Unnamed: 0,New machine,Old machine
0,42.1,42.7
1,41.0,43.6
2,41.3,43.8
3,41.8,43.3
4,42.4,42.5
5,42.8,43.5
6,43.2,43.1
7,42.3,41.7
8,41.8,44.0
9,42.7,44.1


$$H0: \mu_{a} = \mu_{a} → \mu_{N} - \mu_{O} = 0$$
$$H1: \mu_{b} \neq \mu_{b} → \mu_{N} - \mu_{O} \neq 0$$

Given the unequality on the alternative, we need to use a "two-sided" test.

In [5]:
from scipy import stats

# Perform one-tailed t-test
t_stat, p_value = stats.ttest_ind(df['New machine'], df['Old machine'], alternative='less')

# Define the significance level (alpha) and adjust for one-tailed test
alpha = 0.05
alpha_adjusted = alpha / 2

print("P_value is ", round(p_value, 5))
# Check if the p-value is less than the adjusted significance level
if p_value < alpha_adjusted:
    print("There is sufficient evidence to show that the new machine packs faster on average than the old machine.")
else:
    print("There is not sufficient evidence to show that the new machine packs faster on average than the old machine.")

P_value is  0.00161
There is sufficient evidence to show that the new machine packs faster on average than the old machine.


#### Another way to do

In [16]:
sample_a_mean, sample_b_mean = df['New machine'].mean(), df['Old machine'].mean()
sample_a_std, sample_b_std = df['New machine'].std(ddof=1), df['Old machine'].std(ddof=1)
sp = ( len(df['New machine']) - 1 ) * ( sample_a_std**2 ) +  ( len(df['Old machine']) - 1 ) * ( sample_b_std**2 )
sp /= ( len(df['New machine']) + len(df['Old machine']) - 2)
sp = np.sqrt(sp)
r = np.sqrt( (1/len(df['New machine'])) + (1/len(df['Old machine'])) )
t = ( sample_a_mean - sample_b_mean )/ (sp * r)

print("The mean of sample a is {:.2f}".format(sample_a_mean))
print("The mean of sample b is {:.2f}".format(sample_b_mean))
print("The standard deviation of sample a is {:.2f}".format(sample_a_std))
print("The standard deviation of sample b is {:.2f}".format(sample_b_std))
print("The ratio of the sample variances is {:.2f} which is bigger than 0.5 and smaller than 2".format(sample_a_std/sample_b_std))
print("The t statistic is: {:.2f}".format(t))

The mean of sample a is 42.14
The mean of sample b is 43.23
The standard deviation of sample a is 0.68
The standard deviation of sample b is 0.75
The ratio of the sample variances is 0.91 which is bigger than 0.5 and smaller than 2
The t statistic is: -3.40


In [18]:
st.t.cdf(t,df = len(df['New machine'])+len(df['Old machine'])-2)

0.0016055712503872579

### 2

In [8]:
data2 = pd.read_csv('pokemon.csv')
data2.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


b = 'defence',
a = 'attack'

$$H0: \mu_{b} = \mu_{a} → \mu_{b} - \mu_{a} = 0$$
$$H1: \mu_{b} \neq \mu_{a} → \mu_{b} - \mu_{a} \neq 0$$

In [11]:
sample =data2.sample(30, random_state = 1)
sample['difference'] = sample['Attack']-sample['Defense']
sample.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,difference
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False,26
510,460,Abomasnow,Grass,Ice,494,90,92,75,92,85,60,4,False,17
175,161,Sentret,Normal,,215,35,46,34,35,45,20,2,False,12
735,667,Litleo,Fire,Normal,369,62,50,58,73,54,72,6,False,-8
242,224,Octillery,Water,,480,75,105,75,105,75,45,2,False,30


In [12]:
sample_diff_mean, sample_diff_std = sample['difference'].mean(), sample['difference'].std(ddof=1)
sample_diff_mean, sample_diff_std

(2.8, 36.32079294288603)

In [13]:
t = sample_diff_mean / ( sample_diff_std / np.sqrt(sample.shape[0]) )
print("The mean of our samples differences is: {:.2f}".format(sample_diff_mean))
print("The standard deviation of our samples differences is: {:.2f}".format(sample_diff_std))
print("Our t statistics is: {:.2f}".format(t))

The mean of our samples differences is: 2.80
The standard deviation of our samples differences is: 36.32
Our t statistics is: 0.42


In [14]:
tc = st.t.ppf(1-(0.05/2),df= sample.shape[0] - 1)
tc

2.045229642132703

#### Our statistic is 0.42 while the critical value is 2.045. Then, as 0.42 < 2.045 we accept the H0.

With p_values.

In [15]:
1-st.t.cdf(t,df = sample.shape[0] - 1)

0.3379805871158601

he probability to see a t value as big as this one is for the t-Student distribution is:

$$0.3379805871158601 > 0.025$$

    we accept the H0. Therefore, the mean of 'attack' is the same as the mean of 'defence'

### Inferential statistics - ANOVA


### part1 

・State the null hypothesis: the mean of etching rate by each power is the same,

・State the alternate hypothesis: the mean of etching rate by each power is not the same,

・What is the significance level: 0.05

・What are the degrees of freedom of the model, error terms, and total DoF:

degrees of freedom of the model: 2
error terms:13
total DoF:14

### Part2

In [18]:
excel_file_path = 'anova_lab_data.xlsx'

# Read the data from the Excel file into a DataFrame
df = pd.read_excel(excel_file_path)

df

Unnamed: 0,Power,Etching Rate
0,160 W,5.43
1,180 W,6.24
2,200 W,8.79
3,160 W,5.71
4,180 W,6.71
5,200 W,9.2
6,160 W,6.22
7,180 W,5.98
8,200 W,7.9
9,160 W,6.01


In [24]:
print(df.columns)


Index(['Power ', 'Etching Rate'], dtype='object')


In [25]:
group_df = df.groupby('Power ')['Etching Rate'].agg(Power_mean='mean',Samples='size').reset_index()
group_df

Unnamed: 0,Power,Power_mean,Samples
0,160 W,5.792,5
1,180 W,6.238,5
2,200 W,8.318,5


In [27]:
# In our case, the groups are the cities, and the "y" is the "Rate"
S2t = 0
for power in df['Power '].unique():
    ng = len(df[df['Power '] == power])
    S2t  += ng * ( ( df[df['Power '] == power]['Etching Rate'].mean() - df['Etching Rate'].mean() ) ** 2)
S2t /= ( df['Power '].nunique() - 1 ) 

print("The value of S2t is {:.2f}".format(S2t))

The value of S2t is 9.09


In [31]:
S2E = 0
for power in df['Power '].unique():
    for rate in df[df['Power '] == power]['Etching Rate']:
        S2E += ( rate - df[df['Power '] == power]['Etching Rate'].mean() ) ** 2
S2E /= ( len(df) - df['Power '].nunique() )

print()
print("The value of S2E is {:.2f}".format(S2E))


The value of S2E is 0.25


In [32]:
F = S2t / S2E
print("The value of F is {:.2f}".format(F))

The value of F is 36.88


In [33]:
d1 = df['Power '].nunique() - 1
d2 = len(df) - df['Power '].nunique()

print("Number of degrees of freedom d1: ",d1)
print("Number of degrees of freedom d2: ",d2)

Number of degrees of freedom d1:  2
Number of degrees of freedom d2:  12


In [34]:
st.f.cdf(F,dfn=d1, dfd=d2)

0.9999924934157276

Thus, the probability to get any value smaller or equal to F

$$P(x \le F=36.88)= 0.9988$$

The opposite is given by

In [39]:
1 - st.f.cdf(F,dfn=d1, dfd=d2)

7.5065842723986975e-06

Therefore, the probability to get a value bigger than F is:

$$P(x > F) = 1 - P(x \le F) = 7.5065842723986975e-06 < 0.05$$

Therefore, we reject the H0

#### Conclusion: The mean of etching rate by each power is not the same,