# 3.1 Statistics in Python

# 3.1.1 Data representation and interaction

## libraries and setup

In [1]:
from scipy.stats import linregress
import numpy as np
import matplotlib.pyplot as plt
import pandas
import os

In [2]:
os.getcwd()

'/users/PAS2797/slenz/Assignment4-Stats-Scripts/Assignment4-Stats-Scripts'

# 3.1.1.2 Pandas df

In [3]:
data = pandas.read_csv('examples/brain_size.csv', sep=';', na_values=".")

In [4]:
data

Unnamed: 0.1,Unnamed: 0,Gender,FSIQ,VIQ,PIQ,Weight,Height,MRI_Count
0,1,Female,133,132,124,118.0,64.5,816932
1,2,Male,140,150,124,,72.5,1001121
2,3,Male,139,123,150,143.0,73.3,1038437
3,4,Male,133,129,128,172.0,68.8,965353
4,5,Female,137,132,134,147.0,65.0,951545
5,6,Female,99,90,110,146.0,69.0,928799
6,7,Female,138,136,131,138.0,64.5,991305
7,8,Female,92,90,98,175.0,66.0,854258
8,9,Male,89,93,84,134.0,66.3,904858
9,10,Male,133,114,147,172.0,68.8,955466


In [5]:
t = np.linspace(-6, 6, 20)
sin_t = np.sin(t)
cos_t = np.cos(t)

In [6]:
pandas.DataFrame({'t': t, 'sin': sin_t, 'cos': cos_t})  

Unnamed: 0,t,sin,cos
0,-6.0,0.279415,0.96017
1,-5.368421,0.792419,0.609977
2,-4.736842,0.999701,0.024451
3,-4.105263,0.821291,-0.570509
4,-3.473684,0.326021,-0.945363
5,-2.842105,-0.29503,-0.955488
6,-2.210526,-0.802257,-0.596979
7,-1.578947,-0.999967,-0.008151
8,-0.947368,-0.811882,0.583822
9,-0.315789,-0.310567,0.950551


In [7]:
data.shape

(40, 8)

In [8]:
data.columns 

Index(['Unnamed: 0', 'Gender', 'FSIQ', 'VIQ', 'PIQ', 'Weight', 'Height',
       'MRI_Count'],
      dtype='object')

In [9]:
# Simpler selector
data[data['Gender'] == 'Female']['VIQ'].mean()

np.float64(109.45)

In [10]:
data[data['Gender'] == 'Female']['VIQ'].mean()

np.float64(109.45)

In [11]:
data.describe()

Unnamed: 0.1,Unnamed: 0,FSIQ,VIQ,PIQ,Weight,Height,MRI_Count
count,40.0,40.0,40.0,40.0,38.0,39.0,40.0
mean,20.5,113.45,112.35,111.025,151.052632,68.525641,908755.0
std,11.690452,24.082071,23.616107,22.47105,23.478509,3.994649,72282.05
min,1.0,77.0,71.0,72.0,106.0,62.0,790619.0
25%,10.75,89.75,90.0,88.25,135.25,66.0,855918.5
50%,20.5,116.5,113.0,115.0,146.5,68.0,905399.0
75%,30.25,135.5,129.75,128.0,172.0,70.5,950078.0
max,40.0,144.0,150.0,150.0,192.0,77.0,1079549.0


In [12]:
groupby_gender = data.groupby('Gender')

In [13]:
for gender, value in groupby_gender['VIQ']:
    print((gender, value.mean()))

('Female', np.float64(109.45))
('Male', np.float64(115.25))


In [14]:
groupby_gender.mean()

Unnamed: 0_level_0,Unnamed: 0,FSIQ,VIQ,PIQ,Weight,Height,MRI_Count
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,19.65,111.9,109.45,110.45,137.2,65.765,862654.6
Male,21.35,115.0,115.25,111.6,166.444444,71.431579,954855.4


# Exercise 2

In [15]:
# Assuming `data` is your full DataFrame and 'Gender' is the column with gender info
# First, filter the DataFrame to include only males
males = data[data['Gender'] == 'Male']

# Now plot the scatter matrix
scatter_matrix(males[['PIQ', 'VIQ', 'FSIQ']], figsize=(8, 8))
plt.suptitle("Scatter Matrix for Males Only")
plt.show()

NameError: name 'scatter_matrix' is not defined

In [None]:
# 1. Mean VIQ for the full population
viq_mean = data['VIQ'].mean()

# 2. Number of males and females
gender_counts = data['Gender'].value_counts()

# 3. Average MRI count (log10‐transformed) for males and females
data['log_MRI'] = np.log10(data['MRI_Count'])
mean_log_mri_by_gender = data.groupby('Gender')['log_MRI'].mean()

print(f"Mean VIQ (all participants): {viq_mean:.2f}")
print("\nParticipants by gender:")
print(gender_counts)
print("\nMean log10(MRI count) by gender:")
print(mean_log_mri_by_gender)

## Plotting the data

In [None]:
from pandas import plotting


In [None]:
plotting.scatter_matrix(data[['Weight', 'Height', 'MRI_Count']])   


In [None]:
plotting.scatter_matrix(data[['PIQ', 'VIQ', 'FSIQ']])   

In [None]:
from scipy import stats

# 3.1.2 Hypothesis testing

## 3.1.2.1 Student's t-test: simple stats

### 1 sample t test

In [None]:
stats.ttest_1samp(data['VIQ'], 0)   

### 2 sample t test

In [None]:
female_viq = data[data['Gender'] == 'Female']['VIQ']
male_viq = data[data['Gender'] == 'Male']['VIQ']
stats.ttest_ind(female_viq, male_viq)   

## 3.1.2.2. Paired tests

In [None]:
stats.ttest_ind(data['FSIQ'], data['PIQ'])   

In [None]:
stats.ttest_rel(data['FSIQ'], data['PIQ'])   

In [None]:
stats.ttest_1samp(data['FSIQ'] - data['PIQ'], 0)   

In [None]:
stats.wilcoxon(data['FSIQ'], data['PIQ'])   

# 3.1.3 Linear models, mmultiple factors, and analysis of variance 

## 3.1.3.1 "formulas" to specify statistical models in Python

In [None]:
x = np.linspace(-5, 5, 20)
np.random.seed(1)
# normal distributed noise
y = -5 + 3*x + 4 * np.random.normal(size=x.shape)
# Create a data frame containing all the relevant variables
data = pandas.DataFrame({'x': x, 'y': y})

In [None]:
from statsmodels.formula.api import ols
model = ols("y ~ x", data).fit()

In [None]:
print(model.summary())  

In [None]:
data = pandas.read_csv('examples/brain_size.csv', sep=';', na_values=".")


In [None]:
model = ols("VIQ ~ Gender + 1", data).fit()
print(model.summary()) 

In [None]:
model = ols('VIQ ~ C(Gender)', data).fit()


In [None]:
data_fisq = pandas.DataFrame({'iq': data['FSIQ'], 'type': 'fsiq'})
data_piq = pandas.DataFrame({'iq': data['PIQ'], 'type': 'piq'})
data_long = pandas.concat((data_fisq, data_piq))
print(data_long) 

In [None]:
model = ols("iq ~ type", data_long).fit()
print(model.summary()) 

## 3.1.2.2. Paired tests: repeated measurements on the same individuals

In [None]:
stats.ttest_ind(data['FSIQ'], data['PIQ'])

In [None]:
stats.ttest_rel(data['FSIQ'], data['PIQ'])   

In [None]:
stats.ttest_1samp(data['FSIQ'] - data['PIQ'], 0)   


In [None]:
stats.wilcoxon(data['FSIQ'], data['PIQ'])   


## Exercise 3

# 3.1.3. Linear models, multiple factors, and analysis of variance

## 3.1.3.1. “formulas” to specify statistical models in Python

In [None]:
import numpy as np
x = np.linspace(-5, 5, 20)
np.random.seed(1)
# normal distributed noise
y = -5 + 3*x + 4 * np.random.normal(size=x.shape)
# Create a data frame containing all the relevant variables
data = pandas.DataFrame({'x': x, 'y': y})

In [None]:
from statsmodels.formula.api import ols
model = ols("y ~ x", data).fit()

In [None]:
print(model.summary()) 

# Exercise 

In [None]:
data = pandas.read_csv('examples/brain_size.csv', sep=';', na_values=".")
model = ols("VIQ ~ Gender + 1", data).fit()
print(model.summary())  

In [None]:
model = ols('VIQ ~ C(Gender)', data).fit()


In [None]:
data_fisq = pandas.DataFrame({'iq': data['FSIQ'], 'type': 'fsiq'})
data_piq = pandas.DataFrame({'iq': data['PIQ'], 'type': 'piq'})
data_long = pandas.concat((data_fisq, data_piq))
print(data_long)  

In [None]:
model = ols("iq ~ type", data_long).fit()
print(model.summary())  stats.ttest_ind(data['FSIQ'], data['PIQ'])   

In [None]:
stats.ttest_ind(data['FSIQ'], data['PIQ'])   

## 3.1.3.2. Multiple Regression: including multiple factors

In [None]:
data = pandas.read_csv('examples/iris.csv')
model = ols('sepal_width ~ name + petal_length', data).fit()
print(model.summary())  

## 3.1.3.3 Post-hoc hypothesis testing: analysis of variance (ANOVA)

In [None]:
print(model.f_test([0, 1, -1, 0]))  

# Exercise

# 3.1.4. More visualization: seaborn for statistical exploration

## 3.1.4.1. Pairplot: scatter matrices

In [None]:
import seaborn
seaborn.pairplot(data, vars=['WAGE', 'AGE', 'EDUCATION'],
                 kind='reg')  

In [None]:
seaborn.pairplot(data, vars=['WAGE', 'AGE', 'EDUCATION'],
                 kind='reg', hue='SEX')  

In [None]:
from matplotlib import pyplot as plt
plt.rcdefaults()

## 3.1.4.2. lmplot: plotting a univariate regression

In [None]:
seaborn.lmplot(y='WAGE', x='EDUCATION', data=data)  

# 3.1.5. Testing for interactions

In [None]:
result = sm.ols(formula='wage ~ education + gender + education * gender',
                data=data).fit()    
print(result.summary())  