In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Biserial, Point-Biserial, Partial and Semi-Partial Correlation

## Note:
### Point-biserial correlation coefficient: used when one of the variables is a discrete dichotomy.
### Biserial correlation coefficient: used when one of the variables is a continuous dichotomy.

In [2]:
# Load the data set used for the point-biserial and biserial examples.
# NOTE(review): this is an absolute, machine-specific path.
pbcorr_csv = '/home/atrides/Desktop/R/statistics_with_Python/06_Correlation/Data_Files/pbcorr.csv'
cat_data = pd.read_csv(pbcorr_csv)

### Point biserial

In [5]:
# Point-biserial correlation with scipy.stats.pointbiserialr.
# The documented signature is pointbiserialr(x, y), where x is the dichotomous
# variable and y the continuous one, so 'gender' is passed first and 'time' second.
# The coefficient and p-value are symmetric in the two arguments, so the
# printed result is the same as with the arguments the other way round.
print(st.pointbiserialr(cat_data['gender'], cat_data['time']))

PointbiserialrResult(correlation=0.378454249471916, pvalue=0.002867597554256142)


In [6]:
# The same point-biserial coefficient can be obtained from pandas by
# correlating the two columns directly with Series.corr.
time_values = cat_data['time']
gender_codes = cat_data['gender']
r = time_values.corr(gender_codes)
print(r)

0.37845424947191597


In [11]:
# Confidence interval for r: the sampling distribution of r is not normal,
# so r is first converted with Fisher's z-transformation (arctanh of r),
# and the interval is built on that transformed scale.

Z_r = np.arctanh(r)
print(Z_r)

0.3982542599811433


In [12]:
# 95% confidence interval for r via Fisher's z-transformation.
# Series.corr() excludes rows with a missing value in either column, so the
# sample size is counted on the same complete (time, gender) pairs rather
# than on every row of the frame. With no missing values this equals len(cat_data).
N = len(cat_data[['time', 'gender']].dropna())
z_crit = 1.96  # two-sided 95% critical value of the standard normal
SE_Zr = np.sqrt(1/(N-3))  # standard error of Fisher's z
# Bounds on the z scale ...
z_lcb, z_ucb = Z_r - z_crit*SE_Zr, Z_r + z_crit*SE_Zr
# ... transformed back to the correlation scale with tanh (inverse of arctanh).
(lcb, ucb) = np.tanh((z_lcb, z_ucb))
print((lcb, ucb))

(0.13776431357290136, 0.5769392253082637)


In [14]:
# Coefficient of determination: the square of the correlation coefficient r.

r_squared = r**2
print(r_squared)

0.1432276189433512


### Biserial

In [7]:
# Biserial correlation: r_b = (r_pb * np.sqrt(p*q)) / y
# p is the share of rows coded gender == 0 (named the female share here),
# q is the complementary share.
len_0 = int((cat_data['gender'] == 0).sum())
female_ratio = len_0 / len(cat_data)
male_ratio = 1 - female_ratio
p = female_ratio
q = male_ratio

In [8]:
# `y` is the denominator of the biserial formula r_b = r_pb * sqrt(p*q) / y.
# NOTE(review): 0.3977 is hard-coded; presumably it is the ordinate of the
# standard normal distribution at the point that splits it into the proportions
# p and q (read from a table) -- confirm, and update it if p and q change.
y = 0.3977

In [9]:
# Convert the point-biserial r into the biserial coefficient:
# r_b = (r_pb * sqrt(p*q)) / y, evaluated in that order.
scaled_r = r * np.sqrt(p * q)
r_biserial = scaled_r / y
print(r_biserial)

0.4747451640575249


### Partial

In [15]:
# Load the tab-separated exam anxiety data set used for the partial and
# semi-partial correlation examples.
# NOTE(review): this is an absolute, machine-specific path.
exam_anxiety_path = '/home/atrides/Desktop/R/statistics_with_Python/06_Correlation/Data_Files/Exam Anxiety.dat'
data = pd.read_csv(exam_anxiety_path, delimiter='\t')

In [16]:
# Keep only the three variables used in the partial / semi-partial analyses
# and preview the first rows.
analysis_columns = ['Revise', 'Exam', 'Anxiety']
data = data[analysis_columns]
print(data.head())

   Revise  Exam  Anxiety
0       4    40   86.298
1      11    65   88.716
2      27    80   70.178
3      53    80   61.312
4       4    40   89.522


In [18]:
import pingouin as pg

In [20]:
# Matrix of pairwise partial correlations between the three columns
# (DataFrame.pcorr is available once pingouin has been imported).
partial_corr_matrix = data.pcorr()
print(partial_corr_matrix)

           Revise      Exam   Anxiety
Revise   1.000000  0.132678 -0.648530
Exam     0.132678  1.000000 -0.246666
Anxiety -0.648530 -0.246666  1.000000


In [21]:
# Partial correlation between Exam and Anxiety with Revise as the covariate,
# computed with pingouin.
exam_anxiety_partial = pg.partial_corr(data=data, x='Exam', y='Anxiety', covar='Revise')
print(exam_anxiety_partial)

           n         r           CI95%        r2    adj_r2     p-val   BF10  \
pearson  103 -0.246666  [-0.42, -0.06]  0.060844  0.042061  0.012013  2.762   

            power  
pearson  0.715461  


### Semi-Partial Correlation

In [23]:
# Semi-partial correlation between Exam and Anxiety: Revise is supplied as
# x_covar, i.e. as a covariate for the x variable (Exam) only.
exam_anxiety_semipartial = pg.partial_corr(data=data, x='Exam', y='Anxiety', x_covar='Revise')
print(exam_anxiety_semipartial)

           n         r          CI95%        r2    adj_r2     p-val   BF10  \
pearson  103 -0.173889  [-0.36, 0.02]  0.030237  0.010842  0.078977  0.564   

            power  
pearson  0.422393  


#### · Partial correlation – quantifies the relationship between two variables while controlling for the effects of a third variable on both variables in the original correlation.

#### · Semi-partial correlation – quantifies the relationship between two variables while controlling for the effects of a third variable on only one of the variables in the original correlation.