# Return simple statistics about the dataset

In [1]:
import scipy.stats as stats

In [3]:
# Calculates r, r^2 and p values for two arrays, for example, predicted values and test values
# r^2 shows how strongly the two arrays are linearly correlated: 1 is perfect correlation, 0 is no correlation
# The sign of r shows whether the correlation is positive (direct) or negative (inverse)
# p is the probability of seeing a correlation at least this strong purely by chance if the arrays were actually uncorrelated
# p = 0.1 - 0.5 means the observed correlation is easily explained by chance
# p = 0.05 still leaves a fairly high chance that the correlation is random
# p = 0 means there is essentially no chance that the correlation happened randomly

def StatPearson(Y_predicted, Y_real):
    """Return the Pearson correlation summary for two sequences.

    Args:
        Y_predicted: First sequence of values (for example, model predictions).
        Y_real: Second sequence of values (for example, observed test values).

    Returns:
        A tuple ``(r, r2, p)``: the correlation coefficient reported by
        ``scipy.stats.pearsonr``, its square, and the accompanying p-value.
    """
    coefficient, p_value = stats.pearsonr(Y_predicted, Y_real)
    return coefficient, coefficient**2, p_value

## Perfect reverse correlation

In [4]:
# Materialize the values as a list: on Python 3 a bare range() is lazy and
# would be displayed as range(0, 10) rather than as its elements.
a = list(range(0, 10))

In [5]:
# Descending counterpart, materialized as a list so it displays its elements
# on Python 3 as well (a bare range() would show as range(10, 0, -1)).
b = list(range(10, 0, -1))

In [6]:
a, b

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

In [8]:
StatPearson(a, b)

(-1.0, 1.0, 0.0)

## Perfect correlation

In [10]:
# Two ascending sequences offset by 10. They are materialized as lists so the
# values are displayed on Python 3 too, where a bare range() is lazy.
a = list(range(0, 10))
b = list(range(10, 20))

In [11]:
a, b

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [12]:
StatPearson(a, b)

(1.0, 1.0, 0.0)

## No correlation

In [14]:
import random as rnd

In [18]:
# Draw ten pairs of random values; within each pair the first draw goes to
# `a` and the second to `b`, so both lists are filled in lockstep.
samples = [(rnd.random(), rnd.random()) for _ in range(0, 10)]
a = [first for first, _ in samples]
b = [second for _, second in samples]

In [19]:
a, b

([0.439107486297013,
  0.7191397622054254,
  0.44218235496087,
  0.07596894054440462,
  0.09696229533255452,
  0.8866485005663657,
  0.4367188695876548,
  0.3829288722577606,
  0.5851038787492477,
  0.8345491798587389],
 [0.4017323224949453,
  0.6339516376630545,
  0.5897068816186919,
  0.4575395825572981,
  0.888288328325715,
  0.6062387025328965,
  0.33894294754121745,
  0.8784810032700273,
  0.6659731574166528,
  0.227543649142389])

In [20]:
StatPearson(a, b)

(-0.31906323363743427, 0.10180134705917597, 0.36886676957603959)