## Example 3 - Permutation Test

In [5]:
import pandas as pd
import numpy as np

In [13]:
# findspark.init() runs before `import pyspark` in the following cell; keep this order.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable - confirm against the findspark documentation.
import findspark
findspark.init()

In [14]:
import pyspark

# Reuse an already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same process raises an error,
# which makes the plain constructor call fail on any re-run of the cell.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("MyAppName")
)

In [15]:
from pyspark.sql import SparkSession
# Build the SparkSession on top of the SparkContext `sc` created in the previous cell.
spark = SparkSession(sc)

In [2]:
# Load the data set from "mammals.csv" (path is relative to the working directory).
data = pd.read_csv("mammals.csv")

In [3]:
# Preview the first rows to check that the `body` and `brain` columns loaded as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,body,brain
0,Arctic fox,3.385,44.5
1,Owl monkey,0.48,15.5
2,Mountain beaver,1.35,8.1
3,Cow,465.0,423.0
4,Grey wolf,36.33,119.5


In [6]:
# Correlation between body and brain as observed in the unshuffled data:
# the off-diagonal entry of the 2x2 matrix returned by np.corrcoef.
observed_correlation = np.corrcoef(data["body"], data["brain"])[0, 1]
print(observed_correlation)

0.9341638423233545


In [8]:
def simulate(x=None, y=None):
    """Draw one correlation under the permutation null hypothesis.

    The values of ``y`` are randomly permuted (on a copy; ``y`` itself is left
    untouched) and the correlation coefficient between ``x`` and the permuted
    values is returned.

    Parameters
    ----------
    x, y : array-like, optional
        Paired samples of equal length. When omitted, they default to the
        ``body`` and ``brain`` columns of the notebook-level ``data`` frame,
        so existing calls of the form ``simulate()`` behave as before.

    Returns
    -------
    float
        Entry ``[0, 1]`` of ``np.corrcoef(x, permuted_y)``.
    """
    # Resolve the defaults at call time so the function always sees the
    # current notebook-level `data`.
    if x is None:
        x = data.body
    if y is None:
        y = data.brain
    # Uses the global NumPy RNG, so np.random.seed(...) controls the draw.
    return np.corrcoef(x, np.random.permutation(y))[0, 1]

In [9]:
n_sim = 10000

# Seed the global NumPy RNG so a fresh "Restart & Run All" reproduces the same
# permutations (and therefore the same p-value).
np.random.seed(2024)
simulation = [simulate() for _ in range(n_sim)]

# One-sided p-value: the share of permuted correlations that are at least as
# large as the observed one. Taking the mean ties the denominator to the number
# of values actually simulated. The estimate has a resolution of 1 / n_sim, so
# a printed 0.0 means that no permutation reached the observed correlation,
# not that the true p-value is exactly zero.
p_value = np.mean([observed_correlation <= random_correlation
                   for random_correlation in simulation])
print(p_value)

0.0


So far, all our simulations were carried out on a single core of our computer. We can use Spark to
distribute this calculation across multiple cores and, potentially, across different machines in a Spark cluster.

In [10]:
# Work split for the Spark run: n_batch tasks, each drawing n_sim_per_batch permutations.
n_batch, n_sim_per_batch = 10, 1000

In [11]:
def simulate_batch(seed):
    """Run one batch of permutation simulations.

    The global NumPy RNG is seeded with ``seed`` first, then ``simulate()`` is
    called ``n_sim_per_batch`` times. The results are returned as a list in
    the order they were drawn.
    """
    np.random.seed(seed)
    batch = []
    for _ in range(n_sim_per_batch):
        batch.append(simulate())
    return batch

In [16]:
# Each batch index doubles as the seed passed to simulate_batch; collect()
# returns one list of correlations per batch index.
simulation = (
    sc.parallelize(range(n_batch))
    .map(simulate_batch)
    .collect()
)

In [17]:
# Flatten the per-batch lists returned by Spark into one list of correlations.
batch_correlations = [random_correlation
                      for block in simulation
                      for random_correlation in block]

# Divide by the number of values actually collected rather than by `n_sim`,
# which belongs to the single-core run above and only equals
# n_batch * n_sim_per_batch by coincidence of the chosen constants.
p_value = (sum(observed_correlation <= random_correlation
               for random_correlation in batch_correlations)
           / len(batch_correlations))
print(p_value)

0.0
