06-combine-two-mechanisms.ipynb
======================

In this section, we merge the two summary tables produced in the previous sections so that the victimisation-based and observation-based mechanisms can be analysed together.

## Load packages and read tables.

In [None]:
from pyspark.sql.functions import col, lit, when
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load both summary tables from S3 and expose each one to Spark SQL under
# its own name. createOrReplaceTempView is used because registerTempTable
# has been deprecated since Spark 2.0; re-running this cell simply replaces
# the existing views.

# Read a summary table of the victimisation-based mechanism.
vic_tab = spark.read.parquet("s3://jinny-capstone-data-test/summary-tables/emp-net/vic_tab.parquet")
vic_tab.createOrReplaceTempView("vic_tab")

# Read a summary table of the observation-based mechanism.
obs_tab = spark.read.parquet("s3://jinny-capstone-data-test/summary-tables/emp-net/obs_tab.parquet")
obs_tab.createOrReplaceTempView("obs_tab")

## 1. Merge the two tables.

In [None]:
# Outer-join the two summary tables on the shared key columns so that rows
# present in only one of the tables are still kept in the result.
join_keys = ['id', 'start_date']
merged_tab = vic_tab.join(obs_tab, on=join_keys, how='outer')

# Preview the first 20 merged rows.
merged_tab.show(n=20)

# NOTE(review): the outer join leaves nulls where a row has no match in the
# other table. A zero-fill was sketched below with placeholder column names;
# confirm the intended columns before enabling it.
# merged_tab.fillna(0, subset=['a', 'b'])

## 2. Get the number of times the motif appears in the network.

In [None]:
# Find the number of cheaters who adopted cheating after being killed by cheating once and observing two cheaters.
# The merged table is exposed to Spark SQL with createOrReplaceTempView
# (registerTempTable has been deprecated since Spark 2.0).
merged_tab.createOrReplaceTempView("merged_tab")

# Rows where either column is null (left unmatched by the outer join) never
# satisfy the equality predicates, so they are excluded from the result.
motifs = spark.sql("SELECT * FROM merged_tab WHERE total_exp = 1 AND uniq_cheaters = 2")
motifs.show()
print(motifs.count())

## 3. Plot the distribution of experiences and observations.

In [None]:
# Scatter plot of total victimisation experiences against total observations.
#
# merged_tab is a Spark DataFrame and has no `scatter` method, so calling it
# raised AttributeError. Only the two plotted columns are collected to the
# driver as a pandas DataFrame, and matplotlib draws the plot from that.
# NOTE(review): toPandas() pulls every row onto the driver; confirm the merged
# table is small enough, or sample it first.
plot_df = merged_tab.select('total_exp', 'total_obs').toPandas()

fig, ax = plt.subplots()
# Rows with a null in either column (from the outer join) arrive as NaN.
ax.scatter(plot_df['total_exp'], plot_df['total_obs'])
ax.set_title('')
ax.set_xlabel('Number of total victimisation experiences')
ax.set_ylabel('Number of total observations')
fig.tight_layout()
plt.show()