## 6 MERGING TWO MECHANISMS

In [1]:
import pandas as pd
import merge_data as md

In [2]:
# S3 locations of the pre-computed summary tables (parquet) used in this section.
VIC_DATA_PATH = "s3://social-research-cheating/summary-tables/emp-net/vic.parquet"
SIM_OBS_DATA_PATH = "s3://social-research-cheating/summary-tables/emp-net/simple_obs.parquet"
STR_OBS_DATA_PATH = "s3://social-research-cheating/summary-tables/emp-net/strict_obs.parquet"

# Load the `vic` table as a Spark DataFrame.
# NOTE(review): the second argument (7) is passed straight to md.get_transitions;
# presumably it is the maximum transition period in days -- confirm in merge_data.
vic_data = md.get_transitions(VIC_DATA_PATH, 7)
# createOrReplaceTempView is the supported replacement for registerTempTable
# (deprecated since Spark 2.0); it registers the same view name.
vic_data.createOrReplaceTempView("vic_data")
vic_data.show(5)
# Keep a pandas copy; the cells below pass it to md.merge_tables.
vic_df = vic_data.toPandas()

+--------------------+----------+----------+------+---------+-------------------+
|                  id|start_date|    m_date|period|total_exp|total_severe_damage|
+--------------------+----------+----------+------+---------+-------------------+
|account.393b6ee09...|2019-03-08|2019-03-01|     7|        2|                  1|
|account.3f888b606...|2019-03-10|2019-03-05|     5|        7|                  4|
|account.4b6c650bd...|2019-03-15|2019-03-11|     4|        2|                  0|
|account.73e50018a...|2019-03-27|2019-03-20|     7|        1|                  1|
|account.b3e7bbe4a...|2019-03-09|2019-03-04|     5|        1|                  0|
+--------------------+----------+----------+------+---------+-------------------+
only showing top 5 rows



### 6.1 MERGING TWO TABLES FOR EACH DEFINITION

In [3]:
# Simple definition: load the simple-observer table and convert it to pandas.
obs_data = md.get_transitions(SIM_OBS_DATA_PATH, 7)
# createOrReplaceTempView is the supported replacement for registerTempTable
# (deprecated since Spark 2.0); it registers the same view name.
obs_data.createOrReplaceTempView("obs_data")
obs_data.show(5)
obs_df = obs_data.toPandas()

# NOTE(review): the third argument (0) presumably selects the simple definition
# (the strict cell passes 1) -- confirm in merge_data.merge_tables.
frequency_table = md.merge_tables(vic_df, obs_df, 0)

print(frequency_table.head())
print(frequency_table.tail())
# NOTE: the strict-definition cell writes to this same path, so 'emp_data.csv'
# always holds whichever frequency table was written most recently.
frequency_table.to_csv('emp_data.csv', index=False)

+--------------------+----------+----------+------+---------+
|                  id|start_date|    m_date|period|total_obs|
+--------------------+----------+----------+------+---------+
|account.debf70388...|2019-03-03|2019-03-02|     1|        2|
|account.bed69e8d7...|2019-03-03|2019-03-02|     1|        4|
|account.14a9052f2...|2019-03-08|2019-03-01|     7|       27|
|account.516b36389...|2019-03-05|2019-03-03|     2|        2|
|account.89e89a88e...|2019-03-08|2019-03-02|     6|       10|
+--------------------+----------+----------+------+---------+
only showing top 5 rows

   total_obs  total_exp  freq
0          0          0   183
1          0          1   101
2          0          2    22
3          0          3     9
4          0          4     1
     total_obs  total_exp  freq
115         43          3     1
116         58          7     1
117         64          4     1
118         65          2     1
119        100          2     1


In [6]:
# Strict definition: load the strict-observer table and convert it to pandas.
obs_data = md.get_transitions(STR_OBS_DATA_PATH, 7)
# createOrReplaceTempView is the supported replacement for registerTempTable
# (deprecated since Spark 2.0); it replaces any existing "obs_data" view.
obs_data.createOrReplaceTempView("obs_data")
obs_data.show(5)
obs_df = obs_data.toPandas()

# NOTE(review): the third argument (1) presumably selects the strict definition
# (the simple cell passes 0) -- confirm in merge_data.merge_tables.
frequency_table = md.merge_tables(vic_df, obs_df, 1)

print(frequency_table.head())
print(frequency_table.tail())
# NOTE: the simple-definition cell writes to this same path, so 'emp_data.csv'
# always holds whichever frequency table was written most recently.
frequency_table.to_csv('emp_data.csv', index=False)

+--------------------+----------+----------+------+---------+
|                  id|start_date|    m_date|period|total_obs|
+--------------------+----------+----------+------+---------+
|account.14a9052f2...|2019-03-08|2019-03-01|     7|        8|
|account.89e89a88e...|2019-03-08|2019-03-02|     6|        5|
|account.8d20619cf...|2019-03-06|2019-03-05|     1|        1|
|account.ed34dc343...|2019-03-10|2019-03-03|     7|        0|
|account.0225f02f2...|2019-03-05|2019-03-03|     2|        0|
+--------------------+----------+----------+------+---------+
only showing top 5 rows

   total_obs  total_exp  freq
0          0          0   798
1          0          1    58
2          0          2     4
3          0          3     1
4          1          0   391
    total_obs  total_exp  freq
55         19          4     1
56         25          1     1
57         26          0     1
58         27          2     1
59         37          0     1


### 6.2 STORING RESULTS IN CSV FILES

In [4]:
# Export summary tables 1..5 with flag 0 (the flag used for the simple definition above).
# NOTE(review): presumably this writes rand_data_1.csv .. rand_data_5.csv -- confirm
# in merge_data.put_summary_table_in_csv_file.
for i in range(1, 6):
    md.put_summary_table_in_csv_file(i, 0)

# Rebuild the simple-definition frequency table here and store it under its own
# file name. 'emp_data.csv' is shared with the strict-definition cell, which
# precedes this cell in document order, so on a top-to-bottom run it would hold
# the strict table and RO_RE.csv would mix strict and simple counts.
simple_obs_df = md.get_transitions(SIM_OBS_DATA_PATH, 7).toPandas()
md.merge_tables(vic_df, simple_obs_df, 0).to_csv("emp_data_simple.csv", index=False)

# NOTE(review): assumes the first argument may be any CSV path -- confirm in
# merge_data.create_merged_csv_file.
simple_results = md.create_merged_csv_file("emp_data_simple.csv", "rand_data_1.csv", 5)
simple_results.to_csv("RO_RE.csv", index=False)

print(simple_results.head())
print(simple_results.tail())

   total_obs  total_exp      E     R1     R2     R3     R4     R5
0          0          0  183.0  179.0  161.0  167.0  172.0  179.0
1          0          1  101.0  100.0  106.0  100.0  108.0  106.0
2          0          2   22.0   27.0   23.0   28.0   15.0   23.0
3          0          3    9.0    5.0    9.0    5.0   10.0    5.0
4          0          4    1.0    1.0    2.0    0.0    3.0    4.0
     total_obs  total_exp    E   R1   R2   R3   R4   R5
201         99          2  0.0  1.0  0.0  0.0  0.0  0.0
202         99          3  0.0  0.0  0.0  0.0  1.0  0.0
203        100          2  1.0  0.0  0.0  0.0  0.0  0.0
204        101          2  0.0  0.0  1.0  0.0  0.0  0.0
205        102          2  0.0  0.0  0.0  1.0  0.0  0.0


In [7]:
# Export summary tables 1..5 with flag 1 (the flag used for the strict definition above).
# NOTE(review): presumably this writes rand_data_1.csv .. rand_data_5.csv -- confirm
# in merge_data.put_summary_table_in_csv_file.
for i in range(1, 6):
    md.put_summary_table_in_csv_file(i, 1)

# Rebuild the strict-definition frequency table here and store it under its own
# file name. 'emp_data.csv' is shared with the simple-definition cell, so its
# contents depend on which of the two cells ran last; reading it here would make
# SO_SE.csv depend on execution order.
strict_obs_df = md.get_transitions(STR_OBS_DATA_PATH, 7).toPandas()
md.merge_tables(vic_df, strict_obs_df, 1).to_csv("emp_data_strict.csv", index=False)

# NOTE(review): assumes the first argument may be any CSV path -- confirm in
# merge_data.create_merged_csv_file.
strict_results = md.create_merged_csv_file("emp_data_strict.csv", "rand_data_1.csv", 5)
strict_results.to_csv("SO_SE.csv", index=False)

print(strict_results.head())
print(strict_results.tail())

   total_obs  total_exp      E     R1     R2     R3     R4     R5
0          0          0  798.0  798.0  805.0  809.0  790.0  830.0
1          0          1   58.0   73.0   55.0   70.0   66.0   55.0
2          0          2    4.0    5.0    9.0    8.0   10.0   10.0
3          0          3    1.0    1.0    0.0    1.0    1.0    0.0
4          1          0  391.0  403.0  382.0  388.0  404.0  397.0
    total_obs  total_exp    E   R1   R2   R3   R4   R5
87         37          0  1.0  0.0  0.0  0.0  0.0  0.0
88         39          0  0.0  0.0  0.0  1.0  0.0  0.0
89         40          0  0.0  0.0  1.0  0.0  0.0  0.0
90         42          0  0.0  0.0  0.0  0.0  1.0  1.0
91         43          0  0.0  1.0  0.0  0.0  0.0  0.0
