In [1]:
import numpy as np
import pandas as pd

import pyspark




from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *


# Open (or reuse) a Spark session named "coords_and_links" on the standalone
# master at spark-master:7077. A default-constructed SparkConf is handed to
# the builder before the session is created.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("coords_and_links")
    .config(conf=pyspark.SparkConf())
    .getOrCreate()
)



# Load the input CSV: the first row supplies the column names and the column
# types are inferred from the data. Spark's built-in CSV reader is used here
# rather than the legacy external "com.databricks.spark.csv" data source
# name, which is only kept as a backward-compatibility alias.
df1 = spark.read.csv('romain.csv', header=True, inferSchema=True)


# dfsender: one row per distinct (id, latitude, longitude) triple seen on the
# sender side. Selecting and renaming first, then calling distinct(), yields
# the same rows as dropping duplicates on those three columns and projecting.
dfsender = df1.select(
    col("SENDER_SITE_LOC_ID").alias("id"),
    col("LAT_SENDER").alias("latitude"),
    col("LON_SENDER").alias("longitude"),
).distinct()

# dfreceiver: the same projection for the receiver side, using the same
# column names and column order so the two frames can be stacked.
dfreceiver = df1.select(
    col("RECEIVER_SITE_LOC_ID").alias("id"),
    col("LAT_RECEIVER").alias("latitude"),
    col("LON_RECEIVER").alias("longitude"),
).distinct()

# union() matches columns by position and keeps duplicate rows, so a site
# that occurs both as a sender and as a receiver would be listed twice.
# De-duplicate once more after stacking so each triple appears only once.
df_concat = dfsender.union(dfreceiver).distinct()
df_concat.show()



# Collect the results on the driver as pandas DataFrames. This is only
# reasonable when the data is not too big, which is why the preceding
# reduction is done in PySpark first.
dfpandas = df_concat.toPandas()
# Keep one row of the original table per (sender, receiver) pair of site ids.
links_spark = df1.dropDuplicates(['SENDER_SITE_LOC_ID', 'RECEIVER_SITE_LOC_ID'])
dataframe = links_spark.toPandas()

# Write both tables to CSV with pandas defaults (the row index is written too).
for frame, path in ((dfpandas, 'coords.csv'), (dataframe, 'links.csv')):
    frame.to_csv(path)

# Echo both tables into the cell output.
for frame in (dfpandas, dataframe):
    print(frame)

# Everything needed has been collected, so shut the Spark session down.
spark.stop()



+-----+--------+---------+
|   id|latitude|longitude|
+-----+--------+---------+
|15695|   40.33|    -3.68|
| 8627|   51.82|    10.38|
|15793|   44.34|    26.03|
|15801|   59.21|     17.9|
|15705|   59.88|    10.13|
| 7597|   55.75|    37.63|
|16021|   48.83|     2.35|
|12034|   28.39|   76.748|
|15710|  51.446|    -0.18|
|15572|    39.9|    34.73|
| 8415|   33.23|    -5.61|
|50899|   41.88|    12.48|
+-----+--------+---------+

       id  latitude  longitude
0   15695    40.330     -3.680
1    8627    51.820     10.380
2   15793    44.340     26.030
3   15801    59.210     17.900
4   15705    59.880     10.130
5    7597    55.750     37.630
6   16021    48.830      2.350
7   12034    28.390     76.748
8   15710    51.446     -0.180
9   15572    39.900     34.730
10   8415    33.230     -5.610
11  50899    41.880     12.480
   _c0  SENDER_SITE_LOC_ID  RECEIVER_SITE_LOC_ID  ITEMS_TOTAL_AMOUNT  \
0   48               15793                 12034            23928076   
1    4              