# Rows and Columns

## Spark Session

In [2]:
from pyspark.sql import SparkSession
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .master("local")           # run Spark on this machine instead of a cluster
    .appName("Spark session")  # application name registered with Spark
    .getOrCreate()
)

## Read Data Frame

In [3]:
# Load the Parquet file into a DataFrame.
df = spark.read.parquet(
    'data/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet'
)

# Preview the contents; show() without arguments prints up to 20 rows.
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

## Columns

In [4]:
# Column names of the DataFrame, as a plain Python list of strings.
df.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [5]:
# Column names paired with their Spark type names, as a list of (name, type) tuples.
df.dtypes

[('DEST_COUNTRY_NAME', 'string'),
 ('ORIGIN_COUNTRY_NAME', 'string'),
 ('count', 'bigint')]

In [6]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# describe() returns a DataFrame, hence the show(); string columns report
# null for mean/stddev and use string ordering for min/max.
df.describe().show()

+-------+-----------------+-------------------+------------------+
|summary|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|             count|
+-------+-----------------+-------------------+------------------+
|  count|              255|                255|               255|
|   mean|             null|               null| 1655.956862745098|
| stddev|             null|               null|21801.481975969557|
|    min|      Afghanistan|        Afghanistan|                 1|
|    max|          Vietnam|            Vietnam|            348113|
+-------+-----------------+-------------------+------------------+



In [8]:
# Print only the first 5 rows.
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [11]:
# Project the DataFrame down to the single 'count' column and print 5 rows.
df.select('count').show(5)

+-----+
|count|
+-----+
|    1|
|  264|
|   69|
|   24|
|    1|
+-----+
only showing top 5 rows



In [14]:
# Keep only the rows whose 'count' value is greater than 10.
# The column is accessed as df['count'] because the attribute df.count is the
# DataFrame.count() method, not this column.
df.filter(df['count'] > 10).show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
+-----------------+-------------------+-----+
only showing top 5 rows



In [15]:
# Rename a column: withColumnRenamed(existing_name, new_name).
# The renamed DataFrame is only displayed, not assigned, so `df` keeps the
# original 'DEST_COUNTRY_NAME' column name.
df.withColumnRenamed('DEST_COUNTRY_NAME', 'dest_country_name').show(5)

+-----------------+-------------------+-----+
|dest_country_name|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [16]:
# Create a new column from an expression on an existing one: here 'count'
# divided by the float literal 1000., which yields fractional values.
# The result is only displayed, not assigned, so `df` is unchanged.
# NOTE(review): the new name contains '.' and '*'; such a name presumably needs
# backtick quoting if it is referenced later - a plain identifier would be safer.
df.withColumn('0.001*count', df['count']/1000.).show(5)

+--------------------+-------------------+-----+-----------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|0.001*count|
+--------------------+-------------------+-----+-----------+
|       United States|            Romania|    1|      0.001|
|       United States|            Ireland|  264|      0.264|
|       United States|              India|   69|      0.069|
|               Egypt|      United States|   24|      0.024|
|   Equatorial Guinea|      United States|    1|      0.001|
|       United States|          Singapore|   25|      0.025|
|       United States|            Grenada|   54|      0.054|
|          Costa Rica|      United States|  477|      0.477|
|             Senegal|      United States|   29|      0.029|
|       United States|   Marshall Islands|   44|      0.044|
|              Guyana|      United States|   17|      0.017|
|       United States|       Sint Maarten|   53|      0.053|
|               Malta|      United States|    1|      0.001|
|             Bolivia|  

In [17]:
# Add a copy of the 'count' column under the name 'count_duplicate'.
# The result is only displayed, not assigned, so `df` itself does not gain
# the new column.
df.withColumn('count_duplicate', df['count']).show(5)

+-----------------+-------------------+-----+---------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|count_duplicate|
+-----------------+-------------------+-----+---------------+
|    United States|            Romania|    1|              1|
|    United States|            Ireland|  264|            264|
|    United States|              India|   69|             69|
|            Egypt|      United States|   24|             24|
|Equatorial Guinea|      United States|    1|              1|
+-----------------+-------------------+-----+---------------+
only showing top 5 rows



In [19]:
# drop column
# `df` itself never gets a 'count_duplicate' column (withColumn() returns a new
# DataFrame and that result was not assigned), and drop() on a missing column
# is a silent no-op. Re-create the column in the same chain so that the drop
# actually removes something.
(df
 .withColumn('count_duplicate', df['count'])
 .drop('count_duplicate')
 .show())

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

## Rows

In [25]:
# Keep the rows with 'count' greater than 100 and collect() them, which
# returns the matching rows as a Python list of Row objects.
rows = df.filter(df['count'] > 100).collect()
# Take the first collected Row as the example used in the following cells.
row = rows[0]
row

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=264)

In [24]:
# Number of rows that passed the filter and were collected.
len(rows)

106

In [31]:
# Row is a tuple subclass, so count() here is the plain tuple method (not a
# Spark aggregation): it returns how many of the row's values equal 'Ireland'.
row.count('Ireland') # the row's values are ('United States', 'Ireland', 264)

1

In [34]:
# index() is the tuple method inherited by Row: it returns the position of
# the first value in the row that equals 264.
row.index(264) # positions 0, 1, 2 <-- 264 is the third value

2

In [36]:
# row to dict
# asDict() maps each column name to this row's value. The result is bound to
# `row_dict` rather than `dict`, because assigning to `dict` would shadow the
# built-in dict type for the rest of the kernel session.
row_dict = row.asDict()
row_dict

{'DEST_COUNTRY_NAME': 'United States',
 'ORIGIN_COUNTRY_NAME': 'Ireland',
 'count': 264}

## Close Session

In [44]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()