## Question

Find the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table.

## PySpark

### Setup

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the single SparkSession used throughout the PySpark section.
spark = SparkSession.builder.appName("challenge").getOrCreate()

# A SparkSession is already the entry point for SQL, so `sqlContext` is simply an
# alias kept for the cells below that call `sqlContext.sql(...)`. The SparkSession
# constructor takes a SparkContext, so wrapping an existing session in it
# (`SparkSession(spark)`) is not a supported way to obtain a SQL handle.
sqlContext = spark

# Only surface errors; Spark's default INFO/WARN logging floods notebook output.
spark.sparkContext.setLogLevel("ERROR")

### Solution

In [3]:
# Configure a CSV reader that treats the first row as the header and lets Spark
# infer column types, then load the stations file and display the resulting schema.
station_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
station_df = station_reader.load("data/stations.csv")
station_df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longitude: double (nullable = true)



In [4]:
# Preview five rows to sanity-check that the CSV was parsed as expected.
station_df.limit(5).show()

+---+---------+-----+-----------+-----------+
| ID|     City|State|  Lattitude|  Longitude|
+---+---------+-----+-----------+-----------+
|478|   Tipton|   IN|33.54792701|97.94286036|
|619|Arlington|   CO|75.17993079|92.94615894|
|711|   Turner|   AR|50.24380534|101.4580163|
|839|  Slidell|   LA|85.32270304|151.8743276|
|411|  Negreet|   LA| 98.9707194|105.3376115|
+---+---------+-----+-----------+-----------+



In [5]:
# Register the DataFrame as a temporary view so it can be queried with Spark SQL.
station_df.createOrReplaceTempView("tmpStation")

# Confirm the view is queryable by pulling five rows through SQL.
preview_query = "SELECT * FROM tmpStation LIMIT 5"
sqlContext.sql(preview_query).show()

+---+---------+-----+-----------+-----------+
| ID|     City|State|  Lattitude|  Longitude|
+---+---------+-----+-----------+-----------+
|478|   Tipton|   IN|33.54792701|97.94286036|
|619|Arlington|   CO|75.17993079|92.94615894|
|711|   Turner|   AR|50.24380534|101.4580163|
|839|  Slidell|   LA|85.32270304|151.8743276|
|411|  Negreet|   LA| 98.9707194|105.3376115|
+---+---------+-----+-----------+-----------+



**Q:** Find the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table, i.e. `COUNT(CITY) - COUNT(DISTINCT CITY)`. Both counts ignore NULL cities.

In [7]:
from pyspark.sql.functions import count, countDistinct

# DataFrame-API answer: non-null City values minus distinct City values.
city_total = count("City")
city_unique = countDistinct("City")
station_df.select(city_total - city_unique).show(n=5)

+------------------------------------+
|(count(City) - count(DISTINCT City))|
+------------------------------------+
|                                   3|
+------------------------------------+



In [8]:
# Spark SQL answer against the tmpStation view: total count, distinct count and
# their difference side by side, so the result can be checked by eye.
# A triple-quoted string replaces the backslash continuations, which break as soon
# as any whitespace follows a backslash and embed the source indentation in the SQL.
city_count_query = """
    SELECT count(city) AS citycount,
           count(DISTINCT city) AS distinctcitycount,
           count(city) - count(DISTINCT city) AS diffbetweenboth
    FROM tmpStation
"""
sqlContext.sql(city_count_query).show(n=5)

+---------+-----------------+---------------+
|citycount|distinctcitycount|diffbetweenboth|
+---------+-----------------+---------------+
|      282|              279|              3|
+---------+-----------------+---------------+



## Postgres

### Setup

In [9]:
import os
import pandas as pd

In [10]:
# Path to the helper notebook, resolved against the current working directory
# (so this cell assumes the kernel was started from the project root).
nb_path = os.path.join(os.getcwd(), 'utils/connect-postgres.ipynb')
# Execute the helper notebook inside this kernel.
# NOTE(review): presumably this defines `conn_str` and registers the `%%sql` magic
# used by the cells below; confirm against utils/connect-postgres.ipynb.
%run {nb_path}

In [11]:
# Re-read the same CSV with pandas so it can be written to Postgres.
# NOTE(review): this rebinds `station_df`, which held the Spark DataFrame in the
# PySpark section; re-running those earlier cells after this one would operate on
# the pandas frame instead. A distinct name would avoid the hidden-state hazard.
station_df = pd.read_csv("data/stations.csv")
station_df.head()

Unnamed: 0,ID,City,State,Lattitude,Longitude
0,478,Tipton,IN,33.547927,97.94286
1,619,Arlington,CO,75.179931,92.946159
2,711,Turner,AR,50.243805,101.458016
3,839,Slidell,LA,85.322703,151.874328
4,411,Negreet,LA,98.970719,105.337611


In [21]:
# Lower-case the column names before loading into Postgres, so the table's
# columns can be referenced unquoted in the SQL cells below.
station_df.columns = station_df.columns.str.lower()

In [22]:
# Check dtypes and non-null counts after the column rename.
station_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         282 non-null    int64  
 1   city       282 non-null    object 
 2   state      282 non-null    object 
 3   lattitude  282 non-null    float64
 4   longitude  282 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 11.1+ KB


In [24]:
# Write the frame to Postgres as public.station (without the pandas index).
# if_exists='replace' keeps this cell re-runnable: with the default ('fail'),
# pandas raises ValueError as soon as the table exists, so a second run of the
# notebook would stop here.
station_df.to_sql(
    'station',
    con=conn_str,
    index=False,
    schema='public',
    if_exists='replace',
)

In [25]:
%%sql
SELECT *
FROM public.station
LIMIT 5

Unnamed: 0,id,city,state,lattitude,longitude
0,478,Tipton,IN,33.547927,97.94286
1,619,Arlington,CO,75.179931,92.946159
2,711,Turner,AR,50.243805,101.458016
3,839,Slidell,LA,85.322703,151.874328
4,411,Negreet,LA,98.970719,105.337611


In [26]:
%%sql
SELECT COUNT(*)
FROM public.station

Unnamed: 0,count
0,282


### Solution

Find the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table, i.e. `COUNT(city) - COUNT(DISTINCT city)`.

In [28]:
%%sql
SELECT
    COUNT(city) AS citycount,
    COUNT(DISTINCT city) AS distinctcitycount,
    COUNT(city) - COUNT(DISTINCT city) AS diff_between_both
FROM public.station

Unnamed: 0,citycount,distinctcitycount,diff_between_both
0,282,279,3
