# PANDAS vs RDDs 

In [1]:
import pandas as pd 
import numpy as np 
from pyspark import SparkConf
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Test") \
    .getOrCreate()

# Dataframe from LIST 

In [2]:
#create Dataframe from List 

v_list = [['a','b','c'],['dddd','ddd','dddd'],['e','g','f']]
v_header = ['A' , 'B', 'C']

pd_df = pd.DataFrame(v_list ,columns=v_header)
pd_df

Unnamed: 0,A,B,C
0,a,b,c
1,dddd,ddd,dddd
2,e,g,f


In [3]:

#DATFRAME in spark has to be of same type .Column A cannot have int and str like tables in database .Pandas are flexible 
spark.createDataFrame(v_list,v_header).show()

+----+---+----+
|   A|  B|   C|
+----+---+----+
|   a|  b|   c|
|dddd|ddd|dddd|
|   e|  g|   f|
+----+---+----+



# Dataframe from DICTIONARY 

In [4]:
v_dict = {'a':[1,1,2] ,'b':[2,3,3] , 'c':[3,4,4] }

In [5]:
pd.DataFrame(v_dict)

Unnamed: 0,a,b,c
0,1,2,3
1,1,3,4
2,2,3,4


In [6]:
#not easy :)
spark.createDataFrame(np.array(list(v_dict.values())).T.tolist(),list(v_dict.keys())).show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  1|  3|  4|
|  2|  3|  4|
+---+---+---+



# FROM Database - Pyspark requires drivers for the database as compared to PANDAS 


# FROM CSV 

In [7]:
df_pd_csv=pd.read_csv('/root/golive/learning-apache-spark/data/Advertising.csv')
df_pd_csv.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [8]:
df_sp_csv=spark.read.csv('/root/golive/learning-apache-spark/data/Advertising.csv')

In [9]:
df_sp_csv.show(4)

+-----+-----+---------+-----+
|  _c0|  _c1|      _c2|  _c3|
+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
+-----+-----+---------+-----+
only showing top 4 rows



In [10]:
#was not reading header , had to specify explicitly 
df_sp_csv=spark.read.csv('/root/golive/learning-apache-spark/data/Advertising.csv',header=True)
df_sp_csv.show(4)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
+-----+-----+---------+-----+
only showing top 4 rows



In [11]:
df_p_json = pd.read_json('/root/golive/learning-apache-spark/data/data.json')
df_p_json.head(3)
           
                     

Unnamed: 0,id,sampling_rate,timestamp,location,sensor,sensordatavalues
0,1111920206,,2020-07-09 00:05:01,"{'id': 9158, 'latitude': '51.528', 'longitude'...","{'id': 18066, 'pin': '1', 'sensor_type': {'id'...","[{'id': 2395525594, 'value': '5.10', 'value_ty..."
1,1111920205,,2020-07-09 00:05:01,"{'id': 22674, 'latitude': '50.85127481584', 'l...","{'id': 36685, 'pin': '1', 'sensor_type': {'id'...","[{'id': 2395525587, 'value': '21.85', 'value_t..."
2,1111920204,,2020-07-09 00:05:01,"{'id': 16273, 'latitude': '51.234', 'longitude...","{'id': 29249, 'pin': '11', 'sensor_type': {'id...","[{'id': 2395525590, 'value': '12.38', 'value_t..."


In [12]:
# VERY similar results. The columns in different order ?? . 
df_s_json = spark.read.json('/root/golive/learning-apache-spark/data/data.json')
df_s_json.show(3)


+----------+--------------------+-------------+--------------------+--------------------+-------------------+
|        id|            location|sampling_rate|              sensor|    sensordatavalues|          timestamp|
+----------+--------------------+-------------+--------------------+--------------------+-------------------+
|1111920206|[60.1, DE, 0, 915...|         null|[18066, 1, [14, N...|[[2395525594, 5.1...|2020-07-09 00:05:01|
|1111920205|[41.8, BE, 1, 226...|         null|[36685, 1, [14, N...|[[2395525587, 21....|2020-07-09 00:05:01|
|1111920204|[423.8, DE, 0, 16...|         null|[29249, 11, [17, ...|[[2395525590, 12....|2020-07-09 00:05:01|
+----------+--------------------+-------------+--------------------+--------------------+-------------------+
only showing top 3 rows



In [13]:
#COLUMN Name & TYPE 
df_p_json.dtypes

id                           int64
sampling_rate              float64
timestamp           datetime64[ns]
location                    object
sensor                      object
sensordatavalues            object
dtype: object

In [14]:
df_s_json.dtypes

[('id', 'bigint'),
 ('location',
  'struct<altitude:string,country:string,exact_location:bigint,id:bigint,indoor:bigint,latitude:string,longitude:string>'),
 ('sampling_rate', 'bigint'),
 ('sensor',
  'struct<id:bigint,pin:string,sensor_type:struct<id:bigint,manufacturer:string,name:string>>'),
 ('sensordatavalues',
  'array<struct<id:bigint,value:string,value_type:string>>'),
 ('timestamp', 'string')]

In [15]:
#Fill NULL VALUES = .fillna('value')
#Replace values .replace ('[old_v1 , oldv2 ]' ,['new1',new2'])
df_pd_csv.head(4)
df_sp_csv.show(4)


+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
+-----+-----+---------+-----+
only showing top 4 rows



In [16]:
#Rename columns - selective 
mapping = {'Newspaper':'C','Sales':'D'}
df_pd_csv.rename(columns=mapping,inplace=True)
df_pd_csv.head()

Unnamed: 0,TV,Radio,C,D
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [17]:
df_sp_csv.withColumnRenamed('Newspaper','C').show(5)

+-----+-----+----+-----+
|   TV|Radio|   C|Sales|
+-----+-----+----+-----+
|230.1| 37.8|69.2| 22.1|
| 44.5| 39.3|45.1| 10.4|
| 17.2| 45.9|69.3|  9.3|
|151.5| 41.3|58.5| 18.5|
|180.8| 10.8|58.4| 12.9|
+-----+-----+----+-----+
only showing top 5 rows



In [18]:
#DROP columns 
#df_pd_csv
drop_c = ['C', 'D']
df_pd_csv.drop(drop_c,axis=1).head()
#df_pd_csv.head()


Unnamed: 0,TV,Radio
0,230.1,37.8
1,44.5,39.3
2,17.2,45.9
3,151.5,41.3
4,180.8,10.8


In [19]:
drop_c = ['Newspaper', 'Sales']
df_sp_csv.drop(*drop_c).show(4)

+-----+-----+
|   TV|Radio|
+-----+-----+
|230.1| 37.8|
| 44.5| 39.3|
| 17.2| 45.9|
|151.5| 41.3|
+-----+-----+
only showing top 4 rows



In [20]:
#Filter # cannot use "and" or "or" instead have to use & and | 

df_pd_csv[(df_pd_csv.TV > 200 ) & (df_pd_csv.Radio >40)].head()

Unnamed: 0,TV,Radio,C,D
36,266.9,43.8,5.0,25.4
47,239.9,41.5,18.5,23.2
52,216.4,41.7,39.6,22.6
58,210.8,49.6,37.7,23.8
61,261.3,42.7,54.7,24.2


In [21]:
df_sp_csv[(df_sp_csv.Newspaper<20)&(df_sp_csv.TV>100)].show(4)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|120.2| 19.6|     11.6| 13.2|
|214.7|   24|        4| 17.4|
|147.3| 23.9|     19.1| 14.6|
|262.9|  3.5|     19.5|   12|
+-----+-----+---------+-----+
only showing top 4 rows



In [22]:
# ADD A NEW COLUMN 
df_pd_csv['new_col'] = df_pd_csv['TV']/sum(df_pd_csv['TV']) 
df_pd_csv.head()

Unnamed: 0,TV,Radio,C,D,new_col
0,230.1,37.8,69.2,22.1,0.007824
1,44.5,39.3,45.1,10.4,0.001513
2,17.2,45.9,69.3,9.3,0.000585
3,151.5,41.3,58.5,18.5,0.005152
4,180.8,10.8,58.4,12.9,0.006148


In [23]:
#GROUP BY - 
# withColum groubyBY().agg aggregate function - collect() to actuallly run it 
# SYNTAX studentsDF
#  .groupBy("continent", "country")
#  .agg(count("*"))
#  .show()
import pyspark.sql.functions as F
df_sp_csv.groupBy("TV").agg(F.sum("TV")).show(4)
df_sp_csv.groupBy("TV","RADIO").agg(F.sum("TV")).show(4)


+-----+-------+
|   TV|sum(TV)|
+-----+-------+
|262.7|  262.7|
|  7.3|    7.3|
|134.3|  134.3|
|234.5|  234.5|
+-----+-------+
only showing top 4 rows

+-----+-----+-------+
|   TV|RADIO|sum(TV)|
+-----+-----+-------+
| 17.2| 45.9|   17.2|
|240.1| 16.7|  240.1|
|131.1| 42.8|  131.1|
| 73.4|   17|   73.4|
+-----+-----+-------+
only showing top 4 rows



# SQLs on dataframe 

Steps :
1. Dataframe to TEMP VIEW : # Register the DataFrame as a SQL temporary view
    df.createOrReplaceTempView("people")
    
        

In [24]:
df_sp_csv.createOrReplaceTempView('spark_df')
sql_out = spark.sql("select TV , sum(RADIO) from spark_df where TV > 100  group by TV order by TV asc ")

In [25]:
sql_out.show(5)

+-----+--------------------------+
|   TV|sum(CAST(RADIO AS DOUBLE))|
+-----+--------------------------+
|102.7|                      29.6|
|104.6|                       5.7|
|107.4|                      14.0|
|109.8|        62.099999999999994|
|110.7|                      40.6|
+-----+--------------------------+
only showing top 5 rows



In [26]:
spark.stop()