In [0]:
#import
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook.
# The name must be lowercase `spark`: every later cell refers to the session
# by that name, so binding it to `Spark` would leave `spark` undefined on a
# fresh kernel.
spark = (
    SparkSession.builder
    .appName("app")
    .getOrCreate()
)

In [0]:
# Build an RDD by handing a local Python list to parallelize().
local_numbers = [1, 2, 3, 4, 5, 6]
rdd = spark.sparkContext.parallelize(local_numbers)

# collect() brings the RDD's elements back as a list, which is printed.
print(rdd.collect())

[1, 2, 3, 4, 5, 6]


In [0]:
# Build an RDD whose elements are (string, integer) tuples.
data = [
    ("Java", 20000),
    ("Python", 10000),
    ("Scala", 30000),
]
rdd = spark.sparkContext.parallelize(data)

# Gather the elements back into a local list and print them.
collected_pairs = rdd.collect()
print(collected_pairs)


[('Java', 20000), ('Python', 10000), ('Scala', 30000)]


In [0]:
# Build an RDD directly from a range object.
# range(1, 6) yields 1 through 5; the stop value 6 is excluded.
one_to_five = range(1, 6)
rddRange = spark.sparkContext.parallelize(one_to_five)
print(rddRange.collect())

[1, 2, 3, 4, 5]


In [0]:
# Derive a new RDD from an existing one by applying a transformation.
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])

# map() applies the function to every element; here each value is doubled.
new_rdd = rdd.map(lambda value: value * 2)
doubled = new_rdd.collect()
print(doubled)

[2, 4, 6, 8, 10]


In [0]:
# import JSON
import json

# Create an RDD from a JSON document.
# `sc` is a short alias for spark.sparkContext. Bind it explicitly so this
# cell does not rely on the runtime having pre-defined `sc`.
sc = spark.sparkContext
json_data = '{"name":"Kumar","age":39,"city":"New York"}'

# json.loads() parses the JSON text into a dict; that dict becomes the
# single element of the RDD.
rdd_json = sc.parallelize([json.loads(json_data)])
print(rdd_json.collect())

[{'name': 'Kumar', 'age': 39, 'city': 'New York'}]


In [0]:
# Build a two-column DataFrame from a list of (name, salary) tuples.
data = [
    ('James', 3000),
    ('Anna', 4001),
    ('Robert', 6200),
]
column_names = ["name", "salary"]
df = spark.createDataFrame(data, column_names)
df.show()

# A DataFrame exposes its contents as an RDD of Row objects through `.rdd`.
rdd = df.rdd
print(rdd.collect())

+------+------+
|  name|salary|
+------+------+
| James|  3000|
|  Anna|  4001|
|Robert|  6200|
+------+------+

[Row(name='James', salary=3000), Row(name='Anna', salary=4001), Row(name='Robert', salary=6200)]


In [0]:

from datetime import datetime,date
import pandas as pd
#need to import for session creation
from pyspark.sql import SparkSession,Row

# Build a PySpark DataFrame from an RDD of 5-field tuples
# (int, float, str, date, datetime).
records = [
    (1, 4., 'aa', date(2000, 8, 1), datetime(2000, 8, 1, 12, 0)),
    (2, 8., 'bb', date(2000, 6, 2), datetime(2000, 6, 2, 12, 0)),
    (3, 5., 'cc', date(2000, 5, 3), datetime(2000, 5, 3, 12, 0)),
]
rdd = spark.sparkContext.parallelize(records)

# The schema list supplies only the column names.
column_names = ['a', 'b', 'c', 'd', 'e']
df = spark.createDataFrame(rdd, schema=column_names)

# Display the rows as a table.
df.show()

# Display the column names and types.
df.printSchema()


+---+---+---+----------+-------------------+
|  a|  b|  c|         d|                  e|
+---+---+---+----------+-------------------+
|  1|4.0| aa|2000-08-01|2000-08-01 12:00:00|
|  2|8.0| bb|2000-06-02|2000-06-02 12:00:00|
|  3|5.0| cc|2000-05-03|2000-05-03 12:00:00|
+---+---+---+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [0]:

#need to import for session creation
from pyspark.sql import SparkSession,Row

# Obtain the SparkSession (getOrCreate reuses an existing one if present).
spark = SparkSession.builder.getOrCreate()

# Build the DataFrame from Row objects; the schema list names the columns.
rows = [
    Row(a=1, b=4., c='rank1', d=date(2000, 8, 1), e=datetime(2000, 8, 1, 12, 0)),
    Row(a=2, b=8., c='rank2', d=date(2000, 6, 2), e=datetime(2000, 6, 2, 12, 0)),
    Row(a=4, b=5., c='rank3', d=date(2000, 5, 3), e=datetime(2000, 5, 3, 12, 0)),
]
column_names = ['a', 'b', 'c', 'd', 'e']
df = spark.createDataFrame(rows, schema=column_names)

# Display the rows as a table.
df.show()

# Display the column names and types.
df.printSchema()

+---+---+-----+----------+-------------------+
|  a|  b|    c|         d|                  e|
+---+---+-----+----------+-------------------+
|  1|4.0|rank1|2000-08-01|2000-08-01 12:00:00|
|  2|8.0|rank2|2000-06-02|2000-06-02 12:00:00|
|  4|5.0|rank3|2000-05-03|2000-05-03 12:00:00|
+---+---+-----+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [0]:
from datetime import datetime,date

import pandas as pd

# Obtain the SparkSession (getOrCreate reuses an existing one if present).
spark = SparkSession.builder.getOrCreate()

# Build a pandas DataFrame first: one dict key per column, each mapped to
# that column's list of values.
pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [4., 8., 5.],
    'c': ['aa', 'bb', 'cc'],
    'd': [date(2000, 8, 1), date(2000, 6, 2), date(2000, 5, 3)],
    'e': [
        datetime(2000, 8, 1, 12, 0),
        datetime(2000, 6, 2, 12, 0),
        datetime(2000, 5, 3, 12, 0),
    ],
})

# Convert the pandas DataFrame into a PySpark DataFrame.
# (A bare `df` expression that used to follow this line was removed: it was
# not the last expression in the cell, so it displayed nothing.)
df = spark.createDataFrame(pandas_df)

# Display the rows as a table.
df.show()

# Display the column names and types.
df.printSchema()

+---+---+---+----------+-------------------+
|  a|  b|  c|         d|                  e|
+---+---+---+----------+-------------------+
|  1|4.0| aa|2000-08-01|2000-08-01 12:00:00|
|  2|8.0| bb|2000-06-02|2000-06-02 12:00:00|
|  3|5.0| cc|2000-05-03|2000-05-03 12:00:00|
+---+---+---+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)

