# PySpark for Beginners

In [None]:
pip install pyspark

# **Create Spark Session**

In [None]:
from pyspark.sql import SparkSession

# Build a session against a local master, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('spark-dataframe-demo')
    .getOrCreate()
)

In [None]:
# Evaluate the session object so the notebook shows it as the cell output.
spark

# **1. Create DataFrame (with and without schema)**

**create Row**

In [None]:
from pyspark.sql import Row

# A Row built from bare field names serves as a reusable record template.
Employee = Row(
    "firstName",
    "lastName",
    "email",
    "salary",
)

# Calling the template with positional values produces concrete rows.
e1, e2 = (
    Employee('x', None, 'y', 100),
    Employee('z', 'x', 'a', 99.9),
)

In [None]:
# Compare the template with an instance: print each object's type next to
# its first element.
print(f"{type(Employee)} .... {Employee[0]}")
print(f"{type(e1)} .... {e1[0]}")

print(50 * "=")
print(Employee, e1, sep="\n")

**Create a table (DataFrame) from a list of Rows**

In [None]:
# Rows can come from a positional template or from keyword arguments that
# name each field explicitly.
row = Row("col1Name", "col2Name")
r1 = row(101, 'a')
r2, r3 = Row(col1Name=102, col2Name='b'), Row(col1Name=103, col2Name='c')

# No schema is supplied here, so the column types come from the row values.
dframe = spark.createDataFrame(data=[r1, r2, r3])
display(dframe)

In [None]:
# Print the DataFrame's rows as a text table.
dframe.show()

In [None]:
# Explicit schema: column names and types are declared up front rather than
# inferred from the data.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = (
    StructType()
    .add('col1Name', IntegerType(), nullable=True)
    .add('col2Name', StringType(), nullable=True)
)

# The extra row carries a None in col2Name, which is declared nullable.
r4 = row(104, None)
dframe = spark.createDataFrame(data=[r1, r2, r3, r4], schema=schema)
display(dframe)  # note the declared types

In [None]:
# Print the rows of the DataFrame built with the explicit schema.
dframe.show()

# **2. DataFrame from CSV**

In [None]:
# Path of the CSV file read by the cells below.
CSV_PATH = "../input/facebook-keyword-extraction-competition/Train.csv"

**Implicit schema**

In [None]:
# Take the column names from the header line and let Spark infer the types.
df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv(CSV_PATH)
)
display(df)  # note the inferred schema types

In [None]:
# Print the rows read with the inferred schema.
df.show() # display values

> Note the null values in the output above!

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the four CSV columns, declared instead of inferred.
# NOTE(review): every field is declared nullable=False although the data
# shown earlier appears to contain nulls; Spark's file readers presumably
# treat columns as nullable regardless. Confirm before relying on this.
schema = StructType([
    StructField('Id', IntegerType(), nullable=False),
    StructField('Title', StringType(), nullable=False),
    StructField('Body', StringType(), nullable=False),
    StructField('Tags', StringType(), nullable=False),
])

# header=True is required here just as in the inferred-schema read of the
# same file: without it the header line is parsed as a data row.
df2 = spark.read.csv(path=CSV_PATH, schema=schema, header=True)
display(df2)  # note the explicit schema types

In [None]:
# Show the DataFrame that was read with the explicit schema (df2), not the
# inferred-schema df that was already displayed earlier.
df2.show()