# Apache Spark Basics

## Working with the Spark context

In [1]:
from pyspark import SparkContext
# getOrCreate() hands back the SparkContext that is already active, or starts
# a new one if none exists, so re-running this cell does not create a second context.
sc = SparkContext.getOrCreate()

## Working with the SQL context

In [3]:
from pyspark.sql import SQLContext

# Wrap the SparkContext `sc` in a SQLContext, the object the later cells use
# to build and read DataFrames.
# NOTE(review): SQLContext is the legacy entry point; recent Spark releases
# recommend SparkSession instead -- confirm against the target Spark version.
sqlCtx = SQLContext(sparkContext=sc)

## Define a schema

In [19]:
from pyspark.sql.types import FloatType, StringType, StructField, StructType

# Explicit schema: four string columns followed by two float columns.
# The third argument (True) marks each column as nullable.
df_schema = (
    StructType()
    .add("Wave_Name", StringType(), True)
    .add("Manufacturer", StringType(), True)
    .add("Gender", StringType(), True)
    .add("Task", StringType(), True)
    .add("Label", FloatType(), True)
    .add("Prediction", FloatType(), True)
)

## Create an empty DataFrame with the specified schema

In [20]:
# Build a DataFrame with no rows: the data is an empty RDD and the columns
# come entirely from df_schema.
df = sqlCtx.createDataFrame(
    data=sc.emptyRDD(),
    schema=df_schema,
)

In [21]:
# collect() returns the DataFrame's rows as a Python list; with no rows
# loaded the result is an empty list.
df.collect()

[]

## Read a CSV file into a DataFrame

In [22]:
# Load a CSV from the local filesystem. "header" treats the first line as
# column names; "inferSchema" asks Spark to work out the column types
# instead of reading everything as strings.
df = (
    sqlCtx.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv('file:///home/devuser1/people-example.csv')
)

In [23]:
# collect() brings every row back to the driver as a list of Row objects.
# That is fine for this small example file; for large data prefer a bounded
# call such as take(n) or show().
df.collect()

[Row(First Name=u'Bob', Last Name=u'Smith', Country=u'United States', age=24),
 Row(First Name=u'Alice', Last Name=u'Williams', Country=u'Canada', age=23),
 Row(First Name=u'Malcolm', Last Name=u'Jone', Country=u'England', age=22),
 Row(First Name=u'Felix', Last Name=u'Brown', Country=u'USA', age=23),
 Row(First Name=u'Alex', Last Name=u'Cooper', Country=u'Poland', age=23),
 Row(First Name=u'Tod', Last Name=u'Campbell', Country=u'United States', age=22),
 Row(First Name=u'Derek', Last Name=u'Ward', Country=u'Switzerland', age=25)]

## Write the DataFrame as a single CSV part file (Spark creates a directory named `out.csv`)

In [25]:
# Spark's CSV writer creates a *directory* at the target path holding one
# part file per partition. repartition(1) collapses the data into a single
# partition so that directory contains exactly one CSV part file.
(
    df.repartition(1)
    .write
    # The default save mode raises an error when the path already exists,
    # which makes this cell fail on any re-run; overwrite keeps it idempotent.
    .mode("overwrite")
    # Emit the column names as the first line, matching the header the
    # input file was read with.
    .option("header", "true")
    .csv('file:///home/devuser1/' + 'out.csv')
)