<a href="https://colab.research.google.com/github/saurin33/pyspark_basics/blob/master/pyspark_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Build the notebook's SparkSession, reusing an existing one if present
spark = (
    SparkSession.builder
    .appName("DataFrameBasics")
    .getOrCreate()
)

In [0]:
# Build a small DataFrame by hand: a list of row tuples plus the column names
dataframe = spark.createDataFrame(
    [
        (0, "Here is our DataFrame"),
        (1, "We are making one from scratch"),
        (2, "This will look ver similar to a Pandas DataFrame"),
    ],
    schema=["id", "words"],
)


In [5]:
# Print the rows as a text table. Long strings are cut off with "..." by
# default; pass truncate=False to show() to see the full values.
dataframe.show()

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|Here is our DataF...|
|  1|We are making one...|
|  2|This will look ve...|
+---+--------------------+



In [0]:
# Read in data from S3 Buckets
from pyspark import SparkFiles

In [0]:
# Register the remote CSV with the SparkContext; its local copy is looked up
# by file name with SparkFiles.get("food.csv") when the data is read.
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/food.csv"
spark.sparkContext.addFile(url)

In [12]:
# Load the downloaded CSV, using its first line as the column names,
# then preview up to ten rows
df = spark.read.csv(
    SparkFiles.get("food.csv"),
    sep=',',
    header=True,
)
df.show(n=10)

+-------+-----+
|   food|price|
+-------+-----+
|  pizza|    0|
|  sushi|   12|
|chinese|   10|
+-------+-----+



In [11]:
# No schema was supplied when reading, so every column (including price)
# is reported as a string.
df.printSchema()

root
 |-- food: string (nullable = true)
 |-- price: string (nullable = true)



In [13]:
# describe() returns a new DataFrame of summary statistics rather than
# printing them; chain .show() onto it to display the actual values.
df.describe()

DataFrame[summary: string, food: string, price: string]

In [0]:
from pyspark.sql.types import StructField, IntegerType, StructType, StringType

In [19]:
# Explicit schema: food is a nullable string, price is a nullable integer
schema = (
    StructType()
    .add("food", StringType(), True)
    .add("price", IntegerType(), True)
)
schema

StructType(List(StructField(food,StringType,true),StructField(price,IntegerType,true)))

In [21]:
# Re-read the same CSV, this time applying the explicit schema so that
# price is typed as an integer instead of a string
df = spark.read.csv(
    SparkFiles.get("food.csv"),
    schema=schema,
    header=True,
)
df

DataFrame[food: string, price: int]

In [22]:
# With the explicit schema applied, price is now an integer column.
df.printSchema()

root
 |-- food: string (nullable = true)
 |-- price: integer (nullable = true)



In [23]:
# Column lookup is case-insensitive under Spark's default settings, so
# 'Price' resolves to the 'price' column. The returned Column keeps the
# spelling used here.
df['Price']

Column<b'Price'>

In [24]:
# Indexing a DataFrame by column name yields a Column object, not the data.
type(df['price'])

pyspark.sql.column.Column

In [27]:
# select() returns a new DataFrame containing only the requested column.
df.select('price').show()

+-----+
|price|
+-----+
|    0|
|   12|
|   10|
+-----+



In [28]:
# withColumn() returns a new DataFrame with an added column; here
# 'new price' is a copy of 'price'. df itself is not modified.
df.withColumn('new price', df['price']).show()

+-------+-----+---------+
|   food|price|new price|
+-------+-----+---------+
|  pizza|    0|        0|
|  sushi|   12|       12|
|chinese|   10|       10|
+-------+-----+---------+



In [29]:
# withColumnRenamed() returns a new DataFrame in which 'price' is renamed
# to 'new price'.
df.withColumnRenamed('price', 'new price').show()

+-------+---------+
|   food|new price|
+-------+---------+
|  pizza|        0|
|  sushi|       12|
|chinese|       10|
+-------+---------+



In [30]:
# Column objects support arithmetic: 'doubleprice' holds price * 2 per row.
df.withColumn('doubleprice', df['price']*2).show()

+-------+-----+-----------+
|   food|price|doubleprice|
+-------+-----+-----------+
|  pizza|    0|          0|
|  sushi|   12|         24|
|chinese|   10|         20|
+-------+-----+-----------+



In [0]:
# Collect the column to the driver as a list of Row objects


In [31]:
# Each element of the result is a Row; read the value as row.price
# or row['price'].
df.select("price").collect()

[Row(price=0), Row(price=12), Row(price=10)]

In [0]:
import pandas as pd

In [0]:
# Convert the Spark DataFrame to a pandas DataFrame. toPandas() loads every
# row into the driver's memory, so use it only on small results.
pandas_df = df.toPandas()

In [34]:
# Display the converted frame; pandas adds its own integer row index.
pandas_df

Unnamed: 0,food,price
0,pizza,0
1,sushi,12
2,chinese,10
