In [0]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *

#-----------------------------------#

# Obtain the session through the builder so that an already-running session
# (for example one supplied by the hosting environment) is reused instead of
# a second one being constructed by hand.
spark = SparkSession.builder.getOrCreate()
# Keep the `sc` handle: the RDD examples further down call sc.parallelize().
sc = spark.sparkContext


In [0]:
from pyspark.sql.types import StructType, StructField, StringType

# Every column of this sample frame is a nullable string.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("language", "year", "users", "admin", "country")
])

# One tuple per row: (language, year, users, admin, country).
data = (
    ('java', '2020', '222222', 'somesh', 'india'),
    ('python', '2222', '222222', 'ram', 'usa'),
    ('Scala', '2021', '222222', 'sol', 'germany'),
)

df = spark.createDataFrame(data, schema)
df.show()

In [0]:
# Drop the two columns listed in `cols`; note that `df` is rebound to the
# narrower frame.
cols = ['admin', 'country']
df = df.drop(*cols)
df.show()

In [0]:
# Overwrite "year" with a label: "2020" -> PAST, "2222" -> Future,
# anything else -> Current.
year_label = (
    when(col("year") == "2020", "PAST")
    .when(col("year") == "2222", "Future")
    .otherwise("Current")
)
df2 = df.withColumn("year", year_label)

In [0]:
# Print the rows of df2 (the frame assigned in the preceding cell).
df2.show()

In [0]:
# Keep every existing column and append the year label as a new column "New"
# ("2020" -> PAST, "2222" -> Future, anything else -> Current).
year_label_col = (
    when(col("year") == "2020", "PAST")
    .when(col("year") == "2222", "Future")
    .otherwise("Current")
    .alias("New")
)
df3 = df.select(col("*"), year_label_col)

df3.show()


In [0]:
# Derive "new_col" from "year" with a SQL CASE expression.
# The fragments are joined by implicit string concatenation, so each one ends
# with a space to keep the next keyword from fusing with the preceding
# literal. Both year values are quoted because "year" is a string column.
df1 = df.withColumn(
    'new_col',
    expr(
        "case when year = '2020' then 'PAST' "
        "when year = '2021' then 'Current' "
        "else 'Future' end"
    ),
)

df1.show()

In [0]:
# Keep every existing column and append "new_col", computed by a SQL CASE
# expression over "year".
# Each fragment ends with a space so the concatenated SQL keeps its keywords
# separated from the preceding literal; both year values are quoted because
# "year" is a string column.
df4 = df.select(
    col("*"),
    expr(
        "case when year = '2020' then 'PAST' "
        "when year = '2021' then 'Current' "
        "else 'Future' end"
    ).alias('new_col'),
)

df4.show()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Rows are (id, code, amt); only "id" is an integer, the rest are strings.
data = (
    (66, "a", "4"),
    (67, "a", "0"),
    (70, "b", "4"),
    (71, "d", "4"),
)

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("code", StringType(), True),
    StructField("amt", StringType(), True),
])

dataDF = spark.createDataFrame(data, schema)
dataDF.show()

In [0]:
# Classify each row from code/amt; the branches are tried in order and the
# first matching one supplies the value.
code_is_a_or_d = (col("code") == "a") | (col("code") == "d")
code_b_with_amt_4 = (col("code") == "b") & (col("amt") == "4")

ddf = dataDF.withColumn(
    "new_column",
    when(code_is_a_or_d, "A")
    .when(code_b_with_amt_4, "B")
    .otherwise("A1"),
)

ddf.show()

In [0]:
from pyspark.sql import Row

# (name, age) pairs; the list is reused by the next example as well.
l = [('Alice', 2), ('somesh', 4), ('soh', 54), ('raja', 6), ('bob', 8), ('cate', 9)]
Person = Row('name', 'age')
rdd = sc.parallelize(l)
# Wrap every tuple in a Person Row so the frame picks up the field names.
person = rdd.map(lambda r: Person(*r))
# `sqlContext` is never defined in this notebook, so the frame is built
# through the `spark` session created in the first cell.
df2 = spark.createDataFrame(person)
df2.show()

In [0]:
from pyspark.sql.types import *

# Same (name, age) tuples as above, this time with an explicit schema
# instead of Row-based inference.
rdd = sc.parallelize(l)
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
# `sqlContext` is never defined in this notebook, so the frame is built
# through the `spark` session created in the first cell.
df3 = spark.createDataFrame(rdd, schema)
df3.show()

In [0]:
# Select name and age and add a constant string column called "literals".
literal_one = lit("1").alias("literals")
df = df3.select(col("name"), col("age"), literal_one)
df.show()

In [0]:
# Add "lit_value2": "8" for ages in the inclusive range 2..8, "9" otherwise.
# The column is spelled "age" in both comparisons, matching the schema; the
# mixed "Age"/"age" spelling resolves only under case-insensitive column
# resolution.
age_in_range = (col("age") >= 2) & (col("age") <= 8)
df = df3.withColumn(
    "lit_value2",
    when(age_in_range, lit("8")).otherwise(lit("9")),
)
df.show()

In [0]:
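# Spark SQL data types (reference list only, not executable code; some of
# these names may not be exposed by pyspark.sql.types):
#
#   StringType            ShortType
#   ArrayType             IntegerType
#   MapType               LongType
#   StructType            FloatType
#   DateType              DoubleType
#   TimestampType         DecimalType
#   BooleanType           ByteType
#   CalendarIntervalType  HiveStringType
#   BinaryType            ObjectType
#   NumericType           NullType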

In [0]:
# Rows are (firstName, age, jobStartDate, isGraduated, gender, salary).
# NOTE(review): the third row's jobStartDate ("06-01-1992") is not in the
# yyyy-MM-dd form used by the other rows; a plain cast to DateType is expected
# to yield null for it. Confirm whether that is intentional.
simpleData = (
    ("James", 34, "2006-01-01", "true", "M", 3000.60),
    ("Michael", 33, "1980-01-10", "true", "F", 3300.80),
    ("Robert", 37, "06-01-1992", "false", "M", 5000.50),
)

# StructType is given a list of fields (its documented input) rather than a
# tuple, so the schema behaves like any other StructType (for example in
# .add() or when compared with df.schema).
simpleSchema = StructType([
    StructField("firstName", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("jobStartDate", StringType(), True),
    StructField("isGraduated", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", DoubleType(), True),
])

df = spark.createDataFrame(data=simpleData, schema=simpleSchema)
df.printSchema()
df.show()

In [0]:
# Re-type three columns under their existing names:
# age -> string, isGraduated -> boolean, jobStartDate -> date.
df2 = (
    df.withColumn("age", col("age").cast(StringType()))
      .withColumn("isGraduated", col("isGraduated").cast(BooleanType()))
      .withColumn("jobStartDate", col("jobStartDate").cast(DateType()))
)
df2.printSchema()

In [0]:
# Cast with SQL expressions instead of Column.cast(); only the three listed
# columns are selected into df3.
cast_exprs = [
    "cast(age as int) age",
    "cast(isGraduated as string) isGraduated",
    "cast(jobStartDate as string) jobStartDate",
]
df3 = df2.selectExpr(*cast_exprs)
df3.printSchema()
df3.show()

In [0]:
# Register df3 as a temporary view and cast its columns with the SQL
# conversion functions STRING(), BOOLEAN() and DATE().
df3.createOrReplaceTempView("CastExample")
cast_query = (
    "SELECT STRING(age),BOOLEAN(isGraduated), "
    "        DATE(jobStartDate) from CastExample"
)
df4 = spark.sql(cast_query)
df4.printSchema()
df4.show()

In [0]:
# Rows are (Product, Amount, Country).
data = (
    ("Banana", 1000, "USA"), ("Carrots", 1500, "USA"), ("Beans", 1600, "USA"),
    ("Orange", 2000, "USA"), ("Orange", 2000, "USA"), ("Banana", 400, "China"),
    ("Carrots", 1200, "China"), ("Beans", 1500, "China"), ("Orange", 4000, "China"),
    ("Banana", 2000, "Canada"), ("Carrots", 2000, "Canada"), ("Beans", 2000, "Mexico"),
)

# StructType is given a list of fields (its documented input) rather than a
# tuple, so the schema behaves like any other StructType (for example in
# .add() or when compared with df.schema).
simpleSchema = StructType([
    StructField("Product", StringType(), True),
    StructField("Amount", IntegerType(), True),
    StructField("Country", StringType(), True),
])

df = spark.createDataFrame(data, simpleSchema)
df.show()

In [0]:
# Pivot: one row per Product, one column per Country, each cell holding the
# summed Amount.
pivotDF = (
    df.groupBy("Product")
      .pivot("Country")
      .sum("Amount")
)
pivotDF.show()

In [0]:
# Unpivot: turn the per-country columns back into (Country, Total) rows and
# discard the combinations that had no data.
# stack() lists all four countries present in the data; leaving USA out
# would silently drop every USA total from the result.
unPivotDF = pivotDF.select(
    "Product",
    expr(
        "stack(4, 'Canada', Canada, 'China', China, "
        "'Mexico', Mexico, 'USA', USA) as (Country,Total)"
    ),
).where("Total is not null")
unPivotDF.show()

In [0]:
# Rows are (Name, Dept, Salary); Salary is an integer amount.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("Raman", "Finance", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
)

# Salary is declared as IntegerType to match the integer values above, so
# that ordering and min/max over it are numeric rather than lexicographic.
# StructType is given a list of fields (its documented input), not a tuple.
simpleSchema = StructType([
    StructField("Name", StringType(), True),
    StructField("Dept", StringType(), True),
    StructField("Salary", IntegerType(), True),
])

df = spark.createDataFrame(data=simpleData, schema=simpleSchema)
df.show()

In [0]:
from pyspark.sql import Window

# One row per department with the lowest salary: number the rows of each
# department by ascending salary and keep row 1.
w2 = Window.partitionBy("dept").orderBy(col("salary"))
(
    df.withColumn("row", row_number().over(w2))
      # row_number() yields an integer, so it is compared with the int 1
      # rather than the string '1'.
      .where(col("row") == 1)
      .drop("row")
      .show()
)

In [0]:
# One row per department with the highest salary: number the rows of each
# department by descending salary and keep row 1.
w3 = Window.partitionBy("dept").orderBy(col("salary").desc())
(
    df.withColumn("row", row_number().over(w3))
      # row_number() yields an integer, so it is compared with the int 1
      # rather than the string '1'.
      .where(col("row") == 1)
      .drop("row")
      .show()
)

In [0]:
# Per-department aggregates computed over an unordered window, reduced to one
# row per department by keeping only the first row of the ordered window w3.
w4 = Window.partitionBy("dept")
# avg/sum/min/max here are the pyspark.sql.functions aggregates brought in by
# the star import at the top of the notebook, not the Python builtins.
aggDF = (
    df.withColumn("row", row_number().over(w3))
      .withColumn("avg", avg(col("salary")).over(w4))
      .withColumn("sum", sum(col("salary")).over(w4))
      .withColumn("min", min(col("salary")).over(w4))
      .withColumn("max", max(col("salary")).over(w4))
      .where(col("row") == 1)
      .select("dept", "avg", "sum", "min", "max")
)
# show() returns None, so it is called separately to keep the DataFrame
# itself bound to aggDF.
aggDF.show()