In [None]:
from pyspark.sql import SparkSession

In [None]:
from pyspark.sql.functions import *

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from decimal import Decimal

In [None]:
# Build (or reuse) the Hive-enabled Spark session that the rest of the notebook uses.
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
    .enableHiveSupport()
    .appName("Spark SQL - Data processing - Linear regression")
    .master("yarn")
    .getOrCreate()
)
       

In [None]:
# Same builder settings as the previous cell, with a different application name.
# NOTE(review): a session was already created by the previous cell, so getOrCreate()
# presumably returns that existing session and the app name set here never takes
# effect. Confirm which of the two builder cells is meant to be kept.
spark = SparkSession.\
        builder.\
        config("spark.sql.warehouse.dir","/user/hive/warehouse").\
        enableHiveSupport().\
        appName("Exercise-01 | Get Monthly crime count by type ").\
        master("yarn").\
        getOrCreate()

In [None]:
spark

In [None]:
import getpass
username = getpass.getuser()

In [None]:
spark.conf.set("spark.sql.shuffle.partitions", "2")

In [None]:
# Create the working database.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE DATABASE raises once hr_db exists.
spark.sql("""
create database if not exists hr_db
""")

In [None]:
spark.sql("""
show databases
""").show(truncate = False)

In [None]:
spark.sql("use hr_db")

In [None]:
spark.sql("select current_database()").show()

In [None]:
# Create the employees table in the current database, stored as tab-delimited text.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE raises once the table exists.
spark.sql("""
CREATE TABLE IF NOT EXISTS employees (
  employee_id     int,
  first_name      varchar(20),
  last_name       varchar(25),
  email           varchar(25),
  phone_number    varchar(20),
  hire_date       date,
  job_id          varchar(10),
  salary          decimal(8,2),
  commission_pct  decimal(2,2),
  manager_id      int,
  department_id   int
) ROW FORMAT
    DELIMITED FIELDS TERMINATED BY '\t'
""")

In [None]:
# Load the local employees file into hr_db.employees.
# OVERWRITE replaces the table contents, so re-running this cell does not append
# a second copy of every row (plain INTO TABLE appends on each run).
spark.sql("""
Load data LOCAL inpath '/home/forgcpmak/hr_db/employees' OVERWRITE INTO TABLE hr_db.employees
""")

In [None]:
spark.sql("""
select * from hr_db.employees
""").show(25,truncate = False)

In [None]:
# Window-function demo on the employees table:
#   employee_count - number of employees in the same department
#   rnk            - salary rank across all employees (no PARTITION BY), highest first
#   lead_emp_id /
#   lead_emp_sal   - id and salary of the next row within the department when ordered
#                    by salary descending; 0 when there is no next row
# Only the first 3 rows (ordered by employee_id) are displayed.
spark.sql ("""
SELECT employee_id, 
       department_id, 
       salary,
       count(1) OVER (PARTITION BY department_id) AS employee_count,
       rank() OVER (ORDER BY salary DESC) AS rnk,
       lead(employee_id,1,0) OVER (PARTITION BY department_id ORDER BY salary desc)  lead_emp_id,
       lead(salary,1,0) OVER (PARTITION BY department_id ORDER BY salary desc) AS lead_emp_sal
FROM employees
ORDER BY employee_id
""").show(3,truncate = False)

In [None]:
# Attach to every employee row the total salary of that employee's department
# (sum over a department_id partition), listed by department.
spark.sql ("""
SELECT e.employee_id, e.department_id, e.salary,
       sum(e.salary) 
         OVER (PARTITION BY e.department_id)
         AS department_salary_expense
FROM employees e
ORDER BY e.department_id
""").show()

In [None]:
# CTAS: materialise per-day revenue (sum of order item subtotals, rounded to 2 places)
# for COMPLETE/CLOSED orders as table retail_db.daily_revenue.
# NOTE(review): there is no IF NOT EXISTS here, so this presumably fails when the
# table is left over from an earlier run; the drop cell currently comes after it.
spark.sql("""
CREATE TABLE retail_db.daily_revenue
AS
SELECT o.order_date,
       round(sum(oi.order_item_subtotal), 2) AS revenue
FROM retail_db.orders o 

JOIN retail_db.order_items oi
ON o.order_id = oi.order_item_order_id

WHERE o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY o.order_date
""")
          

In [None]:
spark.sql("""
drop table if exists retail_db.daily_revenue
""")

In [None]:
# first_value demo: for every row, also return the product id and revenue of the
# highest-revenue row for the same order_date. Limited to 100 rows.
# NOTE(review): daily_product_revenue is not created anywhere in this notebook and is
# referenced without a database prefix, while an earlier cell issues `use hr_db` and
# the other retail tables are addressed as retail_db.* - confirm the intended database.
spark.sql("""
SELECT t.*,
  first_value(order_item_product_id) OVER (
    PARTITION BY order_date ORDER BY revenue DESC
  ) first_product_id,
  first_value(revenue) OVER (
    PARTITION BY order_date ORDER BY revenue DESC
  ) first_revenue
FROM daily_product_revenue t
ORDER BY order_date, revenue DESC
LIMIT 100
""")

In [None]:
help (spark.read.csv)

In [None]:
spark.read.csv?

In [None]:
# Preview the retail orders file using an explicit DDL schema (no header row).
# The chain is wrapped in parentheses rather than backslash continuations: the
# original had a space after one backslash, which Python rejects as a SyntaxError.
(
    spark.read
    .csv(
        '/user/forgcpmak/retail_db/orders',
        header=False,
        schema='''
            order_id INT, 
            order_date STRING, 
            order_customer_id INT, 
            order_status STRING
        '''
    )
    .show(25, truncate=False)
)

In [None]:
# Read the employees file with schema inference and keep only the resulting schema object.
# NOTE(review): no delimiter is passed here, while a later cell reads this same path
# with delimiter '\\t' - presumably inference therefore sees a single column; confirm.
employee_schema = spark. \
       read. \
    csv('/user/forgcpmak/data/data/hr_db/employees',
        header= False,
        inferSchema=True
       ).\
    schema

In [None]:
type(employee_schema)

In [None]:
# Display the inferred schema. The original referenced `order_schema`, which is never
# defined, and called .show(), which a schema object does not provide.
employee_schema.simpleString()

In [None]:
# Re-read the employees file, this time applying the previously inferred schema.
employeeDF = spark.read.csv(
    '/user/forgcpmak/data/data/hr_db/employees',
    schema=employee_schema,
    header=False,
)

In [None]:
employeeDF.show()

In [None]:
# `orders` is never defined in this notebook (the orders preview above is not assigned),
# so print the schema of the DataFrame that was just loaded instead.
employeeDF.printSchema()

In [None]:
from pyspark import SparkContext
SparkContext

In [None]:
help(spark.read.csv)

In [None]:
from pyspark.sql.functions import *

In [None]:
help(SparkContext.textFile)

In [None]:
employeeDF = spark.read.text("/user/forgcpmak/data/data/hr_db/employees")

In [None]:
# Load the crimes extract: ';'-delimited, header row present, column types inferred.
crimeDf = (
    spark.read
    .options(delimiter=';', inferSchema=True, header=True)
    .csv("/user/forgcpmak/data/Crimes_-_2001_to_Present.csv")
)

In [None]:
# Replace the column names that contain spaces with space-free names.
for old_name, new_name in [
    ('Primary Type', 'PrimaryType'),
    ('FBI Code', 'FBICode'),
    ('X Coordinate', 'X_Coordinate'),
    ('Y Coordinate', 'Y_Coordinate'),
]:
    crimeDf = crimeDf.withColumnRenamed(old_name, new_name)

crimeDf.printSchema()

In [None]:
# Key point: the pattern passed to to_date ('MM/dd/yyyy') must match how the date is
# written in the file. The first 10 characters of 'Date' are parsed into a date, and
# the month key (yyyyMM) is derived from that parsed date.
(
    crimeDf
    .select('Date', 'PrimaryType')
    .withColumn(
        'DateStringConvertedToDate',
        to_date(substring(col('Date'), 1, 10), 'MM/dd/yyyy'),
    )
    .withColumn(
        'Month',
        date_format(to_date(substring(col('Date'), 1, 10), 'MM/dd/yyyy'), 'yyyyMM'),
    )
    .show(2, truncate=False)
)

In [None]:
# Write the monthly crime counts as one gzip-compressed, tab-separated file.
# Compression has to be requested explicitly through the `codec` option.
#
# repartition(1) shuffles the rows, so sorting *before* it does not guarantee the row
# order inside the written file. Repartition first, then sort within the single
# partition, so the file really is ordered by Month asc / CrimeCount desc.
(
    crimeDf
    .withColumn('Month', date_format(to_date(substring(col('Date'), 1, 10), 'MM/dd/yyyy'), 'yyyyMM'))
    .groupBy(col('Month'), col('PrimaryType'))
    .agg(count(lit(1)).cast('int').alias('CrimeCount'))
    .repartition(1)
    .sortWithinPartitions(col('Month').asc(), col('CrimeCount').desc())
    .write
    .format('csv')
    .mode('overwrite')
    .options(sep='\\t', header=True, codec="org.apache.hadoop.io.compress.GzipCodec")
    .save("/user/forgcpmak/data/CrimeCountByMonth/CrimeCountByMonth")
)

In [None]:
# Save the monthly crime counts in ORC format (single output file).
# The sort is applied after repartition(1): the shuffle done by repartition does not
# preserve an ordering established before it.
(
    crimeDf
    .withColumn('Month', date_format(to_date(substring(col('Date'), 1, 10), 'MM/dd/yyyy'), 'yyyyMM'))
    .groupBy(col('Month'), col('PrimaryType'))
    .agg(count(lit(1)).cast('int').alias('CrimeCount'))
    .select(col('Month'), col('PrimaryType').alias('PrimaryType'), col('CrimeCount'))
    .repartition(1)
    .sortWithinPartitions(col('Month').asc(), col('CrimeCount').desc())
    .write
    .mode('overwrite')
    .save("/user/forgcpmak/data/CrimeCountByMonthOrc", format="orc")
)


In [None]:
# Saving in Avro format does not work on a plain Spark install, as it needs an
# additional library to be available to the session.
# The column is referenced as 'PrimaryType': crimeDf was renamed earlier, so the old
# name 'Primary Type' no longer resolves. The sort is applied after repartition(1)
# because the repartition shuffle does not preserve an earlier ordering.
(
    crimeDf
    .withColumn('Month', date_format(to_date(substring(col('Date'), 1, 10), 'MM/dd/yyyy'), 'yyyyMM'))
    .groupBy(col('Month'), col('PrimaryType'))
    .agg(count(lit(1)).cast('int').alias('CrimeCount'))
    .select(col('Month'), col('PrimaryType'), col('CrimeCount'))
    .repartition(1)
    .sortWithinPartitions(col('Month').asc(), col('CrimeCount').desc())
    .write
    .mode('overwrite')
    .save("/user/forgcpmak/data/CrimeCountByMonthAvro", format="avro")
)

In [None]:
# To save in a specific format, either call the format-named writer method
# (.parquet(...), .orc(...)) or pass format='orc' / 'parquet' to save().
# The column is referenced as 'PrimaryType': crimeDf was renamed earlier, so the old
# name 'Primary Type' no longer resolves. The sort is applied after repartition(1)
# because the repartition shuffle does not preserve an earlier ordering.
(
    crimeDf
    .withColumn('Month', date_format(to_date(substring(col('Date'), 1, 10), 'MM/dd/yyyy'), 'yyyyMM'))
    .groupBy(col('Month'), col('PrimaryType'))
    .agg(count(lit(1)).cast('int').alias('CrimeCount'))
    .select(col('Month'), col('PrimaryType'), col('CrimeCount'))
    .repartition(1)
    .sortWithinPartitions(col('Month').asc(), col('CrimeCount').desc())
    .write
    .mode('overwrite')
    .parquet("/user/forgcpmak/data/CrimeCountByMonthParq")
)

In [None]:
# 'Date' is a string in the source file: parse its first 10 characters with to_date
# using the matching pattern, then format the parsed date as yyyyMM to get the month.
# Result: one row per (Month, PrimaryType) with its crime count, ordered by month.
crimeCountByMonthDF = (
    crimeDf
    .withColumn('Month', date_format(to_date(substring(col('Date'), 1, 10), 'MM/dd/yyyy'), 'yyyyMM'))
    .groupBy(col('Month'), col('PrimaryType'))
    .agg(count(lit(1)).alias('CrimeCount'))
    .orderBy(col('Month').asc())
)

In [None]:
crimeCountByMonthDF.printSchema()

In [None]:
## Demo: using the rank() window function.

# Steps 1-2: import Window and the rank function.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Step 3: window spec - one partition per PrimaryType, rows ordered by CrimeCount
# descending, so rank == 1 marks the maximum count for each PrimaryType.
windowSpec = Window.partitionBy("PrimaryType").orderBy(col("CrimeCount").desc())

# Step 4: apply the window spec with rank(), then keep the top-ranked PROSTITUTION row(s).
(
    crimeCountByMonthDF
    .withColumn("rank", rank().over(windowSpec))
    .filter(crimeCountByMonthDF['PrimaryType'] == 'PROSTITUTION')
    .filter(col('rank') == 1)
    .show()
)


In [None]:
# For every PrimaryType keep the month(s) with the highest count (rank 1), drop the
# helper rank column and list the categories by that peak count, largest first.
highestCrimeMonthsCategoryWiseDf = (
    crimeCountByMonthDF
    .withColumn("rank", rank().over(windowSpec))
    .filter(col('rank') == 1)
    .drop('rank')
    .orderBy(col('CrimeCount').desc())
)

In [None]:
# Preview the month key derived from 'Date'.
# Two fixes compared with the original cell:
#   * the column is 'PrimaryType' (crimeDf was renamed earlier; 'Primary Type' no
#     longer resolves);
#   * 'Date' is parsed with the same 'MM/dd/yyyy' pattern the other cells use before
#     date_format is applied, instead of formatting the unparsed string directly.
(
    crimeDf
    .withColumn(
        'CrimeMmonth',
        date_format(to_date(substring(col('Date'), 1, 10), 'MM/dd/yyyy'), 'yyyyMM'),
    )
    .select('Date', 'PrimaryType', 'CrimeMmonth')
    .show(2, truncate=False)
)

In [None]:
highestCrimeMonthsCategoryWiseDf.printSchema()

In [None]:
# Read the tab-delimited employees file with an explicit DDL schema string.
# Note: the trailing backslashes inside the triple-quoted schema string are Python
# line continuations within the literal, so those lines are joined in the string.
df3 = spark.read.options(delimiter='\\t') \
     .schema ("""
                Id int , \
                FirstName String , \
                LastName String, \
                ShortName String, \
                MobileNumber String ,\
                DateofBirth Date, \
                Department string,
                Salary Double,
                HikePct Double,
                ManagerId Int,
                DepatMentId Int
             """)\
     .csv("/user/forgcpmak/data/data/hr_db/employees")

In [None]:
# Same tab-delimited file, but with column types inferred instead of declared.
df4 = (
    spark.read
    .options(delimiter='\\t', inferSchema=True)
    .csv("/user/forgcpmak/data/data/hr_db/employees")
)

In [None]:
df3.printSchema()

In [None]:
df3.filter ("ManagerId is null").show()

In [None]:
# Column-API equivalent of the SQL-string filter "ManagerId is null".
# Comparing the column with the string 'null' never matches a missing value;
# isNull() is the null test.
df3.filter(df3['ManagerId'].isNull()).show()

In [None]:
# Add the weekday name of the birth date and keep only employees born on a Sunday.
(
    df3
    .withColumn("BirthDayName", date_format(col("DateofBirth"), 'EEEE'))
    .filter("( date_format(DateofBirth,'EEEE') = 'Sunday')")
    .show()
)

In [None]:
df3.select(countDistinct(df3['Department'])).show()

In [None]:
df3.filter("FirstName like 'D%'").show()

In [None]:
df3.filter( col("FirstName").like("D%")).show()

In [None]:
df3.groupBy(df3['Department']).count().sort(col('Department'))

In [None]:
df3.filter('HikePct is not null').show()

In [None]:
# Rows with a positive, non-null HikePct, grouped by (Department, HikePct):
# row count and average HikePct per group, highest average first.
(
    df3
    .filter(df3['HikePct'].isNotNull() & (df3['HikePct'] > 0.0))
    .groupBy(df3['Department'], df3['HikePct'])
    .agg(
        count(lit(1)).alias("TotalCount"),
        round(avg(df3['HikePct']), 2).alias("AvgHikePct"),
    )
    .sort(col('AvgHikePct').desc())
    .show()
)

In [None]:
# Same aggregation as a rollup over (Department, HikePct), restricted to rows with
# HikePct above 0.2; sorted by Department then HikePct.
(
    df3
    .filter(df3['HikePct'].isNotNull() & (df3['HikePct'] > 0.2))
    .rollup(df3['Department'], df3['HikePct'])
    .agg(
        count(lit(1)).alias("TotalCount"),
        round(avg(df3['HikePct']), 2).alias("AvgHikePct"),
    )
    .sort(col('Department').asc(), col('HikePct').asc())
    .show()
)

In [None]:
# Total bonus across all employees: sum(HikePct * Salary) / 100, with a missing
# HikePct counted as 0.
# HikePct is used as-is. The original cast it to INT first, which truncates any
# fractional value to 0 - and other cells in this notebook filter HikePct against
# 0.0 and 0.2, so the values are fractional and the total collapsed to 0.
# NOTE(review): the division by 100 treats HikePct as a percentage; if it is stored
# as a fraction (e.g. 0.2 == 20%) the division should go too - confirm the units.
(
    df3
    .select(
        (
            sum(coalesce(col('HikePct'), lit(0)) * col('Salary')) / lit(100)
        ).alias('total_bonus')
    )
    .show()
)

In [None]:
# SQL-expression form of the total-bonus calculation.
# HikePct is no longer cast to INT: that cast truncates fractional values to 0, and
# other cells in this notebook filter HikePct against 0.0 and 0.2.
# NOTE(review): the division by 100 treats HikePct as a percentage - confirm the units.
(
    df3
    .selectExpr('sum((coalesce(HikePct, 0) * Salary) / 100) AS total_bonus')
    .show()
)

In [None]:
df3. \
    select('Department', 'Salary', 'HikePct'). \
    describe(). \
    show()

In [None]:
df3. \
    select('Department', 'Salary', 'HikePct'). \
    summary(). \
    show()

In [None]:
from pyspark.sql.functions import countDistinct

In [None]:
from pyspark.sql.functions import col

In [None]:
help(df3)

In [None]:
# `eomployeeDFSplit` is not defined anywhere in this notebook (NameError on a fresh run);
# show the help for the employees DataFrame that does exist.
help(employeeDF)

In [None]:
c = col('x')

In [None]:
help(c)

In [None]:
ordersDf = spark.read.json('/user/forgcpmak/data/data/retail_db_json/orders')

In [None]:
ordersDf.show(10, truncate = False)

In [None]:
ordersDf.printSchema()

In [None]:
ordersDf.count()

In [None]:
# Number of orders per order_date, in date order.
(
    ordersDf
    .groupBy(col('order_date'))
    .agg(count(lit(1)).alias('order_count_by_date'))
    .sort(col('order_date').asc())
    .show()
)

In [None]:
# Same count computed through cube() on order_date, shown untruncated in date order.
(
    ordersDf
    .cube(col('order_date'))
    .agg(count(lit(1)).alias('order_count_by_date'))
    .orderBy(col('order_date').asc())
    .show(truncate=False)
)

In [None]:
# Number of rows produced by the cube() aggregation on order_date.
(
    ordersDf
    .cube(col('order_date'))
    .agg(count(lit(1)).alias('order_count_by_date'))
    .sort(col('order_date').asc())
    .count()
)

In [None]:
# Order count per (year, yyyyMM month, date), sorted by the same three keys.
(
    ordersDf
    .groupBy(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(count(lit(1)).alias('order_count'))
    .orderBy('order_year', 'order_month', 'order_date')
    .show(truncate=False)
)

In [None]:
ordersDf.show()

In [None]:
# Per (year, month, date): total orders plus conditional counts of CLOSED and
# COMPLETE orders (each matching row contributes 1, every other row 0).
(
    ordersDf
    .groupBy(
        year('order_date').alias('order_year'),
        date_format('order_date', 'yyyyMM').alias('order_month'),
        'order_date',
    )
    .agg(
        count(lit(1)).alias('order_count'),
        sum(when(col('order_status') == 'CLOSED', 1).otherwise(lit(0))).alias("ClosedOrderCount"),
        sum(when(col('order_status') == 'COMPLETE', 1).otherwise(lit(0))).alias("CompletedOrderCount"),
    )
    .show()
)

In [None]:
order_items_df = spark.read.json('/user/forgcpmak/data/data/retail_db_json/order_items')

In [None]:
customers = spark.read.json('/user/forgcpmak/data/data/retail_db_json/customers')

In [None]:
customers.count()

In [None]:
# Customers with no orders: left join customers to orders and keep the rows where
# the order side is missing.
(
    customers.alias('c')
    .join(
        ordersDf.alias('o'),
        on=customers['customer_id'] == ordersDf['order_customer_id'],
        how='left',
    )
    .filter('o.order_id IS NULL')
    .selectExpr('c.customer_id', 'c.customer_email', 'o.*')
    .show()
)

In [None]:
# Order count per customer via a left join (null-safe key comparison); a missing
# order_id contributes 0, any other row 1. Only customers whose count is 0 are shown.
(
    customers.alias('c')
    .join(
        ordersDf.alias('o'),
        on=customers['customer_id'].eqNullSafe(ordersDf['order_customer_id']),
        how='left',
    )
    .select(customers['customer_id'], ordersDf['order_id'])
    .groupBy(customers['customer_id'])
    .agg(sum(when(col('order_id').isNull(), 0).otherwise(lit(1))).alias('order_count'))
    .orderBy(col('order_count').desc())
    .filter(col('order_count') == 0)
    .show()
)
    

In [None]:
Get the number of orders placed by each customer in the year 2013. If a customer has not placed any orders, report the order count for that customer as 0.



In [None]:
customers.printSchema()

In [None]:
# Join orders to their items using eqNullSafe, the null-safe form of column equality,
# then list id, date, status and item subtotal for CLOSED orders.
(
    ordersDf
    .join(
        order_items_df,
        on=ordersDf['order_id'].eqNullSafe(order_items_df['order_item_order_id']),
        how='inner',
    )
    .select(
        ordersDf['order_id'],
        ordersDf['order_date'],
        ordersDf['order_status'],
        order_items_df['order_item_subtotal'],
    )
    .filter(ordersDf['order_status'] == 'CLOSED')
    .show()
)

In [None]:
# Inner join of orders and order items on the order id, keeping every column from
# both sides, restricted to CLOSED orders.
(
    ordersDf
    .join(
        order_items_df,
        on=ordersDf['order_id'] == order_items_df['order_item_order_id'],
        how='inner',
    )
    .select(ordersDf['*'], order_items_df['*'])
    .filter(ordersDf['order_status'] == 'CLOSED')
    .show()
)

In [None]:
# How many order-item rows reference each product, most frequent first.
(
    order_items_df
    .groupBy(order_items_df['order_item_product_id'])
    .agg(count(lit(1)).alias("NumberOfTimeOrdered"))
    .orderBy(col('NumberOfTimeOrdered').desc())
    .show()
)


In [None]:
ordersDf. \
    filter("order_status IN ('COMPLETE', 'CLOSED')")

In [None]:
ordersDf.\
         filter( ordersDf['order_status'].isin('COMPLETE', 'CLOSED') )

In [None]:
# Column-expression form of "order_status IN ('COMPLETE', 'CLOSED')".
# Python has no `||` operator; Column conditions are combined with `|`, and the
# comparison must be made on the column (col(...)), not on the bare string literal.
ordersDf.filter(
    (col('order_status') == 'COMPLETE') | (col('order_status') == 'CLOSED')
)

In [None]:
# Schema-merging demo: two parquet partition directories with different columns are
# written and then read back as a single table.

# Row is imported here because this cell uses it before the notebook's later
# `from pyspark.sql import Row` cell has run.
from pyspark.sql import Row

sc = spark.sparkContext

squaresDF = spark.createDataFrame(
    sc.parallelize(range(1, 6)).map(lambda i: Row(single=i, double=i ** 2))
)
# mode("overwrite") keeps the cell re-runnable; the default write mode raises when
# the target path already exists.
squaresDF.write.mode("overwrite").parquet("data/test_table/key=1")

# Create another DataFrame in a new partition directory,
# adding a new column and dropping an existing column
cubesDF = spark.createDataFrame(
    sc.parallelize(range(6, 11)).map(lambda i: Row(single=i, triple=i ** 3))
)
cubesDF.write.mode("overwrite").parquet("data/test_table/key=2")

# Read the partitioned table
mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table")
mergedDF.printSchema()


In [None]:
sc = spark.sparkContext
from pyspark.sql import Row

In [None]:
squaresDF = spark.createDataFrame(sc.parallelize(range(1, 6))
                                  .map(lambda i: Row(single=i, double=i ** 2)))

In [None]:
squaresDF.write.mode('overwrite').save("/user/forgcpmak/data/test_table/key=1",format = 'parquet')

In [None]:
cubesDF = spark.createDataFrame(sc.parallelize(range(6, 11))
                                .map(lambda i: Row(single=i, triple=i ** 3)))

In [None]:
squaresDF.show()

In [None]:
cubesDF.write.mode('overwrite').parquet("/user/forgcpmak/data/test_table/key=2")

In [None]:
mergedDF = spark.read.option("mergeSchema", "true").parquet("/user/forgcpmak/data/test_table")

In [None]:
mergedDF.printSchema()

In [None]:
mergedDF.show()

In [None]:
# Source file and format for the sales data.
file_location = "/user/forgcpmak/data/sales_info.csv"
file_type = "csv"

# Reader settings; these are CSV options.
infer_schema = "True"
first_row_is_header = "True"
delimiter = ","

df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

In [None]:
df.printSchema()

In [None]:
# Total of the Sales column per Person.
(
    df
    .groupBy(df['Person'])
    .agg(sum('Sales').alias('SumSalesByCompany'))
    .show()
)

In [None]:
# Within each Person, rank rows by Company in descending order and keep rank 1.
windowSpec = Window.partitionBy("Person").orderBy(col("Company").desc())
(
    df
    .withColumn("rank", rank().over(windowSpec))
    .filter(col('rank') == 1)
    .show()
)

In [None]:
# Source file and format for the Apple stock data.
file_location = "/user/forgcpmak/data/appl_stock.csv"
file_type = "csv"

# Reader settings; these are CSV options.
infer_schema = "True"
first_row_is_header = "True"
delimiter = ","

df_appl_stock = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

In [None]:
df_appl_stock.show()

In [None]:
# Summary statistics with 'open' formatted to two decimals.
# The value is cast to double rather than int: an int cast drops the fractional part
# before format_number runs, so every figure would be shown as "<n>.00".
df_appl_stock.describe().select(
    'summary',
    format_number(col('open').cast('double'), 2).alias('Open'),
    'high',
).show()

In [None]:
from pyspark.sql.functions import format_number

In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType, DecimalType
from decimal import Decimal

In [None]:
# Explicit schema for the Walmart stock file; every field is nullable.
walmart_stock_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("Date", DateType()),
        ("Open", DoubleType()),
        ("High", DoubleType()),
        ("Low", DoubleType()),
        ("Close", DoubleType()),
        ("Volume", LongType()),
        ("AdjClose", DoubleType()),
    ]
])

In [None]:

# Source file and format for the Walmart stock data.
file_location = "/user/forgcpmak/data/walmart_stock.csv"
file_type = "csv"

# Reader settings; inference is switched off because an explicit schema is supplied.
infer_schema = "False"
first_row_is_header = "True"
delimiter = ","

walmart_stock_df = (
    spark.read.format(file_type)
    .schema(walmart_stock_schema)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

In [None]:
walmart_stock_df.printSchema()
walmart_stock_df.show()

In [None]:
# Take the two rows with the highest 'High' price once, instead of re-running the
# sort for every use.
top_two_by_high = walmart_stock_df.orderBy(col('High').desc()).head(2)

# The same rows as an RDD. This conversion is not required for building a DataFrame;
# the variable is kept so the name stays available.
rddWalmartHigh = spark.sparkContext.parallelize(top_two_by_high)
rddWalmartHigh.take(1)

# createDataFrame accepts the list of Rows directly, together with the schema.
walmartHighDf = spark.createDataFrame(top_two_by_high, walmart_stock_schema)
walmartHighDf.show()

In [None]:
# Reuse the schema of an existing DataFrame when reading a file.
# Option 1: rebuild the schema from its JSON representation.
existingDFSchema = StructType.fromJson (walmartHighDf.schema.jsonValue())

# Option 2: take the schema object directly (this assignment replaces the one above).
existingDFSchema = walmartHighDf.schema
# Use this schema to read the file into a new DataFrame; file_type, infer_schema,
# first_row_is_header, delimiter and file_location come from an earlier cell.
walmart_stock_read_from_df1 = spark.read.format(file_type) \
  .schema(existingDFSchema)\
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

In [None]:
walmart_stock_read_from_df1.printSchema()

In [None]:
# json is imported here because this cell runs before the notebook's later
# `import json` cell; without it the cell raises NameError on a fresh kernel.
import json

# Write the schema of walmartHighDf to a JSON file
with open("/home/forgcpmak/schema.json", "w") as f:
    json.dump(walmartHighDf.schema.jsonValue(), f)

# Read the schema back and rebuild the schema object from it
with open("/home/forgcpmak/schema.json") as f:
    new_schema = StructType.fromJson(json.load(f))
    print(new_schema.simpleString())
    display(new_schema)

In [None]:
help(open)

In [None]:
# Highest 'Open' value per Date, formatted to two decimals.
(
    walmart_stock_df
    .groupBy(col('Date'))
    .agg(format_number(max(col('Open')), 2).alias('MaxOpenPrice'))
    .show()
)

In [None]:
# Rank rows by 'Open' (highest first) within each Date and keep the rank-1 row(s).
# NOTE(review): the window is named appl_stock_window but is applied to
# walmart_stock_df - presumably a leftover name from the Apple stock cells.
appl_stock_window  = Window.partitionBy("Date").orderBy(col("Open").desc())
walmart_stock_df.\
  select ('Date','Open').\
  withColumn("rank",rank().over(appl_stock_window)).\
  filter(col('rank') == 1 ).\
  show()

In [None]:
max_open_price= walmart_stock_df.agg({'open':'max'})

In [None]:
max_open_price = max_open_price.withColumnRenamed('max(open)','MaxOpen')

In [None]:
import json

# Two-field example schema (both fields nullable).
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])

# Serialise the schema to a JSON file ...
with open("/home/forgcpmak/schema.json", "w") as f:
    f.write(json.dumps(schema.jsonValue()))

# ... then load it back and rebuild the schema object.
with open("/home/forgcpmak/schema.json") as f:
    new_schema = StructType.fromJson(json.loads(f.read()))
    print(new_schema.simpleString())

In [None]:
display(new_schema)
   

In [None]:
# Read the schema
with open("/home/forgcpmak/schema.json") as f:
    new_schema = StructType.fromJson(json.load(f))
    print(new_schema.simpleString())

In [None]:
# Build a small DataFrame from an in-memory list of tuples.
data = [('Category A', 100, "This is category A"),
        ('Category B', 120, "This is category B"),
        ('Category C', 150, "This is category C")]

# Explicit schema for the data.
# NOTE(review): this schema is not passed to createDataFrame below; the plain
# column-name list is used instead (and names the third column "Desc").
schema = StructType([
    StructField('Category', StringType(), True),
    StructField('Count', IntegerType(), True),
    StructField('Description', StringType(), True)
])

columns = ["Category", "Count" , "Desc" ]
# Convert list to RDD
# NOTE(review): `rdd` is not used afterwards in this cell; the DataFrame is built
# straight from `data`.
rdd = spark.sparkContext.parallelize(data)

# Create the DataFrame from the list and the column names, then show its schema and rows
df = spark.createDataFrame(data,columns)
print(df.schema)
df.show()

In [None]:
## Reading a fixed-width file
# Columns are cut out of each line using predefined positions.

# Step 1 - describe every column as (name, start position, length, data type).
schema = [
    ("id", 1, 5, "int"),
    ("ssn", 6, 10, "string"),
    ("name", 16, 4, "string"),
]

# Step 2 - read the source file as plain text lines.
df = spark.read.text("/user/forgcpmak/data/personInfo.txt")
df.show()

# Step 3 - starting from the raw DataFrame, add one column per schema entry by
# slicing the 'value' column and casting the slice to the declared type.
df2 = df
for col_name, start_pos, width, dtype in schema:
    df2 = df2.withColumn(col_name, df2.value.substr(start_pos, width).cast(dtype))
    # Show the intermediate result so the growing set of columns is visible.
    df2.show()

# The raw line is no longer needed once every column has been extracted.
df2 = df2.drop('value')
df2.show()


In [7]:
## Below is code of Machine learning linear regressions

# Read file. : /user/forgcpmak/data/josportillacource/Spark_for_Machine_Learning/Linear_Regression/cruise_ship_info.csv

# Source file and format for the cruise ship data.
file_location = "/user/forgcpmak/data/josportillacource/Spark_for_Machine_Learning/Linear_Regression/cruise_ship_info.csv"
file_type = "csv"

# Reader settings; these are CSV options.
infer_schema = "True"
first_row_is_header = "True"
delimiter = ","

cruise_ship_info_df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

In [8]:
cruise_ship_info_df.count()

158

In [9]:
cruise_ship_info_df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [10]:
cruise_ship_info_df.show(5, truncate = False)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|Ship_name  |Cruise_line|Age|Tonnage           |passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|Journey    |Azamara    |6  |30.276999999999997|6.94      |5.94  |3.55  |42.64            |3.55|
|Quest      |Azamara    |6  |30.276999999999997|6.94      |5.94  |3.55  |42.64            |3.55|
|Celebration|Carnival   |26 |47.262            |14.86     |7.22  |7.43  |31.8             |6.7 |
|Conquest   |Carnival   |11 |110.0             |29.74     |9.53  |14.88 |36.99            |19.1|
|Destiny    |Carnival   |17 |101.353           |26.42     |8.92  |13.21 |38.36            |10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



In [45]:
cruise_ship_info_df\
           .groupBy('Cruise_line')\
           .agg(
             count(lit(1)).alias('ShipCountByrCruiseLine')
             )\
            .orderBy(col('ShipCountByrCruiseLine').desc())\
            .show(5)
           
                 

+----------------+----------------------+
|     Cruise_line|ShipCountByrCruiseLine|
+----------------+----------------------+
| Royal_Caribbean|                    23|
|        Carnival|                    22|
|        Princess|                    17|
|Holland_American|                    14|
|       Norwegian|                    13|
+----------------+----------------------+
only showing top 5 rows



In [16]:
from pyspark.ml.feature import StringIndexer
## Required to install numpy

In [18]:
indexer = StringIndexer(inputCol = 'Cruise_line',outputCol = 'cruise_cat')
indexed = indexer.fit(cruise_ship_info_df).transform(cruise_ship_info_df)
indexed.head(5)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),
 Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, cruise_cat=1.0),
 Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1, cruise_cat=1.0),
 Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0, cruise_cat=1.0)]

In [21]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [22]:
indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_cat']

In [23]:
# Numeric input columns (including the indexed cruise line) packed into one
# 'features' vector; 'crew' is left out because it is the regression label.
feature_columns = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'cruise_cat',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [24]:
output = assembler.transform(indexed)

In [27]:
output.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_cat',
 'features']

In [26]:
output.select('features','crew').show(truncate = False)

+--------------------------------------------------+----+
|features                                          |crew|
+--------------------------------------------------+----+
|[6.0,30.276999999999997,6.94,5.94,3.55,42.64,16.0]|3.55|
|[6.0,30.276999999999997,6.94,5.94,3.55,42.64,16.0]|3.55|
|[26.0,47.262,14.86,7.22,7.43,31.8,1.0]            |6.7 |
|[11.0,110.0,29.74,9.53,14.88,36.99,1.0]           |19.1|
|[17.0,101.353,26.42,8.92,13.21,38.36,1.0]         |10.0|
|[22.0,70.367,20.52,8.55,10.2,34.29,1.0]           |9.2 |
|[15.0,70.367,20.52,8.55,10.2,34.29,1.0]           |9.2 |
|[23.0,70.367,20.56,8.55,10.22,34.23,1.0]          |9.2 |
|[19.0,70.367,20.52,8.55,10.2,34.29,1.0]           |9.2 |
|[6.0,110.23899999999999,37.0,9.51,14.87,29.79,1.0]|11.5|
|[10.0,110.0,29.74,9.51,14.87,36.99,1.0]           |11.6|
|[28.0,46.052,14.52,7.27,7.26,31.72,1.0]           |6.6 |
|[18.0,70.367,20.52,8.55,10.2,34.29,1.0]           |9.2 |
|[17.0,70.367,20.52,8.55,10.2,34.29,1.0]           |9.2 |
|[11.0,86.0,21

In [28]:
final_data = output.select('features','crew')

In [30]:
# 70/30 train/test split. A fixed seed is passed so the split - and therefore the
# counts and model metrics in the following cells - does not change on every run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [32]:
train_data.count()

120

In [33]:
test_data.count()

38

In [34]:
## build regression model
from pyspark.ml.regression import LinearRegression

In [36]:
ship_lr = LinearRegression(labelCol ='crew')

In [37]:
trained_ship_model = ship_lr.fit(train_data)

In [38]:
ship_results = trained_ship_model.evaluate(test_data)

In [39]:
ship_results.rootMeanSquaredError

0.65589385972916

In [40]:
ship_results.r2

0.9632693662297822

In [41]:
ship_results.meanAbsoluteError

0.49946735640322437

In [42]:
from pyspark.sql.functions import corr

In [43]:
## Find the correlation - check whether crew size is related to passenger count
cruise_ship_info_df.select(corr('crew','passengers'))

"corr(crew, passengers)"
0.9152341306065384


In [None]:
# Linear regression model steps
#Step :1 