<a href="https://colab.research.google.com/github/sabaripkumar/digipen/blob/main/CET3052_Spark_Simple_ex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sales Analysis

This notebook takes a sales dataset and analyzes it by country. It also serves as a short demo of how to use PySpark.

In [None]:
!pip install -q pyspark

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


## DataFrame and Frequency Counter

In [None]:
!wget https://raw.githubusercontent.com/annesjyu/dataengr2023/main/Sales.csv

--2024-08-07 13:24:55--  https://raw.githubusercontent.com/annesjyu/dataengr2023/main/Sales.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65773 (64K) [text/plain]
Saving to: ‘Sales.csv.1’


2024-08-07 13:24:55 (4.35 MB/s) - ‘Sales.csv.1’ saved [65773/65773]



In [None]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used throughout the sales analysis.
spark = (
    SparkSession.builder
    .appName("Sales Analysis")
    .getOrCreate()
)

# Notebook display settings: turn on eager evaluation and set its per-cell
# truncation width to 0 (intended to disable truncation -- confirm against the
# Spark configuration docs). These options govern the eager-evaluation display
# only; explicit .show() calls use their own `truncate` argument.
for option, value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.truncate", 0),
):
    spark.conf.set(option, value)

In [None]:
# Load Sales.csv into a DataFrame: the first line provides the column names
# and Spark infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Sales.csv")
)

In [None]:
# Preview the loaded sales data as a text table (show() prints at most its
# default number of rows, 20).
df.show()

+----------------+--------+-----+------------+---------------+------------+----------------+--------------+----------------+---------------+-----------+----------+
|Transaction_date| Product|Price|Payment_Type|           Name|        City|           State|       Country| Account_Created|     Last_Login|   Latitude| Longitude|
+----------------+--------+-----+------------+---------------+------------+----------------+--------------+----------------+---------------+-----------+----------+
|   1/2/2009 6:17|Product1| 1200|  Mastercard|       carolina|    Basildon|         England|United Kingdom|   1/2/2009 6:00|  1/2/2009 6:08|       51.5|-1.1166667|
|  1/3/2009 14:44|Product1| 1200|        Visa|          Gouya|      Echuca|        Victoria|     Australia| 9/25/2005 21:13| 1/3/2009 14:22|-36.1333333|    144.75|
|  1/4/2009 13:17|Product1| 1200|  Mastercard|Renee Elisabeth|    Tel Aviv|        Tel Aviv|        Israel|  1/4/2009 13:03| 1/4/2009 22:10| 32.0666667|34.7666667|
|  1/4/2009 14:1

In [None]:
# Descriptive statistics (count, mean, stddev, ...) for every column of the
# sales data, printed as a table.
sales_summary = df.summary()
sales_summary.show()

+-------+----------------+--------+------------------+------------+-----+------+------+--------------+---------------+---------------+------------------+-----------------+
|summary|Transaction_date| Product|             Price|Payment_Type| Name|  City| State|       Country|Account_Created|     Last_Login|          Latitude|        Longitude|
+-------+----------------+--------+------------------+------------+-----+------+------+--------------+---------------+---------------+------------------+-----------------+
|  count|             535|     535|               535|         535|  535|   535|   534|           535|            535|            535|               535|              535|
|   mean|            NULL|    NULL|1645.7943925233644|        NULL| NULL|  NULL|  NULL|          NULL|           NULL|           NULL| 40.31821572149535|3.421527447476634|
| stddev|            NULL|    NULL| 1093.107481054373|        NULL| NULL|  NULL|  NULL|          NULL|           NULL|           NULL|25.992

In [None]:
# Inspect which class spark.read.csv returned for the loaded data.
type(df)

In [None]:
# Count how many sales rows belong to each country.
country_frequency = df.groupBy("Country").count()

# Show the ten most frequent countries. Ordering by count (ties broken by
# country name) makes "top 10" meaningful and stable; without it show() prints
# whichever ten groups Spark happens to return first.
country_frequency.orderBy(["count", "Country"], ascending=[False, True]).show(n=10)

+-----------+-----+
|    Country|count|
+-----------+-----+
|     Russia|    1|
|     Sweden|   13|
|     Jersey|    1|
|Philippines|    2|
|   Malaysia|    1|
|     Turkey|    6|
|    Germany|   25|
|     France|   27|
|     Greece|    1|
|  Argentina|    1|
+-----------+-----+
only showing top 10 rows



# Schema and DataFrame

In [None]:
from pyspark.sql import SparkSession

# Describe the columns with a DDL string: name and Spark SQL type per column.
schema = "Id INT, First STRING, Last STRING, Url STRING, Published STRING, Hits INT, Campaigns ARRAY<STRING>"

# Static sample data: one inner list per row, values in the same order as the
# columns declared in `schema`.
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter","LinkedIn"]],
        [2, "Brooke","Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter","LinkedIn"]],
        [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web","twitter", "FB", "LinkedIn"]],
        [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
        [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web","twitter", "FB", "LinkedIn"]],
        [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]]]

# Get a SparkSession. getOrCreate() hands back the session that is already
# running when one exists, so this line only creates a new session if the cell
# is run on its own.
spark = (SparkSession.builder.appName("schema_example").getOrCreate())

# Create a DataFrame from the rows, applying the schema defined above
blogs_df = spark.createDataFrame(data, schema)

# Show the DataFrame; its rows should match the `data` list
blogs_df.show()

# Print the schema Spark uses for the DataFrame. printSchema() writes the tree
# itself and returns None, so it is called directly -- wrapping it in print()
# would add a stray "None" line after the schema.
blogs_df.printSchema()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (

In [None]:
# Print the class of the object returned by spark.createDataFrame.
print(type(blogs_df))

<class 'pyspark.sql.dataframe.DataFrame'>


## Rows

In [None]:
from pyspark.sql import Row

In [None]:
# Build two Row objects positionally; the column names are supplied when the
# DataFrame is created from them.
row1 = Row("Matei Zaharia", "CA")
row2 = Row("Reynold Xin", "CA")
rows = [row1, row2]

print(type(row1))

# Turn the list of Rows into a DataFrame with explicit column names.
authors_df = spark.createDataFrame(rows, ["Authors", "State"])

print(type(authors_df))

# show() prints the table itself and returns None, so it is called directly --
# wrapping it in print() appended a stray "None" line to the output.
authors_df.show()

<class 'pyspark.sql.types.Row'>
<class 'pyspark.sql.dataframe.DataFrame'>
+-------------+-----+
|      Authors|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+

None
