In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark


In [None]:
import os

# Point the process at the JDK and at the unpacked Spark distribution.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.1.1-bin-hadoop3.2",
)



In [None]:
# Initialise findspark before the first `from pyspark ...` import in this notebook.
# NOTE(review): init() is called without a path, so it presumably relies on the
# SPARK_HOME environment variable being set beforehand — confirm.
import findspark
findspark.init()


In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) a session whose master is local mode, "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# Leave the session as the last expression so the notebook displays it.
spark

In [None]:
# Build a small demo DataFrame from Row objects.
from pyspark.sql import Row, SparkSession

# Obtain the session, requesting this cell's application name.
spark = (
    SparkSession.builder
    .appName("Simple PySpark DataFrame")
    .getOrCreate()
)

# One Row per person, generated from (id, name, age) tuples.
data = [
    Row(id=person_id, name=person_name, age=person_age)
    for person_id, person_name, person_age in [
        (1, 'Harsh', 30),
        (2, 'Yash', 25),
        (3, 'Janit', 35),
    ]
]

# Turn the rows into a DataFrame and display it.
df = spark.createDataFrame(data)
df.show()


+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Harsh| 30|
|  2| Yash| 25|
|  3|Janit| 35|
+---+-----+---+



In [None]:
from pyspark.sql.functions import col

# Replace the "age" column with its string representation.
age_as_text = col("age").astype("string")
df = df.withColumn("age", age_as_text)


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Read CSV").getOrCreate()

# The first line supplies the column names; schema inference is switched off.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", False)
    .csv("/content/sample_data/california_housing_test.csv")
)

In [None]:
# Display the first 20 rows of the DataFrame.
df.show(n=20)

+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+
|  longitude| latitude|housing_median_age|total_rooms|total_bedrooms| population| households|median_income|median_house_value|
+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+
|-122.050000|37.370000|         27.000000|3885.000000|    661.000000|1537.000000| 606.000000|     6.608500|     344700.000000|
|-118.300000|34.260000|         43.000000|1510.000000|    310.000000| 809.000000| 277.000000|     3.599000|     176500.000000|
|-117.810000|33.780000|         27.000000|3589.000000|    507.000000|1484.000000| 495.000000|     5.793400|     270500.000000|
|-118.360000|33.820000|         28.000000|  67.000000|     15.000000|  49.000000|  11.000000|     6.135900|     330000.000000|
|-119.670000|36.330000|         19.000000|1241.000000|    244.000000| 850.000000| 237.000000|     2.937500|    

In [None]:
# Print the column names and types of `df`; the recorded output lists every column as string.
df.printSchema()

root
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- housing_median_age: string (nullable = true)
 |-- total_rooms: string (nullable = true)
 |-- total_bedrooms: string (nullable = true)
 |-- population: string (nullable = true)
 |-- households: string (nullable = true)
 |-- median_income: string (nullable = true)
 |-- median_house_value: string (nullable = true)



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain the session, requesting this cell's application name.
spark = SparkSession.builder.appName("CreateDataFrameExample").getOrCreate()

# Explicit schema, added field by field: integer id, string name, integer age (all nullable).
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
)

# Rows as plain tuples, in schema order.
data = [
    (1, "Harsh", 30),
    (2, "Yash", 25),
    (3, "Janit", 35),
]

# Build the DataFrame against the explicit schema and display it.
df = spark.createDataFrame(data, schema=schema)
df.show()


+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Harsh| 30|
|  2| Yash| 25|
|  3|Janit| 35|
+---+-----+---+



In [None]:
# Print the schema of `df`; the recorded output shows id and age as integer and name as string.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [None]:
from pyspark.sql.functions import col

# Overwrite "age" with the same values cast to string.
df = df.withColumn(
    "age",
    col("age").astype("string"),
)


In [None]:
# Print the schema again; the recorded output now shows age as string.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)



In [None]:
# Distribute a small Python list as an RDD, then collect it back and print it.
sc = spark.sparkContext
data = list(range(1, 6))
rdd = sc.parallelize(data)

print(rdd.collect())

[1, 2, 3, 4, 5]


In [None]:
# Load the Parquet file into a DataFrame.
df2 = spark.read.format("parquet").load("/content/userdata1.parquet")

In [None]:
# Display the first 5 rows of the Parquet-backed DataFrame.
df2.show(n=5)

+-------------------+---+----------+---------+--------------------+------+--------------+----------------+------------+---------+---------+--------------------+--------+
|  registration_dttm| id|first_name|last_name|               email|gender|    ip_address|              cc|     country|birthdate|   salary|               title|comments|
+-------------------+---+----------+---------+--------------------+------+--------------+----------------+------------+---------+---------+--------------------+--------+
|2016-02-03 07:55:29|  1|    Amanda|   Jordan|    ajordan0@com.com|Female|   1.197.201.2|6759521864920116|   Indonesia| 3/8/1971| 49756.53|    Internal Auditor|   1E+02|
|2016-02-03 17:04:03|  2|    Albert|  Freeman|     afreeman1@is.gd|  Male|218.111.175.34|                |      Canada|1/16/1968|150280.17|       Accountant IV|        |
|2016-02-03 01:09:31|  3|    Evelyn|   Morgan|emorgan2@altervis...|Female|  7.161.136.94|6767119071901597|      Russia| 2/1/1960|144972.51| Structural

In [None]:
# Print the column names and types of `df2`.
df2.printSchema()

root
 |-- registration_dttm: timestamp (nullable = true)
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- ip_address: string (nullable = true)
 |-- cc: string (nullable = true)
 |-- country: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- title: string (nullable = true)
 |-- comments: string (nullable = true)



In [None]:
# Read the JSON file with the reader's default settings (no multiline option is set here).
# The resulting schema includes a _corrupt_record column holding input that could not be parsed.
df3=spark.read.json("/content/sample_data/anscombe.json")


In [None]:
!cat /content/example_1.json

cat: /content/example_1.json: No such file or directory


In [None]:
# Print the schema of `df3`; the recorded output includes the extra _corrupt_record column.
df3.printSchema()

root
 |-- Series: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [None]:
# Keep only the rows whose _corrupt_record is null.
is_clean_row = df3["_corrupt_record"].isNull()
df3 = df3.where(is_clean_row)

In [None]:
#read the json file by taking the multiline=true

df4=spark.read.json("/content/sample_data/anscombe.json")

!cat /content/sample_data/anscombe.json

[
  {"Series":"I", "X":10.0, "Y":8.04},
  {"Series":"I", "X":8.0, "Y":6.95},
  {"Series":"I", "X":13.0, "Y":7.58},
  {"Series":"I", "X":9.0, "Y":8.81},
  {"Series":"I", "X":11.0, "Y":8.33},
  {"Series":"I", "X":14.0, "Y":9.96},
  {"Series":"I", "X":6.0, "Y":7.24},
  {"Series":"I", "X":4.0, "Y":4.26},
  {"Series":"I", "X":12.0, "Y":10.84},
  {"Series":"I", "X":7.0, "Y":4.81},
  {"Series":"I", "X":5.0, "Y":5.68},

  {"Series":"II", "X":10.0, "Y":9.14},
  {"Series":"II", "X":8.0, "Y":8.14},
  {"Series":"II", "X":13.0, "Y":8.74},
  {"Series":"II", "X":9.0, "Y":8.77},
  {"Series":"II", "X":11.0, "Y":9.26},
  {"Series":"II", "X":14.0, "Y":8.10},
  {"Series":"II", "X":6.0, "Y":6.13},
  {"Series":"II", "X":4.0, "Y":3.10},
  {"Series":"II", "X":12.0, "Y":9.13},
  {"Series":"II", "X":7.0, "Y":7.26},
  {"Series":"II", "X":5.0, "Y":4.74},

  {"Series":"III", "X":10.0, "Y":7.46},
  {"Series":"III", "X":8.0, "Y":6.77},
  {"Series":"III", "X":13.0, "Y":12.74},
  {"Series":"III", "X":9.0, "Y":7.11},
 

As the next cell shows, when multiline=false (the default), PySpark treats every line of the file as a separate JSON record. Lines that are not valid JSON objects on their own (for example, the opening `[` of the array) cannot be parsed, so their raw text is stored in a column named _corrupt_record while the other columns are null. This is why it's important to choose the appropriate value for the multiline option based on the structure of your JSON data.

In [None]:
# Line-by-line JSON parsing (multiline disabled), then display the result.
df5 = spark.read.json("/content/sample_data/anscombe.json", multiLine=False)
df5.show()

+------+----+-----+---------------+
|Series|   X|    Y|_corrupt_record|
+------+----+-----+---------------+
|  null|null| null|              [|
|     I|10.0| 8.04|           null|
|     I| 8.0| 6.95|           null|
|     I|13.0| 7.58|           null|
|     I| 9.0| 8.81|           null|
|     I|11.0| 8.33|           null|
|     I|14.0| 9.96|           null|
|     I| 6.0| 7.24|           null|
|     I| 4.0| 4.26|           null|
|     I|12.0|10.84|           null|
|     I| 7.0| 4.81|           null|
|     I| 5.0| 5.68|           null|
|    II|10.0| 9.14|           null|
|    II| 8.0| 8.14|           null|
|    II|13.0| 8.74|           null|
|    II| 9.0| 8.77|           null|
|    II|11.0| 9.26|           null|
|    II|14.0|  8.1|           null|
|    II| 6.0| 6.13|           null|
|    II| 4.0|  3.1|           null|
+------+----+-----+---------------+
only showing top 20 rows



In [None]:
# Whole-file JSON parsing (multiline enabled), then display the result.
df6 = spark.read.json("/content/sample_data/anscombe.json", multiLine=True)
df6.show()

+------+----+-----+
|Series|   X|    Y|
+------+----+-----+
|     I|10.0| 8.04|
|     I| 8.0| 6.95|
|     I|13.0| 7.58|
|     I| 9.0| 8.81|
|     I|11.0| 8.33|
|     I|14.0| 9.96|
|     I| 6.0| 7.24|
|     I| 4.0| 4.26|
|     I|12.0|10.84|
|     I| 7.0| 4.81|
|     I| 5.0| 5.68|
|    II|10.0| 9.14|
|    II| 8.0| 8.14|
|    II|13.0| 8.74|
|    II| 9.0| 8.77|
|    II|11.0| 9.26|
|    II|14.0|  8.1|
|    II| 6.0| 6.13|
|    II| 4.0|  3.1|
|    II|12.0| 9.13|
+------+----+-----+
only showing top 20 rows



In [None]:
# Print the schema of `df5` (read with multiline=false); it includes _corrupt_record.
df5.printSchema()

root
 |-- Series: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [None]:
# Show the column names of `df5` as a Python list.
df5.columns

['Series', 'X', 'Y', '_corrupt_record']

`truncate`: when `truncate` is set to True (the default if not specified), strings longer than 20 characters are shortened in the displayed output and `...` is appended to mark the cut. With `truncate=False`, the full value of every cell is shown.

In [None]:
# Display 10 rows with long strings truncated.
df5.show(n=10, truncate=True)

+------+----+-----+---------------+
|Series|   X|    Y|_corrupt_record|
+------+----+-----+---------------+
|  null|null| null|              [|
|     I|10.0| 8.04|           null|
|     I| 8.0| 6.95|           null|
|     I|13.0| 7.58|           null|
|     I| 9.0| 8.81|           null|
|     I|11.0| 8.33|           null|
|     I|14.0| 9.96|           null|
|     I| 6.0| 7.24|           null|
|     I| 4.0| 4.26|           null|
|     I|12.0|10.84|           null|
+------+----+-----+---------------+
only showing top 10 rows



In [None]:
# Display 10 rows with full, untruncated cell values.
df5.show(n=10, truncate=False)

+------+----+-----+---------------+
|Series|X   |Y    |_corrupt_record|
+------+----+-----+---------------+
|null  |null|null |[              |
|I     |10.0|8.04 |null           |
|I     |8.0 |6.95 |null           |
|I     |13.0|7.58 |null           |
|I     |9.0 |8.81 |null           |
|I     |11.0|8.33 |null           |
|I     |14.0|9.96 |null           |
|I     |6.0 |7.24 |null           |
|I     |4.0 |4.26 |null           |
|I     |12.0|10.84|null           |
+------+----+-----+---------------+
only showing top 10 rows



In [None]:
# Display the first 20 rows of the multiline-parsed DataFrame.
df6.show(n=20)

+------+----+-----+
|Series|   X|    Y|
+------+----+-----+
|     I|10.0| 8.04|
|     I| 8.0| 6.95|
|     I|13.0| 7.58|
|     I| 9.0| 8.81|
|     I|11.0| 8.33|
|     I|14.0| 9.96|
|     I| 6.0| 7.24|
|     I| 4.0| 4.26|
|     I|12.0|10.84|
|     I| 7.0| 4.81|
|     I| 5.0| 5.68|
|    II|10.0| 9.14|
|    II| 8.0| 8.14|
|    II|13.0| 8.74|
|    II| 9.0| 8.77|
|    II|11.0| 9.26|
|    II|14.0|  8.1|
|    II| 6.0| 6.13|
|    II| 4.0|  3.1|
|    II|12.0| 9.13|
+------+----+-----+
only showing top 20 rows



In [None]:
# New DataFrame with duplicate rows removed (all columns are compared).
df_no_duplicates = df6.drop_duplicates()

In [None]:
# Display the de-duplicated DataFrame (show() prints up to 20 rows by default).
df_no_duplicates.show()

+------+----+-----+
|Series|   X|    Y|
+------+----+-----+
|     I| 6.0| 7.24|
|    II|14.0|  8.1|
|     I| 8.0| 6.95|
|    IV| 8.0| 5.76|
|    IV| 8.0| 7.71|
|    IV| 8.0| 6.89|
|   III| 8.0| 6.77|
|   III|13.0|12.74|
|    II| 5.0| 4.74|
|    II| 8.0| 8.14|
|     I|12.0|10.84|
|   III|10.0| 7.46|
|    II|10.0| 9.14|
|   III|14.0| 8.84|
|    II| 7.0| 7.26|
|     I|13.0| 7.58|
|    IV| 8.0| 7.04|
|   III| 6.0| 6.08|
|     I| 5.0| 5.68|
|    II| 9.0| 8.77|
+------+----+-----+
only showing top 20 rows



In [None]:
# Order the de-duplicated rows by the "Series" column.
sorted_df = df_no_duplicates.sort("Series")

# Display the ordered result.
sorted_df.show()

+------+----+-----+
|Series|   X|    Y|
+------+----+-----+
|     I| 8.0| 6.95|
|     I|10.0| 8.04|
|     I|12.0|10.84|
|     I|11.0| 8.33|
|     I| 9.0| 8.81|
|     I| 4.0| 4.26|
|     I| 6.0| 7.24|
|     I|13.0| 7.58|
|     I| 5.0| 5.68|
|     I| 7.0| 4.81|
|     I|14.0| 9.96|
|    II|13.0| 8.74|
|    II| 9.0| 8.77|
|    II|14.0|  8.1|
|    II| 5.0| 4.74|
|    II|12.0| 9.13|
|    II|10.0| 9.14|
|    II| 7.0| 7.26|
|    II| 4.0|  3.1|
|    II| 8.0| 8.14|
+------+----+-----+
only showing top 20 rows



In [None]:
# Display df6 again; dropDuplicates() and orderBy() returned new DataFrames and df6 was not reassigned.
df6.show()

+------+----+-----+
|Series|   X|    Y|
+------+----+-----+
|     I|10.0| 8.04|
|     I| 8.0| 6.95|
|     I|13.0| 7.58|
|     I| 9.0| 8.81|
|     I|11.0| 8.33|
|     I|14.0| 9.96|
|     I| 6.0| 7.24|
|     I| 4.0| 4.26|
|     I|12.0|10.84|
|     I| 7.0| 4.81|
|     I| 5.0| 5.68|
|    II|10.0| 9.14|
|    II| 8.0| 8.14|
|    II|13.0| 8.74|
|    II| 9.0| 8.77|
|    II|11.0| 9.26|
|    II|14.0|  8.1|
|    II| 6.0| 6.13|
|    II| 4.0|  3.1|
|    II|12.0| 9.13|
+------+----+-----+
only showing top 20 rows



In [None]:
# distinct() returns a new DataFrame that contains only the unique rows of df6;
# df6 itself is left unchanged.
unique_rows = df6.distinct()
unique_rows.show(n=20)

+------+----+-----+
|Series|   X|    Y|
+------+----+-----+
|     I| 6.0| 7.24|
|    II|14.0|  8.1|
|     I| 8.0| 6.95|
|    IV| 8.0| 5.76|
|    IV| 8.0| 7.71|
|    IV| 8.0| 6.89|
|   III| 8.0| 6.77|
|   III|13.0|12.74|
|    II| 5.0| 4.74|
|    II| 8.0| 8.14|
|     I|12.0|10.84|
|   III|10.0| 7.46|
|    II|10.0| 9.14|
|   III|14.0| 8.84|
|    II| 7.0| 7.26|
|     I|13.0| 7.58|
|    IV| 8.0| 7.04|
|   III| 6.0| 6.08|
|     I| 5.0| 5.68|
|    II| 9.0| 8.77|
+------+----+-----+
only showing top 20 rows



In [None]:
# Select a single column by name. df6 (the anscombe data) has the columns Series, X and Y;
# asking for a column that does not exist, such as "First Name", raises an AnalysisException.
df6.select("Series").show(5)

AnalysisException: cannot resolve '`First Name`' given input columns: [age, id, name];
'Project ['First Name]
+- Project [id#148, name#149, cast(age#150 as string) AS age#167]
   +- LogicalRDD [id#148, name#149, age#150], false


In [None]:
# Filtering: keep only the rows whose X value is greater than 10.
x_above_ten = df6["X"] > 10
filtered_df = df6.where(x_above_ten)
filtered_df.show()


In [None]:
# Renaming: produce a copy of df6 in which column "X" is called "x1".
renamed_df = df6.withColumnRenamed(existing="X", new="x1")
renamed_df.show()


In [None]:

# Read file1.json line by line (multiline disabled) and display it without truncation.
df_file1 = (
    spark.read
    .format("json")
    .option("multiline", "false")
    .load("/content/file1.json")
)
df_file1.show(truncate=False)

+---+-----+
|age|name |
+---+-----+
|30 |John |
|25 |Alice|
+---+-----+

