In [3]:
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse, if one is already running) the Spark session named "day1".
spark = (
    SparkSession.builder
    .appName("day1")
    .getOrCreate()
)

# Three (id, name) records held in local Python memory.
data = (
    (1, "Charan"),
    (2, "Bharath"),
    (3, "Dheeraj"),
)
columns = ["id", "name"]

# Only column names are supplied here; the column types are inferred from the values.
df = spark.createDataFrame(data, schema=columns)

# Inspect the structure first, then the rows.
df.printSchema()
df.show()


# spark.createDataFrame() creates a DataFrame (a tabular, distributed data structure) from local data or an RDD.

# A Spark DataFrame is similar to a table in SQL or a DataFrame in Pandas, but distributed across a cluster.

# df.printSchema() and df.show() are two common PySpark DataFrame methods used to inspect data.

# 1. df.printSchema() — Shows the structure of the DataFrame

# This prints:

# Column names

# Data types

# Whether values are nullable

# 2. df.show() — Displays the DataFrame rows

# This prints the actual data as a text table (the first 20 rows by default).


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---+-------+
| id|   name|
+---+-------+
|  1| Charan|
|  2|Bharath|
|  3|Dheeraj|
+---+-------+



In [5]:
# The same (id, name) records are turned into a DataFrame in two ways:
# directly from the local tuples, and by going through an RDD first.
data = ((1, 'charan'),
        (2, 'Bharat'),
        (3, 'dheeraj'))

columns = ['id', 'name']

# Route 1: build the DataFrame straight from the local data.
df1 = spark.createDataFrame(data, columns)

df1.printSchema()

# Route 2: distribute the records as an RDD, then convert the RDD to a DataFrame.
rdd = spark.sparkContext.parallelize(data)
# Pass the column names explicitly: a bare toDF() would fall back to the
# default names `_1` and `_2`, leaving df2 inconsistent with df1.
df2 = rdd.toDF(columns)

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema instead of inference: `id` is a non-nullable integer,
# `name` is a nullable string. Each add() call appends one column definition
# (name, data type, nullable flag) and returns the schema for chaining.
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("name", StringType(), True)
)

# `data` is the tuple of (id, name) records defined in the previous cell.
df3 = spark.createDataFrame(data, schema=schema)

df3.printSchema()


# 1. StructType

# Represents the overall schema (a collection of fields, like a table structure).

# 2. StructField

# Represents one column inside the schema with:

# name

# data type

# nullable flag

# 3. StringType

# Column type → string

# 4. IntegerType

# Column type → 32-bit signed integer (schemas inferred from Python ints use long instead)

# 5. DoubleType

# Column type → double-precision (64-bit) floating point; corresponds to a Python float

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)



In [8]:
# Schema applied to the CSV file. Both columns are declared non-nullable here,
# but the printed schema below reports nullable = true for both, so the flag
# is evidently not enforced on this read path.
# NOTE: this rebinds `columns`, which earlier cells used for a list of names.
columns = StructType([
    StructField(name="id", dataType=IntegerType(), nullable=False),
    StructField(name="name", dataType=StringType(), nullable=False),
])

# Read source.csv, treating its first line as a header, using the schema
# above rather than inferring one.
csv_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(columns)
    .load("source.csv")
)

# The recorded output contains a row of NULLs — presumably a record that
# could not be parsed under this schema; TODO check source.csv.
csv_df.show()
csv_df.printSchema()

+----+----+
|  id|name|
+----+----+
|NULL|NULL|
|   2|   b|
|   3|   c|
|   4|   d|
|   1|   a|
+----+----+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



In [10]:
##### assignment


# Low-level RDD entry point of the running Spark session.
sc = spark.sparkContext

# (key, value) pairs: a name and an integer. Some names occur more than once.
data = [
    ("charan", 1000), ("bharath", 1000), ("charan", 500),
    ("bharath", 2000), ("hari", 9000), ("vikram", 1500),
]

rdd = sc.parallelize(data)

# Sum the values of every key: reduceByKey combines the values that share a
# key pairwise with the given function.
result_rdd = rdd.reduceByKey(lambda total, value: total + value)

# collect() brings the reduced pairs back to the driver as a Python list.
print(result_rdd.collect())


[('charan', 1500), ('bharath', 3000), ('hari', 9000), ('vikram', 1500)]


In [11]:
# Order the per-key totals from the previous cell by key (ascending),
# then collect them to the driver and print the sorted list.
sorted_totals = result_rdd.sortByKey().collect()
print(sorted_totals)

[('bharath', 3000), ('charan', 1500), ('hari', 9000), ('vikram', 1500)]
