In [2]:
import pickle

In [3]:
# Small record used to demonstrate pickling; dumps() turns it into bytes.
data = dict(name='Alice', age=30)
serialized_data = pickle.dumps(data)

In [4]:
# Echo the pickled bytes as the cell output.
serialized_data

b'\x80\x04\x95\x1c\x00\x00\x00\x00\x00\x00\x00}\x94(\x8c\x04name\x94\x8c\x05Alice\x94\x8c\x03age\x94K\x1eu.'

In [5]:
import findspark
# Initialise findspark ahead of the pyspark import in the following cell
# (presumably this locates the local Spark install - confirm for this setup).
findspark.init()

In [6]:
from pyspark.sql import SparkSession

In [7]:
# Build (or reuse) the SparkSession named 'mew', with the driver host set to localhost.
spark = (
    SparkSession.builder
    .config("spark.driver.host", "localhost")
    .appName('mew')
    .getOrCreate()
)

In [8]:
# Display the session object as the cell output.
spark

In [9]:
# Keep a short handle on the session's SparkContext for the RDD examples.
sc = spark.sparkContext

In [10]:
# Turn a small Python list into an RDD.
check = sc.parallelize([1, 24, 4])

In [11]:
# Materialize the RDD's elements as a Python list for display.
check.collect()

[1, 24, 4]

In [12]:
def rt(x):
    """Return ``x`` multiplied by itself, i.e. its square."""
    squared = x * x
    return squared

In [13]:
# Apply rt to every element of the RDD and collect the results into a list.
check2 = (
    check
    .map(rt)
    .collect()
)

In [14]:
# Show the list produced by mapping rt over the RDD.
check2

[1, 576, 16]

In [15]:
# Column labels and the rows of the demo table (Seqno values are strings).
columns = ["Seqno", "Name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
]

# Build the DataFrame from the rows and column names, then print it.
df = spark.createDataFrame(data, columns)
df.show()




+-----+------------+
|Seqno|        Name|
+-----+------------+
|    1|  john jones|
|    2|tracey smith|
|    3| amy sanders|
+-----+------------+



In [16]:
# Rows of (name, age), distributed as an RDD.
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 22),
]
rdd = spark.sparkContext.parallelize(data)

# Name the columns while converting the RDD into a DataFrame with toDF.
columns = ["Name", "Age"]
df = rdd.toDF(schema=columns)

# Print the resulting table.
df.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 22|
+-------+---+



In [17]:
# Inspect the schema of df (field names, types and nullability).
df.schema


StructType([StructField('Name', StringType(), True), StructField('Age', LongType(), True)])

In [20]:
# Project only the Name column, renamed to "check" in the output.
df.select(df["Name"].alias("check")).show()

+-------+
|  check|
+-------+
|  Alice|
|    Bob|
|Charlie|
+-------+



In [26]:
from pyspark.sql.functions import col

In [36]:
# Keep only the rows whose Age equals 22.
df.where(df["Age"] == 22).show()

+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 22|
+-------+---+



In [29]:
# DataFrame.show() only prints the table and returns None, so chaining it onto
# the assignment would leave df_sorted bound to None. Keep the sorted DataFrame
# in df_sorted first, then display it as a separate step.
df_sorted = df.orderBy(col("Age").asc())
df_sorted.show()

+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 22|
|  Alice| 25|
|    Bob| 30|
+-------+---+



In [38]:
# Keep only the rows whose Name contains the substring 'Bob'.
df.filter(df["Name"].contains('Bob')).show()

+----+---+
|Name|Age|
+----+---+
| Bob| 30|
+----+---+



In [47]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Inner struct describing a location; both fields are nullable strings.
nested_schema = (
    StructType()
    .add("country", StringType(), True)
    .add("city", StringType(), True)
)

# Top-level schema; the "location" field embeds the inner struct.
outer_schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("location", nested_schema, True)
)

# Sample rows: (name, age, (country, city)).
data = [
    ("Alice", 25, ("USA", "New York")),
    ("Bob", 30, ("Canada", "Toronto")),
    ("Charlie", 22, ("UK", "London")),
    ("Alice", 29, ("UK", "Bangor")),
]

# Build the DataFrame with the explicit nested schema.
df = spark.createDataFrame(data, outer_schema)

# Print every column at full width.
df.show(truncate=False)

+-------+---+-----------------+
|name   |age|location         |
+-------+---+-----------------+
|Alice  |25 |{USA, New York}  |
|Bob    |30 |{Canada, Toronto}|
|Charlie|22 |{UK, London}     |
|Alice  |29 |{UK, Bangor}     |
+-------+---+-----------------+



In [48]:
# Pull the nested city field out of the location struct as a column named 'city'.
df.select(df["location"]["city"].alias('city')).show()

+--------+
|    city|
+--------+
|New York|
| Toronto|
|  London|
|  Bangor|
+--------+



In [49]:
# Order by name ascending, breaking ties by age descending.
df.orderBy(col('name').asc(), col('age').desc()).show()

+-------+---+-----------------+
|   name|age|         location|
+-------+---+-----------------+
|  Alice| 29|     {UK, Bangor}|
|  Alice| 25|  {USA, New York}|
|    Bob| 30|{Canada, Toronto}|
|Charlie| 22|     {UK, London}|
+-------+---+-----------------+



In [57]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain a Spark session under this example's application name.
# NOTE(review): if the session created in the earlier cell is still alive,
# getOrCreate() presumably hands back that same session - confirm whether the
# new app name actually takes effect.
spark = (
    SparkSession.builder
    .appName("RepartitionExample")
    .getOrCreate()
)

# Twenty sample rows, each laid out as (Name, Age, Occupation, Group).
data = [
    ("Alice", 25, "Engineer", "A"),
    ("Bob", 30, "Analyst", "B"),
    ("Charlie", 22, "Intern", "C"),
    ("David", 35, "Manager", "A"),
    ("Eve", 28, "Developer", "B"),
    ("Frank", 32, "Designer", "C"),
    ("Grace", 26, "Scientist", "A"),
    ("Harry", 40, "Manager", "B"),
    ("Ivy", 31, "Analyst", "C"),
    ("Jack", 29, "Engineer", "A"),
    ("Kate", 27, "Developer", "B"),
    ("Leo", 33, "Designer", "C"),
    ("Mia", 24, "Scientist", "A"),
    ("Nathan", 38, "Engineer", "B"),
    ("Olivia", 23, "Analyst", "C"),
    ("Peter", 36, "Manager", "A"),
    ("Quinn", 34, "Scientist", "B"),
    ("Rachel", 39, "Developer", "C"),
    ("Sam", 37, "Designer", "A"),
    ("Tina", 21, "Intern", "B"),
]

# Column layout for the rows above; every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Name", StringType()),
        ("Age", IntegerType()),
        ("Occupation", StringType()),
        ("Group", StringType()),
    )
])

# Build the DataFrame with the explicit schema.
df = spark.createDataFrame(data, schema)

# Print a heading followed by the table itself.
print("Original DataFrame:")
df.show()



Original DataFrame:
+-------+---+----------+-----+
|   Name|Age|Occupation|Group|
+-------+---+----------+-----+
|  Alice| 25|  Engineer|    A|
|    Bob| 30|   Analyst|    B|
|Charlie| 22|    Intern|    C|
|  David| 35|   Manager|    A|
|    Eve| 28| Developer|    B|
|  Frank| 32|  Designer|    C|
|  Grace| 26| Scientist|    A|
|  Harry| 40|   Manager|    B|
|    Ivy| 31|   Analyst|    C|
|   Jack| 29|  Engineer|    A|
|   Kate| 27| Developer|    B|
|    Leo| 33|  Designer|    C|
|    Mia| 24| Scientist|    A|
| Nathan| 38|  Engineer|    B|
| Olivia| 23|   Analyst|    C|
|  Peter| 36|   Manager|    A|
|  Quinn| 34| Scientist|    B|
| Rachel| 39| Developer|    C|
|    Sam| 37|  Designer|    A|
|   Tina| 21|    Intern|    B|
+-------+---+----------+-----+



In [82]:

# Write df as comma-separated CSV with a header row into the "Team" path,
# overwriting any previous output. No partitionBy() is applied in this call.
# NOTE(review): the recorded run of this cell failed because the Hadoop home
# directory C:\hadoop-3.3.6 did not exist; the local Hadoop setup on Windows
# needs fixing before this write can succeed.
(
    df.write
    .option("header", True)
    .mode("overwrite")
    .csv("Team", sep=',')
)

Py4JJavaError: An error occurred while calling o583.csv.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: Hadoop home directory C:\hadoop-3.3.6 does not exist -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:269)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:133)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:856)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:387)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:360)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:847)
	at sun.reflect.GeneratedMethodAccessor360.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: Hadoop home directory C:\hadoop-3.3.6 does not exist -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
	at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:341)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:331)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:370)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:955)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:192)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:215)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1111)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1120)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: Hadoop home directory C:\hadoop-3.3.6 does not exist
	at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:490)
	at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
	... 23 more
