In [1]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Python Spark Session").getOrCreate()
df = spark.read.csv("file:///D:/Big-Data-Science/Big-Data-Analysis-with-Python-master/Lesson03/data/test.csv", header=True)
df.show()

+------+---+------+
|  name|age|height|
+------+---+------+
|  Jonh| 22|  1.80|
|Hughes| 34|  1.96|
|  Mary| 27|  1.56|
+------+---+------+



In [4]:
df['age'].Column['age']

Column<b'age[Column][age]'>

In [5]:
df.select(df['name']).show()

+------+
|  name|
+------+
|  Jonh|
|Hughes|
|  Mary|
+------+



In [6]:
df.select(df['name'],df['age']+1).show()

+------+---------+
|  name|(age + 1)|
+------+---------+
|  Jonh|     23.0|
|Hughes|     35.0|
|  Mary|     28.0|
+------+---------+



In [7]:
df.filter(df['age'] > 25).show()

+------+---+------+
|  name|age|height|
+------+---+------+
|Hughes| 34|  1.96|
|  Mary| 27|  1.56|
+------+---+------+



In [8]:
df.groupBy("age").count().show()

+---+-----+
|age|count|
+---+-----+
| 34|    1|
| 22|    1|
| 27|    1|
+---+-----+



In [9]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- height: string (nullable = true)



In [10]:
df.createOrReplaceTempView("people")
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

+------+---+------+
|  name|age|height|
+------+---+------+
|  Jonh| 22|  1.80|
|Hughes| 34|  1.96|
|  Mary| 27|  1.56|
+------+---+------+



In [11]:
df.createGlobalTempView("people")
spark.sql("SELECT * FROM global_temp.people").show()
spark.newSession().sql("SELECT * FROM global_temp.people").show()

+------+---+------+
|  name|age|height|
+------+---+------+
|  Jonh| 22|  1.80|
|Hughes| 34|  1.96|
|  Mary| 27|  1.56|
+------+---+------+

+------+---+------+
|  name|age|height|
+------+---+------+
|  Jonh| 22|  1.80|
|Hughes| 34|  1.96|
|  Mary| 27|  1.56|
+------+---+------+



In [12]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Python Spark Session").getOrCreate()

In [14]:
df = spark.read.json("hdfs://localhost:9000/data/userdata.json")
#df = spark.read.json("data\people.json", multiLine=True)

In [15]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [16]:
df.show()

+---+-------+
|age|   name|
+---+-------+
| 12|Michael|
| 13|   Andy|
|  8| Justin|
+---+-------+



In [17]:
df.write.csv("hdfs://localhost:9000/data/userdata_new.csv", header=True)

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Python Spark Session").getOrCreate()

In [19]:
from operator import add
rdd_df = spark.read.text("file:///D:/Big-Data-Science/Big-Data-Analysis-with-Python-master/Lesson03/data/shake.txt").rdd
lines = rdd_df.map(lambda line: line[0])
lines.collect()

['To be, or not to be, that is the question:',
 "Whether 'tis nobler in the mind to suffer",
 'The slings and arrows of outrageous fortune,',
 'Or to take arms against a sea of troubles',
 'And by opposing end them. To die—to sleep,',
 'No more; and by a sleep to say we end',
 'The heart-ache and the thousand natural shocks',
 "That flesh is heir to: 'tis a consummation",
 "Devoutly to be wish'd. To die, to sleep;",
 "To sleep, perchance to dream—ay, there's the rub:",
 'For in that sleep of death what dreams may come,',
 'When we have shuffled off this mortal coil,',
 "Must give us pause—there's the respect",
 'That makes calamity of so long life.']

In [20]:
lines.count()

14

In [21]:
splits = lines.flatMap(lambda x: x.split(' '))
lower_splits = splits.map(lambda x: x.lower().strip())
prep = ['the', 'a', ',', '.']
tokens = lower_splits.filter(lambda x: x and x not in prep)
token_list = tokens.map(lambda x: [x, 1])
count = token_list.reduceByKey(add).sortBy(lambda x: x[1], ascending=False)

In [22]:
count.collect()

[('to', 11),
 ('that', 4),
 ('and', 4),
 ('of', 4),
 ('be,', 2),
 ('or', 2),
 ('is', 2),
 ("'tis", 2),
 ('in', 2),
 ('by', 2),
 ('end', 2),
 ('sleep,', 2),
 ('sleep', 2),
 ('we', 2),
 ('not', 1),
 ('question:', 1),
 ('whether', 1),
 ('nobler', 1),
 ('mind', 1),
 ('suffer', 1),
 ('slings', 1),
 ('arrows', 1),
 ('outrageous', 1),
 ('fortune,', 1),
 ('take', 1),
 ('arms', 1),
 ('against', 1),
 ('sea', 1),
 ('troubles', 1),
 ('opposing', 1),
 ('them.', 1),
 ('die—to', 1),
 ('no', 1),
 ('more;', 1),
 ('say', 1),
 ('heart-ache', 1),
 ('thousand', 1),
 ('natural', 1),
 ('shocks', 1),
 ('flesh', 1),
 ('heir', 1),
 ('to:', 1),
 ('consummation', 1),
 ('devoutly', 1),
 ('be', 1),
 ("wish'd.", 1),
 ('die,', 1),
 ('sleep;', 1),
 ('perchance', 1),
 ('dream—ay,', 1),
 ("there's", 1),
 ('rub:', 1),
 ('for', 1),
 ('death', 1),
 ('what', 1),
 ('dreams', 1),
 ('may', 1),
 ('come,', 1),
 ('when', 1),
 ('have', 1),
 ('shuffled', 1),
 ('off', 1),
 ('this', 1),
 ('mortal', 1),
 ('coil,', 1),
 ('must', 1),
 (