In [0]:
from pyspark.sql import SparkSession

In [0]:
# Sample people records: (firstname, lastname, country, state).
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]
columns = ["firstname", "lastname", "country", "state"]

# Build the DataFrame using the column names as the schema, then inspect
# both its structure and its contents.
df = spark.createDataFrame(data, schema=columns)
df.printSchema()
df.show()

root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



In [0]:
# Persist the DataFrame in ORC format.
# mode("overwrite") keeps this cell re-runnable: without it the write raises
# as soon as the target path already exists from an earlier run.
df.write.mode("overwrite").orc("/FileStore/tables/Orcdata1.orc")

In [0]:
# Read back the ORC data from the same path the preceding cell writes to
# (the previous path, "Orcdata.orc", is never written by this notebook).
spark.read.orc("/FileStore/tables/Orcdata1.orc")

Out[7]: DataFrame[firstname: string, lastname: string, country: string, state: string]

In [0]:
from pyspark import SparkFiles

# Fetch a file from the web: registering the URL with the SparkContext makes
# Spark download it so it can later be located through SparkFiles.get().
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
spark.sparkContext.addFile(url)


In [0]:
# Create a DataFrame from the downloaded file. SparkFiles.get() resolves the
# local path of the file added above; the "file://" prefix is prepended to it.
iris_local_uri = "file://" + SparkFiles.get("iris.data")
df = spark.read.csv(iris_local_uri, inferSchema=True)

In [0]:
# Print the inferred column types, then display the first 20 rows
# (20 is the default row count of show()).
df.printSchema()
df.show(n=20)

root
 |-- _c0: double (nullable = true)
 |-- _c1: double (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: string (nullable = true)

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
|5.4|3.9|1.7|0.4|Iris-setosa|
|4.6|3.4|1.4|0.3|Iris-setosa|
|5.0|3.4|1.5|0.2|Iris-setosa|
|4.4|2.9|1.4|0.2|Iris-setosa|
|4.9|3.1|1.5|0.1|Iris-setosa|
|5.4|3.7|1.5|0.2|Iris-setosa|
|4.8|3.4|1.6|0.2|Iris-setosa|
|4.8|3.0|1.4|0.1|Iris-setosa|
|4.3|3.0|1.1|0.1|Iris-setosa|
|5.8|4.0|1.2|0.2|Iris-setosa|
|5.7|4.4|1.5|0.4|Iris-setosa|
|5.4|3.9|1.3|0.4|Iris-setosa|
|5.1|3.5|1.4|0.3|Iris-setosa|
|5.7|3.8|1.7|0.3|Iris-setosa|
|5.1|3.8|1.5|0.3|Iris-setosa|
+---+---+---+---+-----------+
only showing top 20 rows



In [0]:
# Imported in this cell because regexp_replace is first used here; the only
# other import of it appears in a later cell, so a fresh top-to-bottom run
# would otherwise fail with NameError.
from pyspark.sql.functions import regexp_replace

# Add a "class" column derived from _c4, with every occurrence of "Iris"
# replaced by "Iris-Dataset" (e.g. "Iris-setosa" -> "Iris-Dataset-setosa").
df1 = df.withColumn("class", regexp_replace("_c4", "Iris", "Iris-Dataset"))
df1.show()

+---+---+---+---+-----------+-------------------+
|_c0|_c1|_c2|_c3|        _c4|              class|
+---+---+---+---+-----------+-------------------+
|5.1|3.5|1.4|0.2|Iris-setosa|Iris-Dataset-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|Iris-Dataset-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|Iris-Dataset-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|Iris-Dataset-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|Iris-Dataset-setosa|
|5.4|3.9|1.7|0.4|Iris-setosa|Iris-Dataset-setosa|
|4.6|3.4|1.4|0.3|Iris-setosa|Iris-Dataset-setosa|
|5.0|3.4|1.5|0.2|Iris-setosa|Iris-Dataset-setosa|
|4.4|2.9|1.4|0.2|Iris-setosa|Iris-Dataset-setosa|
|4.9|3.1|1.5|0.1|Iris-setosa|Iris-Dataset-setosa|
|5.4|3.7|1.5|0.2|Iris-setosa|Iris-Dataset-setosa|
|4.8|3.4|1.6|0.2|Iris-setosa|Iris-Dataset-setosa|
|4.8|3.0|1.4|0.1|Iris-setosa|Iris-Dataset-setosa|
|4.3|3.0|1.1|0.1|Iris-setosa|Iris-Dataset-setosa|
|5.8|4.0|1.2|0.2|Iris-setosa|Iris-Dataset-setosa|
|5.7|4.4|1.5|0.4|Iris-setosa|Iris-Dataset-setosa|
|5.4|3.9|1.3|0.4|Iris-setosa|Iris-Dataset-setosa|


In [0]:
# Gather the distinct values of the "class" column into a plain Python list
# by taking the first (only) field of each collected Row.
[row[0] for row in df1.select('class').distinct().collect()]

Out[58]: ['Iris-Dataset-versicolor', 'Iris-Dataset-virginica', 'Iris-Dataset-setosa']

In [0]:
# Basic statistics (count, mean, stddev, min, max) for every column,
# numeric and string alike.
summary_stats = df.describe()
summary_stats.show()

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|               _c0|                _c1|               _c2|               _c3|           _c4|
+-------+------------------+-------------------+------------------+------------------+--------------+
|  count|               150|                150|               150|               150|           150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|          null|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|          null|
|    min|               4.3|                2.0|               1.0|               0.1|   Iris-setosa|
|    max|               7.9|                4.4|               6.9|               2.5|Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



In [0]:
# Sample address records. Columns: id, address, state.
# (The column list used to be a stray, indented bare line after the list,
# which made the cell fail to parse; it is kept here as a comment instead.)
address = [
    (1, "14851 Jeffrey Rd", "DE"),
    (2, "43421 Margarita St", "NY"),
    (3, "13111 Siemon Ave", "CA"),
]

In [0]:
from pyspark.sql import SparkSession

# Obtain a session named "Replace" on a single local core; getOrCreate()
# hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Replace")
    .getOrCreate()
)

# Sample address records: (id, address, state).
address = [
    (1, "14851 Jeffrey Rd", "DE"),
    (2, "43421 Margarita St", "NY"),
    (3, "13111 Siemon Ave", "CA"),
]
df = spark.createDataFrame(address, ["id", "address", "state"])
df.show()

+---+------------------+-----+
| id|           address|state|
+---+------------------+-----+
|  1|  14851 Jeffrey Rd|   DE|
|  2|43421 Margarita St|   NY|
|  3|  13111 Siemon Ave|   CA|
+---+------------------+-----+



In [0]:
from pyspark.sql.functions import regexp_replace

# Build the replacement expression first: every match of the (case-sensitive)
# pattern "Rd" in the address column becomes "Road".
address_with_road = regexp_replace("address", "Rd", "Road")

# Keep the original column and expose the rewritten text as "address1".
df1 = df.withColumn("address1", address_with_road)

# Display the result.
df1.show()

+---+------------------+-----+------------------+
| id|           address|state|          address1|
+---+------------------+-----+------------------+
|  1|  14851 Jeffrey Rd|   DE|14851 Jeffrey Road|
|  2|43421 Margarita St|   NY|43421 Margarita St|
|  3|  13111 Siemon Ave|   CA|  13111 Siemon Ave|
+---+------------------+-----+------------------+



In [0]:
# Nested sample records: ((first, middle, last), languages, state, gender).
data = [
    (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
    (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
    (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
    (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
    (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
    (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
 ]

# The original statement stopped at "spark." and did not parse. It is
# completed here with a schema-less createDataFrame call, so Spark infers the
# types and assigns default column names; the following cell builds the same
# data with an explicit, named schema.
df_lastest = spark.createDataFrame(data)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Nested sample records: ((first, middle, last), languages, state, gender).
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]

# Struct describing the nested "name" column; all parts are nullable strings.
name_struct = StructType([
    StructField("first_name", StringType(), True),
    StructField("middle_name", StringType(), True),
    StructField("last_name", StringType(), True),
])

# Top-level schema: a name struct, an array of language strings, and two
# plain string columns. Every field is nullable.
schema = StructType([
    StructField("name", name_struct, True),
    StructField("languages", ArrayType(StringType(), True), True),
    StructField("state", StringType(), True),
    StructField("gender", StringType(), True),
])

detail_df = spark.createDataFrame(data, schema)
detail_df.show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [0]:
# Keep only the rows whose state is "OH".
is_ohio = detail_df["state"] == "OH"
detail = detail_df.where(is_ohio)
detail.show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [0]:
# Keep only the rows whose state is anything other than "OH".
not_ohio = detail_df["state"] != "OH"
detail = detail_df.where(not_ohio)
detail.show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [0]:
# Keep only the rows where the nested field name.middle_name is not the
# empty string.
has_middle_name = detail_df["name"]["middle_name"] != ""
detail = detail_df.where(has_middle_name)
detail.show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [0]:
# Same nested records as before, but spread over four states (OH, NY, DE, CA)
# so the state filters in the following cells have more variety.
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "DE", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "CA", "M"),
]

# Struct for the nested "name" column; all parts are nullable strings.
person_name = StructType([
    StructField("first_name", StringType(), True),
    StructField("middle_name", StringType(), True),
    StructField("last_name", StringType(), True),
])

# Full row schema; every field is nullable.
schema = StructType([
    StructField("name", person_name, True),
    StructField("languages", ArrayType(StringType(), True), True),
    StructField("state", StringType(), True),
    StructField("gender", StringType(), True),
])

latest_df = spark.createDataFrame(data, schema)
latest_df.show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
| {Julia, , Williams}|      [CSharp, VB]|   DE|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|   CA|     M|
+--------------------+------------------+-----+------+



In [0]:
# Keep the rows whose state is one of OH, CA or DE; a single membership test
# replaces the chain of OR-ed equality checks.
in_selected_states = latest_df["state"].isin("OH", "CA", "DE")
detail = latest_df.where(in_selected_states)
detail.show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   DE|     F|
|{Mike, Mary, Will...|      [Python, VB]|   CA|     M|
+--------------------+------------------+-----+------+



In [0]:
# Keep the rows whose state code begins with "H". None of the sample states
# do, so the displayed result is empty.
starts_with_h = latest_df["state"].startswith("H")
detail = latest_df.where(starts_with_h)
detail.show()


+----+---------+-----+------+
|name|languages|state|gender|
+----+---------+-----+------+
+----+---------+-----+------+



In [0]:
# Keep the rows whose state code ends with "A".
ends_with_a = latest_df["state"].endswith("A")
detail = latest_df.where(ends_with_a)
detail.show()


+--------------------+------------+-----+------+
|                name|   languages|state|gender|
+--------------------+------------+-----+------+
|{Mike, Mary, Will...|[Python, VB]|   CA|     M|
+--------------------+------------+-----+------+



In [0]:
# Keep the rows whose state code contains the letter "C" anywhere
# (a plain substring test, equivalent to the SQL pattern %C%).
contains_c = latest_df["state"].contains("C")
detail = latest_df.where(contains_c)
detail.show()

+--------------------+------------+-----+------+
|                name|   languages|state|gender|
+--------------------+------------+-----+------+
|{Mike, Mary, Will...|[Python, VB]|   CA|     M|
+--------------------+------------+-----+------+

