In [None]:
!pip install pyspark
from pyspark.sql import SparkSession

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=f77d52f2e03bb5374f49f9cc1654fdce14f47dbadcf7eaaff2dc818768633584
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [None]:

# Start (or reuse) the Spark session shared by the cells of this notebook.
spark = (
    SparkSession.builder
    .appName("StringFunctionsExample")
    .getOrCreate()
)

# Sample rows as (model, type, year); all three fields are plain strings.
columns = ["model", "type", "year"]
data = [
    ("Toyota Camry", "Sedan", "2022"),
    ("Honda Civic", "Sedan", "2022"),
    ("Ford F-150", "Truck", "2022"),
    ("Tesla Model 3", "Electric", "2022"),
    ("Chevrolet Malibu", "Sedan", "2022"),
]

df = spark.createDataFrame(data, schema=columns)

# Register the DataFrame as the "cars" view so spark.sql(...) can query it.
df.createOrReplaceTempView("cars")


In [None]:


# concat_ws joins model, type and year with ' - ' into one `details` column.
concatenated_df = spark.sql(
    "SELECT concat_ws(' - ', model, type, year) AS details FROM cars"
)
print("1. Concatenated Strings:")
# truncate=False prints the full strings instead of shortening long values.
concatenated_df.show(truncate=False)


1. Concatenated Strings:
+-------------------------------+
|details                        |
+-------------------------------+
|Toyota Camry - Sedan - 2022    |
|Honda Civic - Sedan - 2022     |
|Ford F-150 - Truck - 2022      |
|Tesla Model 3 - Electric - 2022|
|Chevrolet Malibu - Sedan - 2022|
+-------------------------------+



In [None]:
# Character count of the `type` value for every car.
length_df = spark.sql(
    "SELECT model, length(type) AS type_length FROM cars"
)

print("2. Length of Types:")
length_df.show()

2. Length of Types:
+----------------+-----------+
|           model|type_length|
+----------------+-----------+
|    Toyota Camry|          5|
|     Honda Civic|          5|
|      Ford F-150|          5|
|   Tesla Model 3|          8|
|Chevrolet Malibu|          5|
+----------------+-----------+



In [None]:
# substring(type, 1, 4): positions are 1-based, so this keeps the first
# four characters of `type`.
substring_df = spark.sql(
    "SELECT model, substring(type, 1, 4) AS type_abbr FROM cars"
)

print("3. Substring of Types:")
substring_df.show()

3. Substring of Types:
+----------------+---------+
|           model|type_abbr|
+----------------+---------+
|    Toyota Camry|     Seda|
|     Honda Civic|     Seda|
|      Ford F-150|     Truc|
|   Tesla Model 3|     Elec|
|Chevrolet Malibu|     Seda|
+----------------+---------+



In [None]:
# Show the original `type` next to its upper-cased form.
uppercase_df = spark.sql(
    "SELECT model, type, upper(type) AS uppercase_type FROM cars"
)

print("4. Uppercase Types:")
uppercase_df.show()

4. Uppercase Types:
+----------------+--------+--------------+
|           model|    type|uppercase_type|
+----------------+--------+--------------+
|    Toyota Camry|   Sedan|         SEDAN|
|     Honda Civic|   Sedan|         SEDAN|
|      Ford F-150|   Truck|         TRUCK|
|   Tesla Model 3|Electric|      ELECTRIC|
|Chevrolet Malibu|   Sedan|         SEDAN|
+----------------+--------+--------------+



In [None]:
# Show the original `type` next to its lower-cased form.
lowercase_df = spark.sql(
    "SELECT model, type, lower(type) AS lowercase_type FROM cars"
)
print("5. Lowercase Types:")
lowercase_df.show()

5. Lowercase Types:
+----------------+--------+--------------+
|           model|    type|lowercase_type|
+----------------+--------+--------------+
|    Toyota Camry|   Sedan|         sedan|
|     Honda Civic|   Sedan|         sedan|
|      Ford F-150|   Truck|         truck|
|   Tesla Model 3|Electric|      electric|
|Chevrolet Malibu|   Sedan|         sedan|
+----------------+--------+--------------+



In [None]:

# NOTE: the SparkSession is intentionally NOT stopped here. The DataFrame-API
# cells that follow still use `spark` and `df`; stopping the session at this
# point makes them fail when the notebook is run top to bottom.
# Call spark.stop() only after the last cell that needs the session.

In [None]:
from pyspark.sql.functions import base64, col

# Add the Base64 text encoding of each model name as a new column.
encoded_df = df.withColumn(
    "model_base64",
    base64(col("model")),
)
encoded_df.show()

+----------------+--------+----+--------------------+
|           model|    type|year|        model_base64|
+----------------+--------+----+--------------------+
|    Toyota Camry|   Sedan|2022|    VG95b3RhIENhbXJ5|
|     Honda Civic|   Sedan|2022|    SG9uZGEgQ2l2aWM=|
|      Ford F-150|   Truck|2022|    Rm9yZCBGLTE1MA==|
|   Tesla Model 3|Electric|2022|VGVzbGEgTW9kZWwgMw==|
|Chevrolet Malibu|   Sedan|2022|Q2hldnJvbGV0IE1hb...|
+----------------+--------+----+--------------------+



In [None]:
# Heads-up: this import rebinds the name `ascii`, shadowing Python's builtin
# ascii() for the rest of the notebook.
from pyspark.sql.functions import ascii

# ascii() yields the numeric code of the first character of the string.
ascii_df = df.withColumn(
    "model_ascii",
    ascii(col("model")),
)
ascii_df.show()

+----------------+--------+----+-----------+
|           model|    type|year|model_ascii|
+----------------+--------+----+-----------+
|    Toyota Camry|   Sedan|2022|         84|
|     Honda Civic|   Sedan|2022|         72|
|      Ford F-150|   Truck|2022|         70|
|   Tesla Model 3|Electric|2022|         84|
|Chevrolet Malibu|   Sedan|2022|         67|
+----------------+--------+----+-----------+



In [None]:
from pyspark.sql.functions import concat_ws

# DataFrame-API version of the concat_ws example: join the three columns
# with "-" and keep the source columns alongside the result.
concatenated_df = df.withColumn(
    "details",
    concat_ws("-", col("model"), col("type"), col("year")),
)
concatenated_df.show()

+----------------+--------+----+--------------------+
|           model|    type|year|             details|
+----------------+--------+----+--------------------+
|    Toyota Camry|   Sedan|2022|Toyota Camry-Seda...|
|     Honda Civic|   Sedan|2022|Honda Civic-Sedan...|
|      Ford F-150|   Truck|2022|Ford F-150-Truck-...|
|   Tesla Model 3|Electric|2022|Tesla Model 3-Ele...|
|Chevrolet Malibu|   Sedan|2022|Chevrolet Malibu-...|
+----------------+--------+----+--------------------+



In [None]:
from pyspark.sql.functions import length

# Number of characters in each model name.
length_df = df.withColumn(
    "model_length",
    length(col("model")),
)
length_df.show()

+----------------+--------+----+------------+
|           model|    type|year|model_length|
+----------------+--------+----+------------+
|    Toyota Camry|   Sedan|2022|          12|
|     Honda Civic|   Sedan|2022|          11|
|      Ford F-150|   Truck|2022|          10|
|   Tesla Model 3|Electric|2022|          13|
|Chevrolet Malibu|   Sedan|2022|          16|
+----------------+--------+----+------------+



In [None]:
from pyspark.sql.functions import instr

# instr returns the 1-based position of "Camry" inside `model`,
# or 0 when the substring does not occur.
position_df = df.withColumn(
    "camry_position",
    instr(col("model"), "Camry"),
)
position_df.show()

+----------------+--------+----+--------------+
|           model|    type|year|camry_position|
+----------------+--------+----+--------------+
|    Toyota Camry|   Sedan|2022|             8|
|     Honda Civic|   Sedan|2022|             0|
|      Ford F-150|   Truck|2022|             0|
|   Tesla Model 3|Electric|2022|             0|
|Chevrolet Malibu|   Sedan|2022|             0|
+----------------+--------+----+--------------+



In [None]:
# Imported in this cell because levenshtein is called here before any earlier
# cell has imported it, which raises NameError on a fresh kernel.
from pyspark.sql.functions import levenshtein

# Word pairs used only for this demo. They get their own names so the cars
# DataFrame bound to `df` (columns model/type/year) is not overwritten; the
# cells that follow still read those columns from `df`.
pair_rows = [("kitten", "sitting"),
             ("flaw", "lawn"),
             ("hello", "world")]
pair_columns = ["string1", "string2"]

pairs_df = spark.createDataFrame(pair_rows, pair_columns)

# Edit distance between the two strings of each row.
distance_df = pairs_df.withColumn("levenshtein_distance", levenshtein(col("string1"), col("string2")))
distance_df.show()

+-------+-------+--------------------+
|string1|string2|levenshtein_distance|
+-------+-------+--------------------+
| kitten|sitting|                   3|
|   flaw|   lawn|                   2|
|  hello|  world|                   4|
+-------+-------+--------------------+



In [None]:
from pyspark.sql.functions import levenshtein

# Edit distance between the `model` and `type` strings of each row.
# Requires `df` to expose the columns model and type.
levenshtein_df = df.withColumn(
    "levenshtein_distance",
    levenshtein(col("model"), col("type")),
)
levenshtein_df.show()

+----------------+--------+----+--------------------+
|           model|    type|year|levenshtein_distance|
+----------------+--------+----+--------------------+
|    Toyota Camry|   Sedan|2022|                  11|
|     Honda Civic|   Sedan|2022|                   9|
|      Ford F-150|   Truck|2022|                   9|
|   Tesla Model 3|Electric|2022|                  12|
|Chevrolet Malibu|   Sedan|2022|                  14|
+----------------+--------+----+--------------------+



In [None]:
from pyspark.sql.functions import ltrim

# ltrim strips leading spaces; the sample model names have none, so the
# new column matches the original values.
trimmed_df = df.withColumn(
    "model_trimmed",
    ltrim(df["model"]),
)
trimmed_df.show()

+----------------+--------+----+----------------+
|           model|    type|year|   model_trimmed|
+----------------+--------+----+----------------+
|    Toyota Camry|   Sedan|2022|    Toyota Camry|
|     Honda Civic|   Sedan|2022|     Honda Civic|
|      Ford F-150|   Truck|2022|      Ford F-150|
|   Tesla Model 3|Electric|2022|   Tesla Model 3|
|Chevrolet Malibu|   Sedan|2022|Chevrolet Malibu|
+----------------+--------+----+----------------+



In [None]:
from pyspark.sql.functions import locate

# Search `type` for "Sedan" starting at position 1 (the first character);
# the result is the 1-based match position, or 0 when there is no match.
position_df = df.withColumn(
    "sedan_position",
    locate("Sedan", col("type"), 1),
)
position_df.show()

+----------------+--------+----+--------------+
|           model|    type|year|sedan_position|
+----------------+--------+----+--------------+
|    Toyota Camry|   Sedan|2022|             1|
|     Honda Civic|   Sedan|2022|             1|
|      Ford F-150|   Truck|2022|             0|
|   Tesla Model 3|Electric|2022|             0|
|Chevrolet Malibu|   Sedan|2022|             1|
+----------------+--------+----+--------------+



In [None]:
from pyspark.sql.functions import regexp_replace

# Rewrite every "Sedan" in `type` to "Compact"; other values pass through.
replaced_df = df.withColumn(
    "type_replaced",
    regexp_replace(col("type"), "Sedan", "Compact"),
)
replaced_df.show()

+----------------+--------+----+-------------+
|           model|    type|year|type_replaced|
+----------------+--------+----+-------------+
|    Toyota Camry|   Sedan|2022|      Compact|
|     Honda Civic|   Sedan|2022|      Compact|
|      Ford F-150|   Truck|2022|        Truck|
|   Tesla Model 3|Electric|2022|     Electric|
|Chevrolet Malibu|   Sedan|2022|      Compact|
+----------------+--------+----+-------------+



In [None]:
from pyspark.sql.functions import initcap

# initcap upper-cases the first letter of each word; the sample model names
# are already capitalised that way, so the values come out unchanged.
capitalized_df = df.withColumn(
    "model_capitalized",
    initcap(col("model")),
)
capitalized_df.show()

+----------------+--------+----+-----------------+
|           model|    type|year|model_capitalized|
+----------------+--------+----+-----------------+
|    Toyota Camry|   Sedan|2022|     Toyota Camry|
|     Honda Civic|   Sedan|2022|      Honda Civic|
|      Ford F-150|   Truck|2022|       Ford F-150|
|   Tesla Model 3|Electric|2022|    Tesla Model 3|
|Chevrolet Malibu|   Sedan|2022| Chevrolet Malibu|
+----------------+--------+----+-----------------+



In [None]:
from pyspark.sql.functions import regexp_replace

# Abbreviate "Electric" to "EV" in `type`; other values pass through.
replaced_df = df.withColumn(
    "type_replaced",
    regexp_replace(col("type"), "Electric", "EV"),
)
replaced_df.show()

+----------------+--------+----+-------------+
|           model|    type|year|type_replaced|
+----------------+--------+----+-------------+
|    Toyota Camry|   Sedan|2022|        Sedan|
|     Honda Civic|   Sedan|2022|        Sedan|
|      Ford F-150|   Truck|2022|        Truck|
|   Tesla Model 3|Electric|2022|           EV|
|Chevrolet Malibu|   Sedan|2022|        Sedan|
+----------------+--------+----+-------------+



In [None]:
from pyspark.sql.functions import regexp_extract

# Pull the first run of digits out of `model` (capture group 1); rows without
# digits get an empty string. The pattern is a raw string so that "\d" reaches
# the regex engine as written instead of being an invalid Python escape
# sequence, which triggers a warning in a normal string literal.
extracted_df = df.withColumn("model_digits", regexp_extract(col("model"), r"(\d+)", 1))
extracted_df.show()

+----------------+--------+----+------------+
|           model|    type|year|model_digits|
+----------------+--------+----+------------+
|    Toyota Camry|   Sedan|2022|            |
|     Honda Civic|   Sedan|2022|            |
|      Ford F-150|   Truck|2022|         150|
|   Tesla Model 3|Electric|2022|           3|
|Chevrolet Malibu|   Sedan|2022|            |
+----------------+--------+----+------------+



In [None]:
from pyspark.sql.functions import encode

# Encode each model name to its UTF-8 bytes (show() prints them as hex).
encoded_df = df.withColumn(
    "model_encoded",
    encode(col("model"), "UTF-8"),
)
encoded_df.show()

+----------------+--------+----+--------------------+
|           model|    type|year|       model_encoded|
+----------------+--------+----+--------------------+
|    Toyota Camry|   Sedan|2022|[54 6F 79 6F 74 6...|
|     Honda Civic|   Sedan|2022|[48 6F 6E 64 61 2...|
|      Ford F-150|   Truck|2022|[46 6F 72 64 20 4...|
|   Tesla Model 3|Electric|2022|[54 65 73 6C 61 2...|
|Chevrolet Malibu|   Sedan|2022|[43 68 65 76 72 6...|
+----------------+--------+----+--------------------+



In [None]:
from pyspark.sql.functions import decode

# Round-trip: turn the UTF-8 bytes in `model_encoded` back into text.
# Relies on `encoded_df` carrying a `model_encoded` column.
decoded_df = encoded_df.withColumn(
    "model_decoded",
    decode(col("model_encoded"), "UTF-8"),
)
decoded_df.show()

+----------------+--------+----+--------------------+----------------+
|           model|    type|year|       model_encoded|   model_decoded|
+----------------+--------+----+--------------------+----------------+
|    Toyota Camry|   Sedan|2022|[54 6F 79 6F 74 6...|    Toyota Camry|
|     Honda Civic|   Sedan|2022|[48 6F 6E 64 61 2...|     Honda Civic|
|      Ford F-150|   Truck|2022|[46 6F 72 64 20 4...|      Ford F-150|
|   Tesla Model 3|Electric|2022|[54 65 73 6C 61 2...|   Tesla Model 3|
|Chevrolet Malibu|   Sedan|2022|[43 68 65 76 72 6...|Chevrolet Malibu|
+----------------+--------+----+--------------------+----------------+



In [None]:
from pyspark.sql.functions import format_number

# `year` is stored as a string, so cast it to double before formatting it
# with two decimal places.
# NOTE(review): the output column is called "age_formatted" although it holds
# the formatted year, not an age — consider renaming it.
formatted_df = df.withColumn(
    "age_formatted",
    format_number(col("year").cast("double"), 2),
)
formatted_df.show()

+----------------+--------+----+-------------+
|           model|    type|year|age_formatted|
+----------------+--------+----+-------------+
|    Toyota Camry|   Sedan|2022|     2,022.00|
|     Honda Civic|   Sedan|2022|     2,022.00|
|      Ford F-150|   Truck|2022|     2,022.00|
|   Tesla Model 3|Electric|2022|     2,022.00|
|Chevrolet Malibu|   Sedan|2022|     2,022.00|
+----------------+--------+----+-------------+



In [None]:
from pyspark.sql.functions import format_string

# printf-style formatting: each %s is filled, in order, from model, type
# and year.
formatted_string_df = df.withColumn("formatted_details", format_string("%s (%s) - %s", col("model"), col("type"), col("year")))
# truncate=False keeps the full formatted string visible.
formatted_string_df.show(truncate=False)

+----------------+--------+----+-------------------------------+
|model           |type    |year|formatted_details              |
+----------------+--------+----+-------------------------------+
|Toyota Camry    |Sedan   |2022|Toyota Camry (Sedan) - 2022    |
|Honda Civic     |Sedan   |2022|Honda Civic (Sedan) - 2022     |
|Ford F-150      |Truck   |2022|Ford F-150 (Truck) - 2022      |
|Tesla Model 3   |Electric|2022|Tesla Model 3 (Electric) - 2022|
|Chevrolet Malibu|Sedan   |2022|Chevrolet Malibu (Sedan) - 2022|
+----------------+--------+----+-------------------------------+



In [None]:
from pyspark.sql.functions import locate

# Position of "Camry" within `model`; the start-position argument is omitted,
# so locate's default is used.
position_df = df.withColumn(
    "camry_position",
    locate("Camry", col("model")),
)
position_df.show()