In [43]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('String Function').getOrCreate()

In [44]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [45]:
data = [  (1, 'John', 'Doe', 'john.doe@gmail.com', 'New York'),  
          (2, 'Jane', 'Doe', 'jane.doe@yahoo.com', 'Los Angeles'), 
          (3, 'James', 'Smith', 'james.smith@gmail.com', 'Chicago'),  
          (4, 'mary', 'Johnson', 'mary.johnson@example.in', 'Houston')
       ]

In [46]:
data

[(1, 'John', 'Doe', 'john.doe@gmail.com', 'New York'),
 (2, 'Jane', 'Doe', 'jane.doe@yahoo.com', 'Los Angeles'),
 (3, 'James', 'Smith', 'james.smith@gmail.com', 'Chicago'),
 (4, 'mary', 'Johnson', 'mary.johnson@example.in', 'Houston')]

In [47]:
schema = StructType([
    StructField('id',       IntegerType(), True),
    StructField('Name',     StringType(), True),
    StructField('Lastname', StringType(), True),
    StructField('email',    StringType(), True),
    StructField('City',     StringType(), True)
])

df = spark.createDataFrame(data, schema)
df.show(truncate=False)

+---+-----+--------+-----------------------+-----------+
|id |Name |Lastname|email                  |City       |
+---+-----+--------+-----------------------+-----------+
|1  |John |Doe     |john.doe@gmail.com     |New York   |
|2  |Jane |Doe     |jane.doe@yahoo.com     |Los Angeles|
|3  |James|Smith   |james.smith@gmail.com  |Chicago    |
|4  |mary |Johnson |mary.johnson@example.in|Houston    |
+---+-----+--------+-----------------------+-----------+



1) Make first letter of Name and Lastname as Capital
2) Combine First and last name into 1
3) Pull the domain of email id
4) Pull the letter before domain
5) Find the counts number of people living in each city
6) Find the count of dataframe

In [67]:
from pyspark.sql.functions import initcap, col, concat, lit, regexp_extract, instr, substring, split,substring_index

df = df\
    .withColumn('Name', initcap(col('Name')))\
    .withColumn('Full Name', concat(col('Name'), lit(' '), col('Lastname')))\
    .withColumn('domain', regexp_extract(col('email'), r'([a-zA-z]+)\.com|in$',1))\
    .withColumn('st1', initcap(regexp_extract(col('email'), r'([a-zA-z]+)\.([a-zA-Z]+)@([a-zA-Z]+)',1)))\
    .withColumn('st2', initcap(regexp_extract(col('email'), r'([a-zA-z]+)\.([a-zA-Z]+)@([a-zA-Z]+)',2)))\
    .withColumn('st3', regexp_extract(col('email'), r'([a-zA-z]+)\.([a-zA-Z]+)@([a-zA-Z]+)',3))\
    .withColumn('st4', regexp_extract(col('email'), r'([a-zA-z]+)@([a-zA-Z]+)',1))\
    .withColumn('pos', instr(col('email'),'@'))

df.show(truncate=False)

+---+-----+--------+-----------------------+-----------+------------+------+-----+-------+-------+-------+---+------+------------+
|id |Name |Lastname|email                  |City       |Full Name   |domain|st1  |st2    |st3    |st4    |pos|substr|email_1_part|
+---+-----+--------+-----------------------+-----------+------------+------+-----+-------+-------+-------+---+------+------------+
|1  |John |Doe     |john.doe@gmail.com     |New York   |John Doe    |gmail |John |Doe    |gmail  |doe    |9  |john.d|John        |
|2  |Jane |Doe     |jane.doe@yahoo.com     |Los Angeles|Jane Doe    |yahoo |Jane |Doe    |yahoo  |doe    |9  |jane.d|Jane        |
|3  |James|Smith   |james.smith@gmail.com  |Chicago    |James Smith |gmail |James|Smith  |gmail  |smith  |12 |james.|James       |
|4  |Mary |Johnson |mary.johnson@example.in|Houston    |Mary Johnson|      |Mary |Johnson|example|johnson|13 |mary.j|Mary        |
+---+-----+--------+-----------------------+-----------+------------+------+-----+-

In [49]:
from pyspark.sql.window import *
group_by1 = df.groupBy('domain').count()
group_by1.show()

+------+-----+
|domain|count|
+------+-----+
| gmail|    2|
| yahoo|    1|
|      |    1|
+------+-----+



In [50]:
df.count()

4

In [56]:
df1 = df.select('email')


df1 =  df1\
       .withColumn('first_name', initcap(substring_index( split(col('email'), '@')[0],'.',1)))\
       .withColumn('last_name', initcap(substring_index( split(col('email'), '@')[0],'.',-1)))\
       .withColumn('Full_name', concat(col('first_name'), lit(' '), col('last_name')))\
       .withColumn('domain', regexp_extract(col('email'), r'([a-zA-Z]+)\.([a-zA-Z]+)$',1))\
       .withColumn('domain_1', regexp_extract(col('email'), r'([a-zA-Z]+)\.([a-zA-Z]+)$',2))    


df1.show(truncate=False)

+-----------------------+----------+---------+------------+-------+--------+
|email                  |first_name|last_name|Full_name   |domain |domain_1|
+-----------------------+----------+---------+------------+-------+--------+
|john.doe@gmail.com     |John      |Doe      |John Doe    |gmail  |com     |
|jane.doe@yahoo.com     |Jane      |Doe      |Jane Doe    |yahoo  |com     |
|james.smith@gmail.com  |James     |Smith    |James Smith |gmail  |com     |
|mary.johnson@example.in|Mary      |Johnson  |Mary Johnson|example|in      |
+-----------------------+----------+---------+------------+-------+--------+

