## How to add a sequence-generated surrogate key as a column in a DataFrame
* __`Answer`__: Use `monotonically_increasing_id`, a hash function (`md5`, `crc32`, `sha2`), or the `row_number()` window function to generate a sequence or surrogate key.

In [0]:
from pyspark.sql.functions import monotonically_increasing_id
# Load the employee CSV and append ID_KEY in one chained expression.
#   - nullValue="null": the literal text "null" in the file is read as a missing value
#   - header=True / inferSchema=True: first row gives column names, types are inferred
#   - ID_KEY: monotonically_increasing_id() yields unique, increasing 64-bit integers;
#     they are not guaranteed to be consecutive when the data spans several partitions.
df = (
    spark.read
    .option("nullValue", "null")
    .csv("/FileStore/tables/emp.csv", header=True, inferSchema=True)
    .withColumn("ID_KEY", monotonically_increasing_id())
)
display(df)

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,ID_KEY
7369.0,SMITH,CLERK,7902.0,17-12-1980,800.0,,20.0,0
7499.0,ALLEN,SALESMAN,7698.0,20-02-1981,1600.0,300.0,30.0,1
7521.0,WARD,SALESMAN,7698.0,22-02-1981,1250.0,500.0,30.0,2
7566.0,JONES,MANAGER,7839.0,04-02-1981,2975.0,,20.0,3
7654.0,MARTIN,SALESMAN,7698.0,21-09-1981,1250.0,1400.0,30.0,4
7698.0,SGR,MANAGER,7839.0,05-01-1981,2850.0,,30.0,5
7782.0,RAVI,MANAGER,7839.0,06-09-1981,2450.0,,10.0,6
7788.0,SCOTT,ANALYST,7566.0,19-04-1987,3000.0,,20.0,7
7839.0,KING,PRESIDENT,,01-11-1981,5000.0,,10.0,8
7844.0,TURNER,SALESMAN,7698.0,09-08-1981,1500.0,0.0,30.0,9


### Using `MD5`

In [0]:
from pyspark.sql.functions import md5,col

# Add MD5_KEY: the MD5 hex digest of EMPNO rendered as a string.
# NOTE(review): the displayed EMPNO values carry a trailing ".0", so the text being
# hashed is presumably e.g. "7369.0" rather than "7369" -- confirm if keys must match
# another system.
empno_text = col("EMPNO").cast("string")
df = df.withColumn("MD5_KEY", md5(empno_text))
display(df)

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,ID_KEY,MD5_KEY
7369.0,SMITH,CLERK,7902.0,17-12-1980,800.0,,20.0,0,0d7f9017fbda691900187b22404b8a1f
7499.0,ALLEN,SALESMAN,7698.0,20-02-1981,1600.0,300.0,30.0,1,7a2b33c672ce223b2aa5789171ddde2f
7521.0,WARD,SALESMAN,7698.0,22-02-1981,1250.0,500.0,30.0,2,e1e1f667ce4596e5644be6fab627c226
7566.0,JONES,MANAGER,7839.0,04-02-1981,2975.0,,20.0,3,b937384a573b94c4d7cc6004c496f919
7654.0,MARTIN,SALESMAN,7698.0,21-09-1981,1250.0,1400.0,30.0,4,e2a7555f7cabd6e31aef45cb8cda4999
7698.0,SGR,MANAGER,7839.0,05-01-1981,2850.0,,30.0,5,c570c225d1fb8a72ad79995dd17a77bc
7782.0,RAVI,MANAGER,7839.0,06-09-1981,2450.0,,10.0,6,ac5c482277858d6fe45065d0a3f92b0c
7788.0,SCOTT,ANALYST,7566.0,19-04-1987,3000.0,,20.0,7,866c7ee013c58f01fa153a8d32c9ed57
7839.0,KING,PRESIDENT,,01-11-1981,5000.0,,10.0,8,ca91c5464e73d3066825362c3093a45f
7844.0,TURNER,SALESMAN,7698.0,09-08-1981,1500.0,0.0,30.0,9,b356e7aed7ee82589e54a466e0dca157


### Using `CRC32`

In [0]:
from pyspark.sql.functions import crc32,col

# Add CRC32_KEY: the CRC32 checksum of EMPNO rendered as a string.
# CRC32 is only 32 bits wide, so collisions become likely on large tables.
empno_text = col("EMPNO").cast("string")
df = df.withColumn("CRC32_KEY", crc32(empno_text))
display(df)

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,ID_KEY,MD5_KEY,CRC32_KEY
7369.0,SMITH,CLERK,7902.0,17-12-1980,800.0,,20.0,0,0d7f9017fbda691900187b22404b8a1f,3163315632.0
7499.0,ALLEN,SALESMAN,7698.0,20-02-1981,1600.0,300.0,30.0,1,7a2b33c672ce223b2aa5789171ddde2f,1046173690.0
7521.0,WARD,SALESMAN,7698.0,22-02-1981,1250.0,500.0,30.0,2,e1e1f667ce4596e5644be6fab627c226,3535170612.0
7566.0,JONES,MANAGER,7839.0,04-02-1981,2975.0,,20.0,3,b937384a573b94c4d7cc6004c496f919,683555987.0
7654.0,MARTIN,SALESMAN,7698.0,21-09-1981,1250.0,1400.0,30.0,4,e2a7555f7cabd6e31aef45cb8cda4999,4024152101.0
7698.0,SGR,MANAGER,7839.0,05-01-1981,2850.0,,30.0,5,c570c225d1fb8a72ad79995dd17a77bc,1255715586.0
7782.0,RAVI,MANAGER,7839.0,06-09-1981,2450.0,,10.0,6,ac5c482277858d6fe45065d0a3f92b0c,3000238442.0
7788.0,SCOTT,ANALYST,7566.0,19-04-1987,3000.0,,20.0,7,866c7ee013c58f01fa153a8d32c9ed57,1375856756.0
7839.0,KING,PRESIDENT,,01-11-1981,5000.0,,10.0,8,ca91c5464e73d3066825362c3093a45f,3450750484.0
7844.0,TURNER,SALESMAN,7698.0,09-08-1981,1500.0,0.0,30.0,9,b356e7aed7ee82589e54a466e0dca157,4234062958.0


### Using `sha2`

In [0]:
from pyspark.sql.functions import sha2

# Add SHA2_KEY: the SHA-256 hex digest (bit length 256) of EMPNO rendered as a string.
empno_text = col("EMPNO").cast("string")
df = df.withColumn("SHA2_KEY", sha2(empno_text, 256))
display(df)

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,ID_KEY,MD5_KEY,CRC32_KEY,SHA2_KEY
7369.0,SMITH,CLERK,7902.0,17-12-1980,800.0,,20.0,0,0d7f9017fbda691900187b22404b8a1f,3163315632.0,c59f438f16c5a409eb2a040b299e82de37503321b9cbfec4fb351547261dd1b1
7499.0,ALLEN,SALESMAN,7698.0,20-02-1981,1600.0,300.0,30.0,1,7a2b33c672ce223b2aa5789171ddde2f,1046173690.0,4427dc2e32a1d099dbe2e3c093a8726e0ea72b9422c36a99915dae1d31e9385f
7521.0,WARD,SALESMAN,7698.0,22-02-1981,1250.0,500.0,30.0,2,e1e1f667ce4596e5644be6fab627c226,3535170612.0,74ed8ca63e8b4fb8b8ac06e8df400f098b0b09cf1b89c8a331e72e1919b57bd4
7566.0,JONES,MANAGER,7839.0,04-02-1981,2975.0,,20.0,3,b937384a573b94c4d7cc6004c496f919,683555987.0,b2ca4f93866dc5f5aa73f9df7a8c8d7ce03d296a3ccb5eec5fc28717f2de4874
7654.0,MARTIN,SALESMAN,7698.0,21-09-1981,1250.0,1400.0,30.0,4,e2a7555f7cabd6e31aef45cb8cda4999,4024152101.0,b969b01c158ebfecd0ac568aee526cb1ff8811fcdc77b4d682dab89146ad9891
7698.0,SGR,MANAGER,7839.0,05-01-1981,2850.0,,30.0,5,c570c225d1fb8a72ad79995dd17a77bc,1255715586.0,83f9d8d707524a4f56447204e344a997fff07b23b11f206eb5d02eac91385251
7782.0,RAVI,MANAGER,7839.0,06-09-1981,2450.0,,10.0,6,ac5c482277858d6fe45065d0a3f92b0c,3000238442.0,d7b6fab9aa91943de418ecbacefa4b276e82fbbb07bae1f7296775cc59a6f323
7788.0,SCOTT,ANALYST,7566.0,19-04-1987,3000.0,,20.0,7,866c7ee013c58f01fa153a8d32c9ed57,1375856756.0,16740bf13991fe083fbe5820cc8da08a5d88e5a48f44a3cfcc283c27b2797ba7
7839.0,KING,PRESIDENT,,01-11-1981,5000.0,,10.0,8,ca91c5464e73d3066825362c3093a45f,3450750484.0,cc87d27285025584dc9cf888e8b1f415eca13f64bc04b9acbdc04259e9f27864
7844.0,TURNER,SALESMAN,7698.0,09-08-1981,1500.0,0.0,30.0,9,b356e7aed7ee82589e54a466e0dca157,4234062958.0,b513a7ff5978b95883e4e6f83d4d991de3b8a22827d2f5bc58986e3cee03c6b1


### Using the window function `row_number()`

In [0]:
from pyspark.sql.functions import sha2,row_number,lit
from pyspark.sql.window import Window

# Add ROW_NUMBER: a consecutive 1..N sequence over the whole DataFrame.
# The window is ordered by ID_KEY (added earlier in this notebook with
# monotonically_increasing_id()) instead of a constant literal. Ordering by a
# constant leaves the row order undefined, so the numbers could be assigned to
# different rows from one run to the next.
# NOTE: a window without partitioning moves every row into a single partition;
# that is fine for a small table but does not scale to large data.
df = df.withColumn("ROW_NUMBER", row_number().over(Window.orderBy("ID_KEY")))
display(df)

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,ID_KEY,MD5_KEY,CRC32_KEY,SHA2_KEY,ROW_NUMBER
7369.0,SMITH,CLERK,7902.0,17-12-1980,800.0,,20.0,0,0d7f9017fbda691900187b22404b8a1f,3163315632.0,c59f438f16c5a409eb2a040b299e82de37503321b9cbfec4fb351547261dd1b1,1
7499.0,ALLEN,SALESMAN,7698.0,20-02-1981,1600.0,300.0,30.0,1,7a2b33c672ce223b2aa5789171ddde2f,1046173690.0,4427dc2e32a1d099dbe2e3c093a8726e0ea72b9422c36a99915dae1d31e9385f,2
7521.0,WARD,SALESMAN,7698.0,22-02-1981,1250.0,500.0,30.0,2,e1e1f667ce4596e5644be6fab627c226,3535170612.0,74ed8ca63e8b4fb8b8ac06e8df400f098b0b09cf1b89c8a331e72e1919b57bd4,3
7566.0,JONES,MANAGER,7839.0,04-02-1981,2975.0,,20.0,3,b937384a573b94c4d7cc6004c496f919,683555987.0,b2ca4f93866dc5f5aa73f9df7a8c8d7ce03d296a3ccb5eec5fc28717f2de4874,4
7654.0,MARTIN,SALESMAN,7698.0,21-09-1981,1250.0,1400.0,30.0,4,e2a7555f7cabd6e31aef45cb8cda4999,4024152101.0,b969b01c158ebfecd0ac568aee526cb1ff8811fcdc77b4d682dab89146ad9891,5
7698.0,SGR,MANAGER,7839.0,05-01-1981,2850.0,,30.0,5,c570c225d1fb8a72ad79995dd17a77bc,1255715586.0,83f9d8d707524a4f56447204e344a997fff07b23b11f206eb5d02eac91385251,6
7782.0,RAVI,MANAGER,7839.0,06-09-1981,2450.0,,10.0,6,ac5c482277858d6fe45065d0a3f92b0c,3000238442.0,d7b6fab9aa91943de418ecbacefa4b276e82fbbb07bae1f7296775cc59a6f323,7
7788.0,SCOTT,ANALYST,7566.0,19-04-1987,3000.0,,20.0,7,866c7ee013c58f01fa153a8d32c9ed57,1375856756.0,16740bf13991fe083fbe5820cc8da08a5d88e5a48f44a3cfcc283c27b2797ba7,8
7839.0,KING,PRESIDENT,,01-11-1981,5000.0,,10.0,8,ca91c5464e73d3066825362c3093a45f,3450750484.0,cc87d27285025584dc9cf888e8b1f415eca13f64bc04b9acbdc04259e9f27864,9
7844.0,TURNER,SALESMAN,7698.0,09-08-1981,1500.0,0.0,30.0,9,b356e7aed7ee82589e54a466e0dca157,4234062958.0,b513a7ff5978b95883e4e6f83d4d991de3b8a22827d2f5bc58986e3cee03c6b1,10
