In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *

In [None]:


# Four sample (id, text) rows; each text carries a different pattern
# (a quantity, a date, a URL, a dict-like string).
data = [
    (1, 'there is 20kg sugar in bag'),
    (2, 'My birth date is 14-04-2001'),
    (3, 'this is link of youtube https://www.youtube.com/'),
    (4, '{Des={age=20, salary=15000}}'),
]

# Only column names are supplied, so Spark infers the column types from the rows.
df3 = spark.createDataFrame(data, ['id', 'text'])
df3.display()

id,text
1,there is 20kg sugar in bag
2,My birth date is 14-04-2001
3,this is link of youtube https://www.youtube.com/
4,"{Des={age=20, salary=15000}}"


In [None]:
# Extract the first run of digits from each text (group 0 = the whole match).
# The pattern is a raw string so `\d` reaches the regex engine unchanged and
# Python does not warn about an invalid escape sequence.
df3.withColumn('numbers', regexp_extract('text', r'\d+', 0)).display()

id,text,numbers
1,there is 20kg sugar in bag,20.0
2,My birth date is 14-04-2001,14.0
3,this is link of youtube https://www.youtube.com/,
4,"{Des={age=20, salary=15000}}",20.0


In [None]:
# Extract a dd-mm-yyyy shaped date by spelling out every digit position.
# Raw string: keeps the `\d` escapes intact without an invalid-escape warning.
df3.withColumn('date', regexp_extract('text', r'\d\d-\d\d-\d\d\d\d', 0)).display()

id,text,date
1,there is 20kg sugar in bag,
2,My birth date is 14-04-2001,14-04-2001
3,this is link of youtube https://www.youtube.com/,
4,"{Des={age=20, salary=15000}}",


In [None]:
# Same dd-mm-yyyy extraction, written with {n} repetition counts instead of
# repeating `\d`. Raw string avoids Python's invalid-escape warning.
df3.withColumn('date', regexp_extract('text', r'\d{2}-\d{2}-\d{4}', 0)).display()

id,text,date
1,there is 20kg sugar in bag,
2,My birth date is 14-04-2001,14-04-2001
3,this is link of youtube https://www.youtube.com/,
4,"{Des={age=20, salary=15000}}",


In [None]:
# Extract the first http/https URL from each text.
# `\S+` stops at the first whitespace, so words that follow the URL are not
# swallowed the way a greedy `.*` would swallow them.
df3.withColumn('links', regexp_extract('text', r'https?://\S+', 0)).display()

id,text,links
1,there is 20kg sugar in bag,
2,My birth date is 14-04-2001,
3,this is link of youtube https://www.youtube.com/,https://www.youtube.com/
4,"{Des={age=20, salary=15000}}",


In [None]:
# Capture the brace-delimited payload that follows "Des=" (group 1), then strip
# braces and whitespace so only the comma-separated key=value pairs remain.
# Both patterns are raw strings so `\{`, `\}` and `\s` are passed to the regex
# engine as written, without Python's invalid-escape warning.
df3.withColumn(
    'dict',
    regexp_replace(
        regexp_extract('text', r'(?:Des=)(\{.+}[}|,]?)', 1),
        r'\{|\}|\s+',
        '',
    ),
).display()

id,text,dict
1,there is 20kg sugar in bag,
2,My birth date is 14-04-2001,
3,this is link of youtube https://www.youtube.com/,
4,"{Des={age=20, salary=15000}}","age=20,salary=15000"


In [None]:
# Convert the Spark DataFrame into a pandas DataFrame for the scikit-learn cells.
df = df3.toPandas()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the text column and transform it in the same call.
v = TfidfVectorizer()
x = v.fit_transform(df['text'])

# Densify the result and store one weight vector per row in a new column.
df['TF-IDF'] = [row_weights for row_weights in x.toarray()]
df

Unnamed: 0,id,text,TF-IDF
0,1,there is 20kg sugar in bag,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.43003651715871155,..."
1,2,My birth date is 14-04-2001,"[0.39505606234957286, 0.39505606234957286, 0.0..."
2,3,this is link of youtube https://www.youtube.com/,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4,"{Des={age=20, salary=15000}}","[0.0, 0.0, 0.4472135954999579, 0.4472135954999..."


In [None]:
# Re-assign the dense TF-IDF rows (one array per row) to the 'TF-IDF' column.
df['TF-IDF'] = [*x.toarray()]
df

Unnamed: 0,id,text,TF-IDF
0,1,there is 20kg sugar in bag,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.43003651715871155,..."
1,2,My birth date is 14-04-2001,"[0.39505606234957286, 0.39505606234957286, 0.0..."
2,3,this is link of youtube https://www.youtube.com/,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4,"{Des={age=20, salary=15000}}","[0.0, 0.0, 0.4472135954999579, 0.4472135954999..."


In [None]:
vec = TfidfVectorizer()

# Split each text on single spaces into an array column.
# NOTE: despite its name, the 'TF-IDF' column here holds the raw tokens,
# not TF-IDF weights.
# display() only renders the table and does not hand the DataFrame back, so
# keep the DataFrame in df4 first and display it as a separate step.
df4 = df3.withColumn('TF-IDF', split('text', ' '))
df4.display()


id,text,TF-IDF
1,there is 20kg sugar in bag,"List(there, is, 20kg, sugar, in, bag)"
2,My birth date is 14-04-2001,"List(My, birth, date, is, 14-04-2001)"
3,this is link of youtube https://www.youtube.com/,"List(this, is, link, of, youtube, https://www.youtube.com/)"
4,"{Des={age=20, salary=15000}}","List({Des={age=20,, salary=15000}})"


In [None]:
# Count the distinct ids seen for each text value.
# `expr as "name"` is Scala syntax and does not parse in Python; in PySpark the
# aggregated column is named with .alias().
df3.groupBy("text").agg(countDistinct("id").alias("ss"))

In [None]:
tf = HashingTF()

# The original called the Python 2 builtin `unicode` with a missing first
# argument, which does not parse. The column-level equivalent is used instead:
# encode the text to UTF-8 bytes and decode it back to a string.
# NOTE(review): the intended transformation is not clear from the original
# cell — confirm that a UTF-8 round trip is what was wanted.
# display() does not return the DataFrame, so assign first and display after.
df4 = df3.withColumn('text', decode(encode(col('text'), 'UTF-8'), 'UTF-8'))
df4.display()

In [None]:
v = TfidfVectorizer()


def tfidf(text):
  """Return the TF-IDF weights of a single text as a list of floats.

  The vectorizer is re-fitted on this one document on every call, so the
  vocabulary (and therefore the vector length) is specific to the text
  passed in.

  Args:
    text: The document to vectorise. None is passed through unchanged.

  Returns:
    A list of float weights, an empty list when the text yields no tokens,
    or None when ``text`` is None.
  """
  if text is None:
    return None
  try:
    # fit_transform expects an iterable of documents, so wrap the single text
    # in a list (the original passed the literal string 'text' instead).
    weights = v.fit_transform([str(text)])
  except ValueError:
    # scikit-learn raises ValueError when the document produces an empty
    # vocabulary (e.g. only punctuation); report that as "no weights".
    return []
  # Densify the 1 x n_terms matrix into a plain Python list Spark can serialise.
  return weights.toarray()[0].tolist()


# Declare the return type so Spark receives an array of doubles instead of
# applying the default string return type to the result.
tf = udf(tfidf, ArrayType(DoubleType()))
# NOTE(review): the alias "Splited Name" looks copy-pasted from another
# notebook; it is kept unchanged so the output column name stays the same.
df3.select(tf(col('text')).alias("Splited Name")).show(truncate=False)

In [None]:
# Apply the `tf` UDF to the text column and print the rows without truncation.
df3.select(
    tf(col('text')).alias("Splited Name")
).show(truncate=False)