In [0]:
import requests

In [0]:
# Download the website-review CSV and stage it in DBFS so Spark can read it.
url = "https://raw.githubusercontent.com/ruch0401/public-endpoints/refs/heads/main/website_reviews.csv"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response_website = requests.get(url, timeout=30)
# Fail fast on 4xx/5xx so an HTTP error page is never persisted as if it were data.
response_website.raise_for_status()

csv_content = response_website.text
dbutils.fs.put("dbfs:/FileStore/website_review.csv", csv_content, overwrite=True)

Wrote 71862 bytes.
Out[2]: True

In [0]:
# Download the social-media JSON and stage it in DBFS so Spark can read it.
url = "https://raw.githubusercontent.com/ruch0401/public-endpoints/refs/heads/main/social_media.json"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response_social_media = requests.get(url, timeout=30)
# Fail fast on 4xx/5xx so an HTTP error page is never persisted as if it were data.
response_social_media.raise_for_status()

json_content = response_social_media.text
dbutils.fs.put("dbfs:/FileStore/social_media.json", json_content, overwrite=True)

Wrote 119003 bytes.
Out[3]: True

In [0]:
# Download the support-ticket JSON and stage it in DBFS so Spark can read it.
# URL of the raw JSON file
url = "https://raw.githubusercontent.com/ruch0401/public-endpoints/refs/heads/main/support_tickets.json"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response_support_ticket = requests.get(url, timeout=30)
# Fail fast on 4xx/5xx so an HTTP error page is never persisted as if it were data.
response_support_ticket.raise_for_status()

json_content = response_support_ticket.text
dbutils.fs.put("dbfs:/FileStore/support_ticket.json", json_content, overwrite=True)

Wrote 154645 bytes.
Out[4]: True

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark session for this pipeline (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("FeedbackPipeline")
    .getOrCreate()
)

In [0]:
# Load each staged source file into its own DataFrame.
# The CSV is read with its first line as the header; both JSON files are read
# in multiline mode so records may span several lines.
csv_options = {"header": "true"}
json_options = {"multiline": "true"}

df_website = spark.read.options(**csv_options).csv("/FileStore/website_review.csv")
df_social_media = spark.read.options(**json_options).json("/FileStore/social_media.json")
df_support_ticket = spark.read.options(**json_options).json("/FileStore/support_ticket.json")

In [0]:
# Row count per source, as a quick sanity check on the raw loads.
website_reviews = df_website.count()
social_media_reviews = df_social_media.count()
support_ticket = df_support_ticket.count()

# Report the counts in the same order the sources were loaded.
print(f"Website reviews: {website_reviews}")
print(f"Social Media reviews: {social_media_reviews}")
print(f"Support ticket : {support_ticket}")

Website reviews: 500
Social Media reviews: 500
Support ticket : 500


In [0]:
# Map the website-review columns onto the shared feedback schema.
df_website = (
    df_website
    .withColumnRenamed("review_id", "feedback_id")
    .withColumnRenamed("review", "feedback_text")
    # Parse the string timestamp into a real timestamp column.
    .withColumn("feedback_on", to_timestamp("timestamp", "yyyy-MM-dd'T'HH:mm:ss"))
    .withColumnRenamed("customer_id", "user_id")
    .withColumn("source", lit("Website Review"))
    # This source has no user name or e-mail. A bare lit(None) would create an
    # untyped NullType column, so cast to string to match the other sources and
    # keep the column writable to Parquet on its own.
    .withColumn("user_name", lit(None).cast("string"))
    .withColumn("user_email", lit(None).cast("string"))
    .withColumn("platform", lit("Website name"))
)
df_website.display()

feedback_id,user_id,rating,feedback_text,timestamp,feedback_on,source,user_name,user_email,platform
R0000,9fb1e99f-404a-4794-9b43-baf741d35d5b,2,Thus wrong operation bad party other energy economy these hundred Mr.,2025-03-23T02:25:16,2025-03-23T02:25:16.000+0000,Website Review,,,Website name
R0001,53eb38dc-d17f-4463-a322-5a26d5a9229b,1,You seat picture return different door hot stop moment.,2025-01-25T18:11:45,2025-01-25T18:11:45.000+0000,Website Review,,,Website name
R0002,23104eff-cdce-436d-aaf7-cb8174451fa3,1,Road watch much improve about increase world middle myself low career.,2025-03-25T06:23:36,2025-03-25T06:23:36.000+0000,Website Review,,,Website name
R0003,db7a1114-6810-4cea-82f6-9cea23bb17a5,2,Just against source blue other sure theory me idea.,2025-03-16T10:45:34,2025-03-16T10:45:34.000+0000,Website Review,,,Website name
R0004,765cc8b1-cbfe-459e-868f-76daa015319b,3,Baby small water young fish fine table scene difficult.,2025-04-05T08:46:10,2025-04-05T08:46:10.000+0000,Website Review,,,Website name
R0005,8c2bee92-a057-40e6-95b4-4fb7ac340126,1,Seem up push suffer final forward north address beat.,2025-01-18T23:10:32,2025-01-18T23:10:32.000+0000,Website Review,,,Website name
R0006,a1bdf10f-bc3f-489a-b04a-c09bba5f505a,2,Police system white begin everyone floor morning family same poor cell character.,2025-02-12T00:15:29,2025-02-12T00:15:29.000+0000,Website Review,,,Website name
R0007,d32dd2c4-7afb-4b91-8f86-cc8f73e2c60c,1,Condition customer end capital American perform company design policy throw war school form.,2025-03-27T07:35:25,2025-03-27T07:35:25.000+0000,Website Review,,,Website name
R0008,8e0eacab-6525-4f22-a3e4-de1234fb6e02,4,Population kitchen law Republican fall pass almost spring like mention expect.,2025-03-31T08:30:49,2025-03-31T08:30:49.000+0000,Website Review,,,Website name
R0009,db342837-35f4-4746-a8cd-27f018de682e,3,After score situation born real town worker try use result enter bill summer pattern most break.,2025-04-02T11:19:24,2025-04-02T11:19:24.000+0000,Website Review,,,Website name


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Keep only the latest review per user_id.
# Rows are ranked newest-first within each user; feedback_id is a secondary
# sort key so that two reviews sharing the same feedback_on always rank the
# same way (row_number() alone picks arbitrarily among ties).
window_spec = Window.partitionBy("user_id").orderBy(
    col("feedback_on").desc(),
    col("feedback_id").desc(),
)

# row_num == 1 is the latest record for each user
df_website = (
    df_website
    .withColumn("row_num", row_number().over(window_spec))
    .filter("row_num == 1")
    .drop("row_num")
)

df_website.count()

Out[9]: 500

In [0]:
# Map the social-media columns onto the shared feedback schema.
df_social_media = (
    df_social_media
    .withColumnRenamed("id", "feedback_id")
    .withColumnRenamed("content", "feedback_text")
    # Parse the string timestamp into a real timestamp column.
    .withColumn("feedback_on", to_timestamp("timestamp", "yyyy-MM-dd'T'HH:mm:ss"))
    .withColumnRenamed("user", "user_name")
    .withColumn("source", lit("Social Media Review"))
    # This source has no user id or e-mail. A bare lit(None) would create an
    # untyped NullType column, so cast to string to match the other sources and
    # keep the column writable to Parquet on its own.
    .withColumn("user_id", lit(None).cast("string"))
    .withColumn("user_email", lit(None).cast("string"))
)
df_social_media.display()

feedback_text,feedback_id,platform,timestamp,user_name,feedback_on,source,user_id,user_email
Rate focus people participant face song. Firm difference forget with but peace quickly.,S0000,Facebook,2025-01-01T09:22:05,jmorris,2025-01-01T09:22:05.000+0000,Social Media Review,,
Artist happy officer after nothing part discuss. Than least study glass only.,S0001,Twitter,2025-01-11T00:09:20,laura00,2025-01-11T00:09:20.000+0000,Social Media Review,,
Scene here American although yeah. Father onto operation green weight. Expect yourself quality able.,S0002,Instagram,2025-03-23T03:21:06,brownmitchell,2025-03-23T03:21:06.000+0000,Social Media Review,,
Great manager almost require always tell hit blue.,S0003,Facebook,2025-04-04T15:58:27,nicholas45,2025-04-04T15:58:27.000+0000,Social Media Review,,
Decide many fire onto culture deep building pick. Mr among network. Director art approach hope trip happy thank.,S0004,Instagram,2025-04-10T09:44:19,anthonyvelasquez,2025-04-10T09:44:19.000+0000,Social Media Review,,
Total any soon may respond debate. Mention local stop for final beyond across. Could big eat they.,S0005,Facebook,2025-03-20T14:13:48,fosterjoe,2025-03-20T14:13:48.000+0000,Social Media Review,,
Specific on whether pressure line. Choose bar skin off office paper.,S0006,Instagram,2025-04-09T13:55:52,hernandezjustin,2025-04-09T13:55:52.000+0000,Social Media Review,,
Toward receive dog skill.,S0007,Twitter,2025-03-31T03:06:58,scottevelyn,2025-03-31T03:06:58.000+0000,Social Media Review,,
Environment senior training. Well year set upon wear.,S0008,Facebook,2025-04-13T23:24:01,donaldmaldonado,2025-04-13T23:24:01.000+0000,Social Media Review,,
Science tax real suddenly visit hundred. Run through light. Argue road save magazine.,S0009,Facebook,2025-02-26T08:14:22,heathermckee,2025-02-26T08:14:22.000+0000,Social Media Review,,


In [0]:
# Count posts per user_name and keep only the names that appear more than once.
posts_per_user = df_social_media.groupBy("user_name").count()
df_duplicates = posts_per_user.filter("count > 1")
df_duplicates.display()

user_name,count
rlewis,2


In [0]:
# Step 1: Reduce the duplicate summary to its distinct user_name keys.
# distinct() keeps this cell safe to re-run: df_duplicates is overwritten below
# with one row per post, and joining on those repeated keys would multiply rows.
duplicate_user_names = df_duplicates.select("user_name").distinct()

# Step 2: Join back to original DataFrame to get full rows
df_duplicates = df_social_media.join(duplicate_user_names, on="user_name", how="inner")

# Step 3: Display
df_duplicates.display()

user_name,feedback_text,feedback_id,platform,timestamp,feedback_on,source,user_id,user_email
rlewis,Amount however only voice despite talk community. Know seven teacher series.,S0399,Instagram,2025-03-06T02:39:49,2025-03-06T02:39:49.000+0000,Social Media Review,,
rlewis,Production medical drug today later. Fill nature senior be sea. Mean his beyond ground require serious.,S0454,Facebook,2025-02-03T01:36:03,2025-02-03T01:36:03.000+0000,Social Media Review,,


In [0]:
# Keep only the latest post per user_name.
# Rows are ranked newest-first within each user; feedback_id is a secondary
# sort key so that two posts sharing the same feedback_on always rank the
# same way (row_number() alone picks arbitrarily among ties).
window_spec = Window.partitionBy("user_name").orderBy(
    col("feedback_on").desc(),
    col("feedback_id").desc(),
)

# row_num == 1 is the latest record for each user
df_social_media = (
    df_social_media
    .withColumn("row_num", row_number().over(window_spec))
    .filter("row_num == 1")
    .drop("row_num")
)

df_social_media.count()


Out[13]: 499

In [0]:
# Map the support-ticket columns onto the shared feedback schema.
df_support_ticket = (
    df_support_ticket
    .withColumnRenamed("ticket_id", "feedback_id")
    .withColumnRenamed("body", "feedback_text")
    # Parse the string created_at value into a real timestamp column.
    .withColumn("feedback_on", to_timestamp("created_at", "yyyy-MM-dd'T'HH:mm:ss"))
    .withColumnRenamed("customer_email", "user_email")
    .withColumn("source", lit("Support Ticket Review"))
    # This source has no user id or name. A bare lit(None) would create an
    # untyped NullType column, so cast to string to match the other sources and
    # keep the column writable to Parquet on its own.
    .withColumn("user_id", lit(None).cast("string"))
    .withColumn("user_name", lit(None).cast("string"))
    .withColumn("platform", lit("Support"))
)
df_support_ticket.display()

feedback_text,created_at,user_email,status,subject,feedback_id,feedback_on,source,user_id,user_name,platform
Opportunity meeting last would when simple forward direction. Cost consumer effect. West author hope tax several item.,2025-02-19T19:30:54,hnicholson@howell.biz,open,Ten ever arm find soon feeling box.,T0000,2025-02-19T19:30:54.000+0000,Support Ticket Review,,,Support
Owner attention person ever yard process. Would pass best edge. As on well read.,2025-03-28T15:57:16,zhayes@reynolds.com,pending,Station theory production yeah resource single impact.,T0001,2025-03-28T15:57:16.000+0000,Support Ticket Review,,,Support
Early floor project church address century court. Fight evening possible. Hold whom stuff.,2025-01-28T08:16:31,zfischer@anderson-haynes.info,open,Civil worker both single exist.,T0002,2025-01-28T08:16:31.000+0000,Support Ticket Review,,,Support
Wish soldier since music. Left great pass thus last.,2025-01-05T13:49:08,amber27@butler.com,closed,Enjoy realize image marriage mouth public.,T0003,2025-01-05T13:49:08.000+0000,Support Ticket Review,,,Support
International author finish. Technology different politics source. Fact sense every pass.,2025-02-24T21:11:13,hendrixpamela@miller.org,pending,Catch design some debate first easy easy.,T0004,2025-02-24T21:11:13.000+0000,Support Ticket Review,,,Support
War soldier tax continue fish west. Law the matter American represent newspaper.,2025-01-25T01:57:25,boydedward@williams-jefferson.biz,closed,Economy yes detail series yourself view.,T0005,2025-01-25T01:57:25.000+0000,Support Ticket Review,,,Support
Recognize measure life attack reduce painting. Represent authority understand career spend protect learn. Minute how green sing sport security store. Capital manager wait daughter local by our.,2025-03-28T22:35:59,gibbschristian@yahoo.com,pending,Perhaps down treatment information.,T0006,2025-03-28T22:35:59.000+0000,Support Ticket Review,,,Support
Take data thus serve us increase quickly this.,2025-02-28T13:16:41,patricia21@dawson-gordon.com,pending,What per establish us.,T0007,2025-02-28T13:16:41.000+0000,Support Ticket Review,,,Support
Least receive realize impact relationship tonight. Run writer hospital. Congress candidate mind guess.,2025-01-16T07:55:21,ygonzalez@hotmail.com,closed,Security page able form decision why school.,T0008,2025-01-16T07:55:21.000+0000,Support Ticket Review,,,Support
Into method him. Second them suffer pay.,2025-01-19T21:42:37,pmurphy@gmail.com,closed,Out his thought possible.,T0009,2025-01-19T21:42:37.000+0000,Support Ticket Review,,,Support


In [0]:
# Keep only the latest ticket per user_email.
# Rows are ranked newest-first within each e-mail; feedback_id is a secondary
# sort key so that two tickets sharing the same feedback_on always rank the
# same way (row_number() alone picks arbitrarily among ties).
window_spec = Window.partitionBy("user_email").orderBy(
    col("feedback_on").desc(),
    col("feedback_id").desc(),
)

# row_num == 1 is the latest record for each e-mail
df_support_ticket = (
    df_support_ticket
    .withColumn("row_num", row_number().over(window_spec))
    .filter("row_num == 1")
    .drop("row_num")
)

df_support_ticket.count()

Out[15]: 500

In [0]:
# Project every source onto the same columns, in the same order, so the three
# frames line up when they are combined.
feedback_columns = [
    "feedback_id",
    "user_id",
    "user_email",
    "user_name",
    "feedback_text",
    "source",
    "platform",
    "feedback_on",
]

df_website = df_website.select(*feedback_columns)
df_social_media = df_social_media.select(*feedback_columns)
df_support_ticket = df_support_ticket.select(*feedback_columns)

In [0]:
# Stack the three sources into one feedback DataFrame.
# unionByName matches columns by name rather than by position, so a change in
# column order in any one source cannot silently put values in the wrong column.
df_feedback = (
    df_website
    .unionByName(df_social_media)
    .unionByName(df_support_ticket)
)
df_feedback.display()

feedback_id,user_id,user_email,user_name,feedback_text,source,platform,feedback_on
R0243,00dc580e-7deb-46b8-bfb8-e73998b6bc68,,,Loss various debate level fine choose good ball might against threat language less.,Website Review,Website name,2025-03-25T10:06:34.000+0000
R0437,00e57abe-fbba-4f5b-b5fb-122d7bef5988,,,North note create machine significant light drop standard risk shoulder radio hope table key.,Website Review,Website name,2025-02-22T14:01:12.000+0000
R0206,037551b2-cb57-4df8-a7ee-fdaff2f23a67,,,Financial realize describe country yourself we use window collection woman ability right toward true rich ago.,Website Review,Website name,2025-04-14T22:42:58.000+0000
R0225,03a993a9-a15c-49cd-ac1a-4426b6c55800,,,Nothing without look within modern town court offer woman popular store let right change whatever.,Website Review,Website name,2025-01-17T09:24:22.000+0000
R0480,03d58f74-acaa-449b-8645-ba9639f5a080,,,Natural finally money another whether miss against market particularly politics star stuff.,Website Review,Website name,2025-03-14T08:42:58.000+0000
R0447,0497436b-e42e-446e-a2f4-db84dfa100f3,,,Organization institution least threat develop small again now upon run partner.,Website Review,Website name,2025-01-17T07:55:53.000+0000
R0123,04b320f3-4903-4973-8418-698733897581,,,Claim arrive major action member start everything power economy account wide.,Website Review,Website name,2025-01-19T11:49:51.000+0000
R0192,04bbe6cd-fa80-4abf-a3d3-adda320c149b,,,Three week compare respond perhaps feeling eye indeed find economic should professional including season history.,Website Review,Website name,2025-02-15T03:04:35.000+0000
R0404,0589a574-e61f-4766-b58c-2e86921a7c29,,,Compare prepare idea effort necessary play summer kid society behavior everyone sell.,Website Review,Website name,2025-03-08T22:43:35.000+0000
R0311,0601dbab-414e-4214-8a9c-39b5d0c372b0,,,Laugh crime ever think list hot cultural although near model.,Website Review,Website name,2025-02-04T04:15:27.000+0000


In [0]:
# Remove rows that share a feedback_id.
# dropDuplicates returns a new DataFrame and leaves the original unchanged, so
# the result must be assigned back for the de-duplication to take effect.
df_feedback = df_feedback.dropDuplicates(["feedback_id"])
df_feedback.display()

feedback_id,user_id,user_email,user_name,feedback_text,source,platform,feedback_on
R0243,00dc580e-7deb-46b8-bfb8-e73998b6bc68,,,Loss various debate level fine choose good ball might against threat language less.,Website Review,Website name,2025-03-25T10:06:34.000+0000
R0437,00e57abe-fbba-4f5b-b5fb-122d7bef5988,,,North note create machine significant light drop standard risk shoulder radio hope table key.,Website Review,Website name,2025-02-22T14:01:12.000+0000
R0206,037551b2-cb57-4df8-a7ee-fdaff2f23a67,,,Financial realize describe country yourself we use window collection woman ability right toward true rich ago.,Website Review,Website name,2025-04-14T22:42:58.000+0000
R0225,03a993a9-a15c-49cd-ac1a-4426b6c55800,,,Nothing without look within modern town court offer woman popular store let right change whatever.,Website Review,Website name,2025-01-17T09:24:22.000+0000
R0480,03d58f74-acaa-449b-8645-ba9639f5a080,,,Natural finally money another whether miss against market particularly politics star stuff.,Website Review,Website name,2025-03-14T08:42:58.000+0000
R0447,0497436b-e42e-446e-a2f4-db84dfa100f3,,,Organization institution least threat develop small again now upon run partner.,Website Review,Website name,2025-01-17T07:55:53.000+0000
R0123,04b320f3-4903-4973-8418-698733897581,,,Claim arrive major action member start everything power economy account wide.,Website Review,Website name,2025-01-19T11:49:51.000+0000
R0192,04bbe6cd-fa80-4abf-a3d3-adda320c149b,,,Three week compare respond perhaps feeling eye indeed find economic should professional including season history.,Website Review,Website name,2025-02-15T03:04:35.000+0000
R0404,0589a574-e61f-4766-b58c-2e86921a7c29,,,Compare prepare idea effort necessary play summer kid society behavior everyone sell.,Website Review,Website name,2025-03-08T22:43:35.000+0000
R0311,0601dbab-414e-4214-8a9c-39b5d0c372b0,,,Laugh crime ever think list hot cultural although near model.,Website Review,Website name,2025-02-04T04:15:27.000+0000


In [0]:
# Collapse rows with identical feedback_text, keeping the most recent one.
# feedback_id is a secondary sort key so that rows sharing the same feedback_on
# always rank the same way (row_number() alone picks arbitrarily among ties).
window_spec = Window.partitionBy("feedback_text").orderBy(
    col("feedback_on").desc(),
    col("feedback_id").desc(),
)

# row_num == 1 is the latest record for each distinct text
df_feedback = (
    df_feedback
    .withColumn("row_num", row_number().over(window_spec))
    .filter("row_num == 1")
    .drop("row_num")
)

df_feedback.count()

Out[19]: 1499

In [0]:
# Count the rows that have a non-null feedback_on value.
has_feedback_timestamp = col("feedback_on").isNotNull()
df_feedback.where(has_feedback_timestamp).count()

Out[20]: 1499

In [0]:
# Write the unified feedback as Parquet, partitioned by source, replacing any
# output already present at the target path.
(
    df_feedback.write
    .mode("overwrite")
    .partitionBy("source")
    .parquet("/FileStore/feedback_data/")
)



In [0]:
# List the top level of the Parquet output directory.
feedback_output_listing = dbutils.fs.ls("/FileStore/feedback_data/")
display(feedback_output_listing)

path,name,size,modificationTime
dbfs:/FileStore/feedback_data/_SUCCESS,_SUCCESS,0,1745863655000
dbfs:/FileStore/feedback_data/_committed_8032823988985346839,_committed_8032823988985346839,35,1745782198000
dbfs:/FileStore/feedback_data/source=Social Media Review/,source=Social Media Review/,0,0
dbfs:/FileStore/feedback_data/source=Support Ticket Review/,source=Support Ticket Review/,0,0
dbfs:/FileStore/feedback_data/source=Website Review/,source=Website Review/,0,0


In [0]:
# List the files written under the "Social Media Review" partition.
social_media_partition_listing = dbutils.fs.ls("/FileStore/feedback_data/source=Social Media Review/")
display(social_media_partition_listing)

path,name,size,modificationTime
dbfs:/FileStore/feedback_data/source=Social Media Review/_committed_4161119589937171557,_committed_4161119589937171557,223,1745782758000
dbfs:/FileStore/feedback_data/source=Social Media Review/_committed_466484174487865200,_committed_466484174487865200,220,1745863655000
dbfs:/FileStore/feedback_data/source=Social Media Review/_committed_7622164897269266821,_committed_7622164897269266821,223,1745782612000
dbfs:/FileStore/feedback_data/source=Social Media Review/_committed_8032823988985346839,_committed_8032823988985346839,233,1745782198000
dbfs:/FileStore/feedback_data/source=Social Media Review/_committed_842182825592865612,_committed_842182825592865612,222,1745783647000
dbfs:/FileStore/feedback_data/source=Social Media Review/_started_466484174487865200,_started_466484174487865200,0,1745863652000
dbfs:/FileStore/feedback_data/source=Social Media Review/part-00000-tid-466484174487865200-8af8bc81-ad7a-4d4f-90b2-439e3759e232-60-1.c000.snappy.parquet,part-00000-tid-466484174487865200-8af8bc81-ad7a-4d4f-90b2-439e3759e232-60-1.c000.snappy.parquet,45536,1745863654000


In [0]:
# Read back the "Social Media Review" partition to verify the write.
# The partition directory is read rather than one part file: the part-file name
# embeds a per-write id (the "tid-..." segment), so a hard-coded file path stops
# existing as soon as the overwrite above runs again.
df_parquet_Social_media = spark.read.parquet("/FileStore/feedback_data/source=Social Media Review/")
df_parquet_Social_media.show()

+-----------+-------+----------+---------------+--------------------+---------+-------------------+
|feedback_id|user_id|user_email|      user_name|       feedback_text| platform|        feedback_on|
+-----------+-------+----------+---------------+--------------------+---------+-------------------+
|      S0288|   null|      null|        amber56|Able a within blu...|  Twitter|2025-01-16 03:51:40|
|      S0343|   null|      null|        ehughes|Able draw police....|  Twitter|2025-01-05 21:04:33|
|      S0218|   null|      null|        iduncan|Above listen I ge...|  Twitter|2025-02-04 22:43:54|
|      S0408|   null|      null| humphreywesley|Accept drop memor...|  Twitter|2025-03-26 18:57:38|
|      S0443|   null|      null|       mwheeler|According cut ste...|  Twitter|2025-04-07 07:10:32|
|      S0187|   null|      null|  douglascooper|According meet mo...|Instagram|2025-02-21 04:05:24|
|      S0086|   null|      null| donaldgarrison|Across difficult ...| Facebook|2025-02-21 23:29:47|


In [0]:
# Row count of the social-media data read back from Parquet.
df_parquet_Social_media.count()

Out[26]: 499