In [0]:
// /user/rr4577_nyu_edu/amazon-clean.parquet


In [1]:
// DDL-style schema string passed to both Amazon review Parquet reads below,
// so the two resulting DataFrames expose the same three columns and types.
val schema = "rating DOUBLE, category STRING, date DATE"

In [2]:
// Load the first cleaned Amazon review dataset, applying the shared schema.
val cleanDF = spark.read
  .schema(schema)
  .format("parquet")
  .load("/user/rr4577_nyu_edu/amazon-clean.parquet")

In [3]:
// Load the second cleaned Amazon review dataset, applying the same shared schema.
val mycleanedDF = spark.read
  .schema(schema)
  .format("parquet")
  .load("/user/sc10670_nyu_edu/project/amazon-clean.parquet")

In [4]:
// Stack the two review datasets into one frame. `union` matches columns by
// position; both inputs were read with the same schema string.
val combinedDF = Seq(cleanDF, mycleanedDF).reduce(_ union _)

In [5]:
// Keep only reviews whose category is something other than the literal "Unknown".
val filteredDF = combinedDF.where(col("category").notEqual("Unknown"))

In [6]:
// Render the filtered reviews through the notebook's `z.show` display helper.
z.show(filteredDF)

In [7]:
// Display Spark's default `summary()` statistics for the filtered reviews.
z.show(filteredDF.summary())

In [8]:
import org.apache.spark.sql.functions._

// Weekly review volume for the "Gift Cards" category over the trailing five years.

// Step 1: Keep only "Gift Cards" reviews and expose the review date as `review_date`.
val giftCardsDF = filteredDF
  .filter(col("category") === "Gift Cards")
  .withColumn("review_date", to_date(col("date")))

// Step 2: Keep reviews from the last five calendar years.
// add_months is calendar-aware, so leap days are handled; a fixed 365 * 5 day
// offset would start the window a day or two late.
val last5YearsDF = giftCardsDF.filter(
  col("review_date").between(add_months(current_date(), -60), current_date())
)

// Step 3: Add ISO week number and the matching ISO week-numbering year.
// weekofyear follows ISO-8601, so the first/last days of a calendar year can belong
// to a week of the adjacent year. Pairing it with year(review_date) would put e.g.
// Dec 31 into "week 1" of the old year. The ISO week-year is the calendar year of
// that week's Thursday: date_trunc("week") yields the Monday, plus 3 days.
val weeklyReviewsDF = last5YearsDF
  .withColumn("year", year(date_add(to_date(date_trunc("week", col("review_date"))), 3)))
  .withColumn("week", weekofyear(col("review_date")))

// Step 4: Count reviews per (year, week), in chronological order.
val weeklyReviewCounts = weeklyReviewsDF
  .groupBy("year", "week")
  .agg(count("*").alias("review_count"))
  .orderBy("year", "week")

// Step 5: Display the results
z.show(weeklyReviewCounts)

In [9]:
// Tag every review with the day of week of its `date` in a new `rating_dow` column.
// Spark's `dayofweek` numbers days 1 (Sunday) through 7 (Saturday).
val dowDF = filteredDF.withColumn("rating_dow", dayofweek(col("date")))

In [10]:
import org.apache.spark.sql.functions._

// Average number of ratings per calendar month, using only reviews dated after 2018.
//
// NOTE(review): this re-declares `filteredDF`, replacing the earlier definition that
// removed "Unknown" categories. Paragraphs executed after this one see the post-2018
// frame built from `combinedDF` (which still contains "Unknown" rows) -- confirm
// that this is intended.
//
// `date` is already a DATE column per the read schema, so year()/month() are applied
// to it directly instead of re-parsing it with to_date(..., "yyyy-MM-dd").
val filteredDF = combinedDF.filter(year(col("date")) > 2018)

// Extract year and month
val dfWithYearMonth = filteredDF
  .withColumn("year", year(col("date")))
  .withColumn("month", month(col("date")))

// Count the ratings in each (year, month) bucket
val ratingsByMonth = dfWithYearMonth
  .groupBy("year", "month")
  .agg(count("rating").alias("total_ratings"))

// Average those monthly totals across years, one row per calendar month
val avgRatingsByMonth = ratingsByMonth
  .groupBy("month")
  .agg(avg("total_ratings").alias("average_ratings"))
  .orderBy("month")

// Show the result
z.show(avgRatingsByMonth)


In [11]:
// Preview the first 10 rows of the day-of-week-tagged reviews.
z.show(dowDF.limit(10))

In [12]:
// Number of reviews falling on each day of the week.
val ratingDayDF = dowDF
  .groupBy(col("rating_dow"))
  .agg(count(col("rating_dow")).alias("rating_count"))

In [13]:
// Display the per-day-of-week review counts.
z.show(ratingDayDF)

In [14]:
// Number of reviews for every (day of week, category) pair.
val categoryRatingsDF = dowDF
  .groupBy(col("rating_dow"), col("category"))
  .agg(count(col("rating_dow")).alias("rating_count"))

In [15]:
// Display the review counts broken down by day of week and category.
z.show(categoryRatingsDF)

In [16]:
// Print the leading rows of whichever `filteredDF` definition was evaluated most recently.
filteredDF.show()

In [17]:
// How many reviews carry each distinct rating value.
val ratingDistributionDF = filteredDF
  .groupBy(col("rating"))
  .count()

z.show(ratingDistributionDF)

In [18]:
// Categories ranked by how many ratings they received, most-rated first.
val topCategoriesDF = filteredDF
  .groupBy(col("category"))
  .agg(count(col("rating")).alias("rating_count"))
  .sort(col("rating_count").desc)

z.show(topCategoriesDF)

In [19]:
// Rating volume per calendar year, listed from the newest year to the oldest.
val ratingsTrendDF = {
  val withYear = filteredDF.withColumn("year", year(col("date")))
  val perYear = withYear
    .groupBy(col("year"))
    .agg(count(col("rating")).as("rating_count"))
  perYear.sort(col("year").desc)
}

z.show(ratingsTrendDF)


In [20]:
// Mean rating for each product category.
val avgRatingDF = filteredDF
  .groupBy(col("category"))
  .agg(avg(col("rating")).alias("avg_rating"))

z.show(avgRatingDF)

In [21]:
// Seasonality: rating volume per calendar month (1-12), pooled over all years present.
val monthWiseDF = {
  val withMonth = filteredDF.withColumn("month", month(col("date")))
  withMonth
    .groupBy(col("month"))
    .agg(count(col("rating")).alias("rating_count"))
}

z.show(monthWiseDF)


In [22]:
import org.apache.spark.sql.functions._

// Rating volume per category over the full combined dataset (no category or
// date filtering applied), largest category first.
val totalRatingsByCategory = combinedDF
  .groupBy(col("category"))
  .agg(count(col("rating")).as("total_ratings"))
  .sort(col("total_ratings").desc)

// Render the ranking
z.show(totalRatingsByCategory)


In [23]:
// Heat-map input: rating counts for every (day of week, month) combination.
val dowMonthDF = {
  val withCalendarParts = filteredDF
    .withColumn("day_of_week", dayofweek(col("date")))
    .withColumn("month", month(col("date")))
  withCalendarParts
    .groupBy(col("day_of_week"), col("month"))
    .agg(count(col("rating")).alias("rating_count"))
}

z.show(dowMonthDF)


In [24]:
// From here on the notebook analyses the cleaned Walmart order data.
// The dataset is read through its absolute path, like the Amazon datasets above.
// A relative path is resolved against the running user's working/home directory on
// the file system, so the load would only succeed for the user who owns the data.
val walmartDF = spark.read.parquet("/user/sc10670_nyu_edu/project/cleaned-walmart-data.parquet")
walmartDF.show()

In [25]:
// Day-wise statistics for Walmart orders.
// Add the full day name (date_format pattern "EEEE") derived from `order_date`.
val walmartDFWithDay = walmartDF.withColumn("day_of_week", date_format(to_date(col("order_date"), "yyyy-MM-dd"), "EEEE"))

// Total orders, total profit, average shipping cost and total quantity per day name.
// `day_index` (Spark dayofweek: 1 = Sunday ... 7 = Saturday) is carried only as a
// sort key so rows come out in calendar order. Ordering on the day name itself is
// alphabetical (Friday, Monday, Saturday, ...). The helper column is dropped again,
// so the output columns are unchanged.
val trendsByDay = walmartDFWithDay
  .groupBy("day_of_week")
  .agg(
    count("*").alias("total_orders"),
    sum("profit").alias("total_profit"),
    avg("shipping_cost").alias("average_shipping_cost"),
    sum("order_quantity").alias("total_quantity"),
    min(dayofweek(to_date(col("order_date"), "yyyy-MM-dd"))).alias("day_index")
  )
  .orderBy("day_index") // Calendar order, Sunday first
  .drop("day_index")

// Show the results
z.show(trendsByDay)

In [26]:
// The 10 states with the most distinct city values, largest first.
val stateCitiesDF = {
  val citiesPerState = walmartDF
    .groupBy(col("state"))
    .agg(countDistinct(col("city")).as("number_of_unique_cities"))
  citiesPerState
    .sort(col("number_of_unique_cities").desc)
    .limit(10)
}

In [27]:
// Display the top-10 states by number of distinct cities.
z.show(stateCitiesDF)

In [28]:
//1. Customer Insights
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

In [29]:
//Age Group Analysis
import org.apache.spark.sql.functions._

// Order quantity and profit totals per customer age band.
val ageGroupDF = {
  // Cast once and reuse; an age that cannot be read as an integer becomes null.
  val age = col("customer_age").cast("int")

  // "60+" is matched explicitly. With a bare otherwise("60+"), rows whose age is
  // null, unparseable or negative would be counted as seniors; they now land in
  // a separate "Unknown" band instead.
  val ageGroup = when(age.between(0, 20), "0-20")
    .when(age.between(21, 40), "21-40")
    .when(age.between(41, 60), "41-60")
    .when(age > 60, "60+")
    .otherwise("Unknown")

  walmartDF
    .withColumn("age_group", ageGroup)
    .groupBy("age_group")
    .agg(
      sum("order_quantity") as "total_order_quantity",
      sum(col("profit").cast("double")) as "total_profit"
    )
}
z.show(ageGroupDF)


In [30]:
// Order quantity and profit totals per customer segment.
val segmentTrendsDF = walmartDF
  .groupBy(col("customer_segment"))
  .agg(
    sum(col("order_quantity")).alias("total_order_quantity"),
    sum(col("profit").cast("double")).alias("total_profit")
  )

z.show(segmentTrendsDF)


In [31]:
// GEOGRAPHIC INSIGHTS
// State-wise performance: the 10 states with the highest total order quantity.
val statePerformanceDF = walmartDF
  .groupBy("state")
  .agg(
    sum("order_quantity").alias("total_order_quantity"),
    sum(col("profit").cast("double")).alias("total_profit"),
    // This counts distinct `city` values, so the column is named for what it
    // measures; it was previously labelled "unique_customers".
    countDistinct("city").alias("unique_cities")
  )
  .orderBy(desc("total_order_quantity")) // Largest order volume first
  .limit(10) // Keep only the top 10 states

z.show(statePerformanceDF)

In [32]:
// The 100 most profitable cities, highest total profit first.
// NOTE(review): grouping is by city name only, so same-named cities in different
// states are merged -- confirm this is acceptable.
val cityTrendsDF = {
  val profitPerCity = walmartDF
    .groupBy(col("city"))
    .agg(sum(col("profit").cast("double")).alias("total_profit"))
  profitPerCity
    .sort(col("total_profit").desc)
    .limit(100)
}

z.show(cityTrendsDF)


In [33]:
// Order quantity and profit totals per region.
val regionPerformanceDF = walmartDF
  .groupBy(col("region"))
  .agg(
    sum(col("order_quantity")).alias("total_order_quantity"),
    sum(col("profit").cast("double")).alias("total_profit")
  )

z.show(regionPerformanceDF)


In [34]:
//3. Sales and Profitability


In [35]:
// Monthly trends: order quantity and profit totals per calendar month of `order_date`.
val monthlyTrendsDF = {
  val withMonth = walmartDF.withColumn("month", month(col("order_date")))
  withMonth
    .groupBy(col("month"))
    .agg(
      sum(col("order_quantity")).alias("total_order_quantity"),
      sum(col("profit").cast("double")).alias("total_profit")
    )
}

z.show(monthlyTrendsDF)


In [36]:
// Impact of discounts on profit: a single-row frame holding the correlation
// between `discount` and `profit`, both read as doubles.
val discountProfitCorrelationDF = {
  val discount = col("discount").cast("double")
  val profit = col("profit").cast("double")
  walmartDF.agg(corr(discount, profit).alias("correlation_discount_profit"))
}

z.show(discountProfitCorrelationDF)


In [37]:
// Year-over-year growth.
// Produces per-year totals (`total_order_quantity`, `total_profit`, as before) plus
// the percentage change of each total relative to the previous year present in the
// data, ordered chronologically. The growth columns are null for the first year and
// whenever the previous year's total is zero.
val yoyGrowthDF = {
  import org.apache.spark.sql.expressions.Window

  // Un-partitioned window: it runs over the tiny per-year aggregate, not the raw rows.
  val byYear = Window.orderBy("year")
  val prevQuantity = lag(col("total_order_quantity"), 1).over(byYear)
  val prevProfit = lag(col("total_profit"), 1).over(byYear)

  walmartDF
    .withColumn("year", year(col("order_date")))
    .groupBy("year")
    .agg(
      sum("order_quantity") as "total_order_quantity",
      sum(col("profit").cast("double")) as "total_profit"
    )
    // abs() in the denominator keeps the sign meaningful when the prior total is negative.
    .withColumn(
      "order_quantity_yoy_pct",
      (col("total_order_quantity") - prevQuantity) / abs(prevQuantity) * 100
    )
    .withColumn(
      "profit_yoy_pct",
      (col("total_profit") - prevProfit) / abs(prevProfit) * 100
    )
    .orderBy("year")
}

z.show(yoyGrowthDF)


In [38]:
// Seasonal trends: order quantity and profit totals per quarter of `order_date`.
val quarterlyTrendsDF = {
  val withQuarter = walmartDF.withColumn("quarter", quarter(col("order_date")))
  withQuarter
    .groupBy(col("quarter"))
    .agg(
      sum(col("order_quantity")).alias("total_order_quantity"),
      sum(col("profit").cast("double")).alias("total_profit")
    )
}

z.show(quarterlyTrendsDF)


In [39]:
// Weekday vs. weekend: order quantity and profit totals for the two buckets.
// Any row whose abbreviated day name is not "Sat" or "Sun" is labelled "Weekday".
val dayOfWeekDF = {
  val dayAbbrev = date_format(col("order_date"), "E")
  val weekendFlag = when(col("day_of_week").isin("Sat", "Sun"), "Weekend").otherwise("Weekday")
  walmartDF
    .withColumn("day_of_week", dayAbbrev)
    .withColumn("is_weekend", weekendFlag)
    .groupBy(col("is_weekend"))
    .agg(
      sum(col("order_quantity")).alias("total_order_quantity"),
      sum(col("profit").cast("double")).alias("total_profit")
    )
}

z.show(dayOfWeekDF)


In [40]:
// Cross-dimensional insights.
// Region x customer segment: order quantity and profit totals for every pair.
val regionSegmentTrendsDF = walmartDF
  .groupBy(col("region"), col("customer_segment"))
  .agg(
    sum(col("order_quantity")).alias("total_order_quantity"),
    sum(col("profit").cast("double")).alias("total_profit")
  )

z.show(regionSegmentTrendsDF)


In [41]:
// Age vs. region: order quantity and profit totals per (region, age band).
val ageRegionTrendsDF = {
  // Cast once and reuse; an age that cannot be read as an integer becomes null.
  val age = col("customer_age").cast("int")

  // "60+" is matched explicitly. With a bare otherwise("60+"), rows whose age is
  // null, unparseable or negative would be counted as seniors; they now land in
  // a separate "Unknown" band instead.
  val ageGroup = when(age.between(0, 20), "0-20")
    .when(age.between(21, 40), "21-40")
    .when(age.between(41, 60), "41-60")
    .when(age > 60, "60+")
    .otherwise("Unknown")

  walmartDF
    .withColumn("age_group", ageGroup)
    .groupBy("region", "age_group")
    .agg(
      sum("order_quantity") as "total_order_quantity",
      sum(col("profit").cast("double")) as "total_profit"
    )
}

z.show(ageRegionTrendsDF)
