In [0]:
// Make column-name resolution case-sensitive for this Spark session.
// NOTE(review): presumably needed because the JSON input has keys that differ only by case — confirm.
spark.conf.set("spark.sql.caseSensitive", "true")

In [1]:
// Load the raw review records from JSON; no schema is supplied, so Spark infers it.
val mainDf = spark.read
  .format("json")
  .load("/user/sa6142_nyu_edu/project/electronics/Electronics.json")

In [2]:
// Feature selection, step 1: list the column names of the review data
// so that each field can be judged for relevance.
mainDf.schema.fieldNames


In [3]:
// Print the inferred schema (column names, types, nesting) of the review data.
mainDf.printSchema()

In [4]:
// Check asin: compare the total number of review rows with the number of distinct product ids.
// No format specifiers are used, so the plain s-interpolator produces the same text.
print(s"No of Reviews: ${mainDf.count()}\n")
print(s"No of Distinct Products: ${mainDf.select("asin").distinct().count()}\n")

In [5]:
// Preview the first 20 review rows (the default of show()).
mainDf.show(numRows = 20)

In [6]:
// Helper (a plain function, not a Spark UDF) that builds one null-counting aggregate per column.
import org.apache.spark.sql.functions.{col,when, count}
/** For every name in `columns`, builds an aggregate column that counts the rows in which
  * that column is null. Each result column keeps the name of the column it describes.
  */
def countColumns(columns: Array[String]) =
  for (name <- columns) yield {
    // when(...) is non-null only for null cells, and count(...) tallies the non-null values.
    val nullMarker = when(col(name).isNull, name)
    count(nullMarker).alias(name)
  }
// One-row report: number of nulls in each column of the review data.
mainDf.select(countColumns(mainDf.columns): _*).show()

In [7]:
//Based on the output of the two cells above (the sample rows and the per-column null counts), we can drop the features that are mostly null or that are irrelevant for our search.

//asin: This is the productId and we will keep this field as it will be used to join with the meta df
//image: These are the images that the reviewer posts, but as the null counts above show, this column is mostly null. We will drop this column.
//overall: This is the overall rating that the reviewer provides to the product. We will keep this field as it will be useful in determining the ordering in the search query.
//reviewText: The review text contains far more text than is needed to gauge the sentiment of the reviewer. Thus we will drop this field and instead use the summary feature for the sentiment analysis.
//reviewTime: This Field is not needed for our search engine.
//reviewerID: This Field is not needed for our search engine.
//reviewerName: This Field is not needed for our search engine.
//style: This field is mostly null and non-null usually contains the categorization of the product, which we can easily get from the meta table so we will be dropping this feature.
//summary: This contains the summary of user's review. We will keep this field for sentiment analysis.
//unixReviewTime: We will drop this field as it is not relevant for our search engine.
//verified: We keep only the verified reviews; once that filter is applied we no longer need this field, so we drop it.
//vote: We are likely to use this to assign weights to each review.

//Features selected from mainDf are:
//asin
//overall
//summary
//vote

In [8]:
// Keep only the verified reviews, project the selected features and turn `vote` into an int.
import org.apache.spark.sql.functions.regexp_replace
// `vote` is cast from a non-integer column. NOTE(review): the raw data appears to write large
// counts with a thousands separator (e.g. "1,234") — confirm. Such a value casts to null and
// would then be zero-filled, so the commas are removed before the cast. The alias keeps the
// column name `vote`.
val newMainDf = mainDf
  .where($"verified" === true)
  .select(
    $"asin",
    $"overall",
    $"summary",
    regexp_replace($"vote", ",", "").cast("int").as("vote"))
// Replace a missing vote count with 0. The fill is limited to `vote` on purpose: an unrestricted
// na.fill(0) also rewrites every other numeric column, so a missing `overall` rating would
// silently become a rating of 0.
val filteredMainDf = newMainDf.na.fill(0, Seq("vote"))

In [9]:
// Too few reviews may bias the model, so find the products (asin) with more than 10 reviews.
// The result has one row per qualifying product: asin and its review count (num_review).
val countDf = filteredMainDf
  .groupBy("asin")
  .agg(count($"asin").as("num_review"))
  .where($"num_review" > 10)
countDf.show(numRows = 20)

In [10]:
// Keep only the reviews of products that passed the review-count threshold.
// countDf already holds just the asins with more than 10 reviews, so a left-semi join is enough:
// it keeps the matching review rows with the review columns only. There is no need to repeat
// the num_review filter or to drop num_review afterwards.
val finalMainDf = filteredMainDf.join(countDf, Seq("asin"), "left_semi")


In [11]:
// Preview the first 20 rows of the cleaned review data.
finalMainDf.show(numRows = 20)

In [12]:
// Loading and cleaning the metadata.
// Read the product metadata from JSON; no schema is supplied, so Spark infers it.
val metaDf = spark.read
  .format("json")
  .load("/user/sa6142_nyu_edu/project/electronics/meta_Electronics.json")

In [13]:
// Feature selection, step 1: list the column names of the metadata
// so that each field can be judged for relevance.
metaDf.schema.fieldNames

In [14]:
// Print the inferred schema (column names, types, nesting) of the metadata.
metaDf.printSchema()

In [15]:
// Compare the number of metadata rows with the number of distinct product ids; a difference
// means that some asin appears more than once. metaDf holds product metadata, not reviews,
// so the first label says "Metadata Rows" rather than "Reviews".
print(s"No of Metadata Rows: ${metaDf.count()}\n")
print(s"No of Distinct Products: ${metaDf.select("asin").distinct().count()}\n")

In [16]:
// Keep exactly one metadata row per asin, as duplicate products in the metadata make no sense.
// distinct() only removes rows that are identical in every column, so two rows with the same
// asin but any differing field would both survive and later duplicate every review of that
// product in the join. dropDuplicates on asin enforces the intended uniqueness; which of the
// duplicate rows is kept is not specified.
val uniqueMetaDf = metaDf.dropDuplicates(Seq("asin"))

In [17]:
// Preview the first 20 rows of the de-duplicated metadata.
uniqueMetaDf.show(numRows = 20)

In [18]:
// One-row report: number of nulls in each column of the de-duplicated metadata.
// uniqueMetaDf has the same columns as metaDf, so its own column list is used.
uniqueMetaDf
  .select(countColumns(uniqueMetaDf.columns): _*)
  .show()

In [19]:
// Convert price from string to double, keep only rows with a usable price, and keep only the
// high-resolution image links (imageURLHighRes, renamed to imageURI).
val newMetaDf = {
  import org.apache.spark.sql.functions.regexp_replace
  // Take the text after the last "$" (for a range "$a - $b" this is b), then remove thousands
  // separators. Without that step "1,299.00" casts to null, and every product priced at 1,000
  // or more would be discarded by the filter below.
  val parsedPrice = regexp_replace(substring_index($"price", "$", -1), ",", "").cast("double")
  uniqueMetaDf
    .withColumn("new_price", parsedPrice)
    // drop rows whose price is missing or not numeric
    .filter($"new_price".isNotNull)
    .drop("price")
    .withColumnRenamed("new_price", "price")
    .drop("imageURL")
    .withColumnRenamed("imageURLHighRes", "imageURI")
}


In [20]:
// Preview five metadata rows after the price conversion.
newMetaDf.show(numRows = 5)

In [21]:
//also_buy: Also Buy column contains product that is complementary to the current product. Can be a good option to give to the user using our search engine.
//also_view:  Also view column contains product that is similar to the current product. Can be a good option to give to the user using our search engine.
//asin: productId of the product. We keep it because it is the key used to join the metadata with the review data.
//brand: brand to which the product belongs. It is good feature to have as searches often have brand name in the query.
//category: Category is mostly needed for filtering data. Not relevant for our search engine. Thus dropping out this feature.
//date: Not needed as we are not maintaining any priority for the latest reviews.
//description: This field texts can be used for the Inverse Document mapping which will be useful for our searches.
//details: The description is enough; keeping the details as well would be overkill and would hurt performance. We will drop this feature.
//feature: This field contains product feature. Will be relevant for our search engine. We will keep this field.
//fit: This is mostly empty and not relevant for electronics. We are dropping out this.
//imageURL: We are keeping only the links to the higher res image for sake of reducing the data. Thus, we are dropping out the field.
//imageURLHighRes: We will keep the link to the high-resolution product image (renamed to imageURI) so that it can be made visible to the user in the search results.
//main_cat: main_cat is mostly needed for filtering data. Not relevant for our search engine. Thus dropping out this feature.
//price: We need to show the price of the product to the user. Thus we will keep the field.
//rank: Not relevant for our search. Thus, dropping out this field.
//similar_item: We will not keep this field as it is not useful to our search.
//tech1: This is technical table 1 about the product and is not relevant for the search. Thus we are dropping this.
//tech2:  This is technical table 2 about the product and is not relevant for the search. Thus we are dropping this.
//title: This is the title of the product and needs to be displayed to the user. Thus we will keep it.


//Features selected from metaDf are:
//also_buy
//also_view
//asin
//brand
//description
//feature
//imageURLHighRes changed to imageURI
//price
//title


In [22]:
// Project the metadata down to the features selected above.
val finalMetaDf = newMetaDf.select(
  "also_buy",
  "also_view",
  "asin",
  "brand",
  "description",
  "feature",
  "imageURI",
  "price",
  "title")

In [23]:
// Both DataFrames are now cleaned, so combine them into the single DataFrame used for all
// further tasks: an inner equi-join on asin, with asin appearing once in the result.
val joinedDf = finalMainDf.join(finalMetaDf, "asin")

In [24]:
// Print the schema of the joined review + metadata DataFrame.
joinedDf.printSchema()

In [25]:
// Preview the first 20 rows of the joined data.
joinedDf.show(numRows = 20)

In [26]:
//Profiling the Data
//1. Total number of reviews: the row count of joinedDf, i.e. the review rows that remain
//   after the cleaning steps and the inner join with the metadata.
joinedDf.count()

In [27]:
//2. Number of different brands and count of products from each brand
import org.apache.spark.sql.functions.countDistinct
// Number of distinct brand values (a null brand, if present, counts as one value).
println(s"No of Distinct Brands: ${joinedDf.select("brand").distinct().count()}")
// joinedDf is built from review rows joined to metadata, so a plain count() per brand would
// count reviews. Counting distinct asins gives the number of products per brand.
joinedDf
  .groupBy($"brand")
  .agg(countDistinct($"asin").as("num_products"))
  .show()

In [28]:
//3. Max, Min and Avg price of product across all products.
// joinedDf is built from review rows, so aggregating it directly weights each product's price
// by its number of reviews. That skews the average (max and min are unaffected). Reduce to one
// row per (asin, price) first so that every product counts once.
joinedDf
  .select($"asin", $"price")
  .distinct()
  .select(max($"price"), min($"price"), avg($"price"))
  .show()

In [29]:
//4. Max, Min and Avg price of product from each brand.
// joinedDf is built from review rows, so it is first reduced to one row per
// (brand, asin, price). Otherwise the per-brand average would be weighted by how many reviews
// each product has.
joinedDf
  .select($"brand", $"asin", $"price")
  .distinct()
  .groupBy($"brand")
  .agg(
    max($"price").as("max_price"),
    min($"price").as("min_price"),
    avg($"price").as("avg_price"))
  .show()

In [30]:
//5. Max, Min and Avg rating (`overall`) over all rows of joinedDf.
joinedDf
  .agg(max($"overall"), min($"overall"), avg($"overall"))
  .show()

In [31]:
//6. Max, Min and Avg rating (`overall`) per brand, computed over the rows of joinedDf.
joinedDf
  .groupBy("brand")
  .agg(
    max("overall").as("max_rating"),
    min("overall").as("min_rating"),
    avg("overall").as("avg_rating"))
  .show()