In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, min, max
import gzip
import json

In [2]:
def parse(path):
    """Lazily yield one decoded JSON object per line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip file in which every line is a JSON document.

    Yields
    ------
    object
        The result of ``json.loads`` for each line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError
        If a line is not valid JSON.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted, closed early, or interrupted by an exception.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a pandas DataFrame.

    Each record is keyed by its 0-based position in the file, and that
    position becomes the row label of the resulting frame.

    Parameters
    ----------
    path : str or path-like
        Passed straight through to ``parse``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record.
    """
    # position -> record; orient='index' then turns each entry into a row.
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# Relative path to the gzipped, one-JSON-document-per-line input file;
# replace with the actual location of your copy.
path = "../TensorIot/Industrial_and_Scientific.json.gz"
# Read the whole file into a single pandas DataFrame (one row per record).
df = getDF(path)

In [3]:
# df.head()

In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1758333, 12)

In [5]:
# The 'overall' column carries the rating attached to each row.
# Keep the single row with the smallest rating; on ties nsmallest keeps the
# first one in row order.
least_rated_item = (
    df
    .nsmallest(n=1, columns='overall')
    .iloc[0]
)

In [6]:
# idxmax() returns the index *label* of the first row with the highest
# 'overall' value, so it has to be resolved with label-based .loc.
# Positional .iloc would only be correct while the index is exactly 0..n-1.
most_rated_item = df.loc[df['overall'].idxmax()]

In [7]:
# Assuming 'reviewText' is the column representing the reviews.

# Character length of every review, computed with vectorised string ops.
# Missing reviews are treated as empty text (length 0); any non-string value
# is measured by its str() form.
df['review_length'] = df['reviewText'].fillna('').astype(str).str.len()

# idxmax() yields an index label, so look the row up with .loc (label-based)
# rather than .iloc (positional). On ties the first longest review wins.
item_with_longest_review = df.loc[df['review_length'].idxmax()]

In [8]:
# Parse the review dates into datetimes, then write them back to the same
# column as 'MM-DD-YYYY' strings (the column ends up holding text, not
# datetime values).
df['reviewTime'] = (
    pd.to_datetime(df['reviewTime'])
    .dt.strftime('%m-%d-%Y')
)

In [9]:
# TODO: demonstrate a recently learned DataFrame operation here (this cell is still empty).


In [11]:
# Show the row captured as least_rated_item. It was extracted before the
# 'reviewTime' column was reformatted, so its date is still in the raw form.
print(least_rated_item)

overall                                                         1.0
verified                                                       True
reviewTime                                              03 21, 2017
reviewerID                                           A1GJXZZPOZ3OD9
asin                                                     0176496920
reviewerName                                        Amazon Customer
reviewText        It only lasted for 3 days before it stopped wo...
summary                                                    One Star
unixReviewTime                                           1490054400
vote                                                            NaN
style                                                           NaN
image                                                           NaN
Name: 4, dtype: object


In [12]:
# Show the row captured as most_rated_item. It was extracted before the
# 'reviewTime' column was reformatted, so its date is still in the raw form.
print(most_rated_item)

overall                                                         5.0
verified                                                       True
reviewTime                                              01 23, 2013
reviewerID                                           A3FANY5GOT5X0W
asin                                                     0176496920
reviewerName                                           Kelly Keyser
reviewText        Arrived on time, in mint condition, great!  I ...
summary                                          Just as described!
unixReviewTime                                           1358899200
vote                                                            NaN
style                                                           NaN
image                                                           NaN
Name: 0, dtype: object


In [13]:
# Report only the identifying fields of the longest review: the 'asin'
# value, the computed length, and the (truncated on display) review text.
print("Item with the longest review:")
print(item_with_longest_review[['asin', 'review_length', 'reviewText']])

Item with the longest review:
asin                                                    B0015NV5BE
review_length                                                32184
reviewText       FOR OPERATING A REFRIGERATOR (or freezer ) ONL...
Name: 236852, dtype: object


In [14]:
# Inspect the reformatted date column (now 'MM-DD-YYYY' strings).
df['reviewTime']

0          01-23-2013
1          11-05-2012
2          10-17-2012
3          03-29-2017
4          03-21-2017
              ...    
1758328    08-04-2018
1758329    07-10-2018
1758330    01-14-2017
1758331    07-20-2018
1758332    03-09-2018
Name: reviewTime, Length: 1758333, dtype: object

In [15]:
# Write the whole transformed DataFrame (including the derived
# 'review_length' column and the reformatted 'reviewTime' strings) to a
# Parquet file. The extension is spelled '.parquet' so that tools which
# discover files by extension recognise the output.
df.to_parquet("Industry_output.parquet")