# Hive CLI Lab: Querying Amazon Reviews Stored as Parquet

## Create HDFS folder

In [None]:
# Create the HDFS directory that will hold the lab's Parquet data.
# -p creates any missing parent directories along the path.
hadoop fs -mkdir -p /user/hadoop/nb/amazon_reviews

## Upload the Parquet file from the local filesystem (master node) to HDFS

In [None]:
# Copy the Parquet file from the current local directory into the lab's HDFS folder.
# -f overwrites an existing copy, so the step can be re-run without a "File exists" error.
hadoop fs -put -f customer_reviews_with_sentiment.parquet /user/hadoop/nb/amazon_reviews/

## Verify the file is present in HDFS

In [None]:
# List the HDFS folder to confirm the uploaded Parquet file is there.
hadoop fs -ls /user/hadoop/nb/amazon_reviews/

## Launch the Hive CLI

In [None]:
# Start the interactive Hive CLI; the statements that follow are typed at its hive> prompt.
hive

## Set the Hive execution engine to MapReduce (MR)
Run this statement, and every statement that follows, inside the **Hive prompt** (`hive>`).

In [None]:
-- Select MapReduce (mr) as the execution engine for the queries that follow.
SET hive.execution.engine=mr;

## Create the database 

In [None]:
-- Create the lab database; IF NOT EXISTS keeps this step safe to re-run.
CREATE DATABASE IF NOT EXISTS reviews;

## Use the database

In [None]:
-- Make reviews the current database so unqualified table names resolve to it.
USE reviews;

## Drop the table if it already exists 

In [None]:
-- Remove any earlier definition of the table so it can be created fresh;
-- IF EXISTS avoids an error when the table has not been created yet.
DROP TABLE IF EXISTS amazon_reviews_parquet;

## Create the EXTERNAL Parquet table

In [None]:
-- Define an EXTERNAL table over the Parquet file(s) already sitting in the lab's HDFS folder.
-- Columns: identifiers and product info, three numeric rating/vote counters,
-- purchase flags, the review text, its date, and the sentiment label.
CREATE EXTERNAL TABLE amazon_reviews_parquet (
    marketplace STRING,
    customer_id STRING,
    review_id STRING,
    product_id STRING,
    product_parent STRING,
    product_title STRING,
    product_category STRING,
    star_rating INT,
    helpful_votes INT,
    total_votes INT,
    vine STRING,
    verified_purchase STRING,
    review_headline STRING,
    review_body STRING,
    review_date STRING,
    sentiment STRING
)
STORED AS PARQUET
LOCATION '/user/hadoop/nb/amazon_reviews/';

## Confirm the table exists

In [None]:
-- List the tables in the current database; amazon_reviews_parquet should appear.
SHOW TABLES;

## Describe the table schema

In [None]:
-- Print each column of the table together with its declared type.
DESCRIBE amazon_reviews_parquet;

## Preview 10 rows

In [None]:
-- Quick sanity check: fetch 10 rows with every column to confirm the data is readable.
SELECT *
FROM amazon_reviews_parquet
LIMIT 10;

## Count reviews by sentiment (descending)

In [None]:
-- Tally reviews per sentiment label, largest group first.
SELECT
    sentiment,
    COUNT(*) AS total_reviews
FROM amazon_reviews_parquet
GROUP BY
    sentiment
ORDER BY
    total_reviews DESC;

## Count reviews by star rating and sentiment

In [None]:
-- Cross-tabulate reviews by star rating and sentiment label,
-- listed by rating and then by sentiment within each rating.
SELECT
    star_rating,
    sentiment,
    COUNT(*) AS total_reviews
FROM amazon_reviews_parquet
GROUP BY
    star_rating,
    sentiment
ORDER BY
    star_rating,
    sentiment;

## Top 10 highest-rated reviews with text

In [None]:
-- Show 10 of the highest-rated reviews together with their text.
-- Many reviews share the top star_rating, so sorting on the rating alone returns an
-- arbitrary 10 rows that can differ between runs. review_id is added as a tie-breaker
-- to make the result repeatable; it is also selected so the sort key is part of the
-- output (some Hive versions reject ORDER BY on columns missing from the SELECT list).
SELECT
    review_id,
    product_title,
    star_rating,
    sentiment,
    review_headline,
    review_body
FROM amazon_reviews_parquet
ORDER BY
    star_rating DESC,
    review_id
LIMIT 10;

## Why Parquet?
- Parquet is a **columnar format**.
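- Hive can read only the columns a query needs (**column pruning**) and can skip whole row groups by checking the min/max statistics stored in the Parquet file metadata (**predicate pushdown**).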
- This reduces I/O and improves query performance.