### Files required to run this colab file:
* Groceries_dataset.csv
* 5000_meta_Grocery_and_Gourmet_Food.csv OR meta_Grocery_and_Gourmet_Food.csv
* 5000_Reviews_Grocery_and_Gourmet_Food.csv OR Reviews_Grocery_and_Gourmet_Food.csv


In [332]:
import pandas as pd
import numpy as np

# Amazon Product Details Dataset (Meta) -> for sentiment analysis

In [333]:
# Load the Amazon product metadata into df_sentiment.
# The file list at the top of this notebook names a '5000_'-prefixed sample file as an
# alternative input; point this path at it to run on the sample instead.
df_sentiment = pd.read_csv('/content/meta_Grocery_and_Gourmet_Food.csv')

In [334]:
df_sentiment.tail()

Unnamed: 0,parent_asin,main_category,title,average_rating,rating_number,price,store,features,description,images,...,caffeine_content,roast_level,unit_count,package_dimensions,upc,manufacturer,flavor,color,item_weight,country_of_origin
6239,B0C1M7K2R8,Grocery,"Sunny Fruit Organic Figs,Tender and Juicy Drie...",4.6,687,21.95,SUNNY FRUIT,"12 Convenient & Mess free Portion pack, Great ...",The Sunny fruit difference begins at the sourc...,,...,,,,,842515010118.0,Safe food corp,,,,
6240,B08D3W96DK,Grocery,GERBS Dried Mango Slices Unsweetened 2 LBS. | ...,3.9,3974,31.99,GERBS,Naturally Dried Mango - No Sugar Added (unswee...,,https://m.media-amazon.com/images/I/515yOXg0ka...,...,,,,8.6 x 6.6 x 3.3 inches; 2 Pounds,828678203765.0,GERBS ALLERGY FRIENDLY FOODS,,,,
6241,B07JZ6BMLQ,Grocery,"Old Wisconsin Snack Sticks, Turkey, 1.5-Ounce ...",4.5,83,,Old Wisconsin,"Superior quality | All meat--no binders, fille...",Old Wisconsin Turkey Snack Sticks are great fo...,https://m.media-amazon.com/images/I/81kbcxwhxt...,...,,,,15.5 x 3.1 x 3.1 inches; 1.5 Ounces,73170731241.0,Old Wisconsin,,,,
6242,B0713VD3HF,Grocery,Pickle Juice Soda Pop - 3 Bottles,3.4,34,12.99,Lesters,,3-pack of pickle-flavored soda pop.,https://m.media-amazon.com/images/I/91CuceafAf...,...,,,3.00 Count,,,,Pickle,,5 Pounds,
6243,B0BRRT3D8Z,Grocery,"Prego Traditional Pasta Sauce, 67 Oz Jar",4.7,2654,,Prego,One 67 oz jar of Prego Traditional Pasta Sauce...,Be the hero at dinner with Prego Traditional P...,https://m.media-amazon.com/images/I/81yPY1XI-t...,...,,,,,,,,,,


In [335]:
df_sentiment.shape

(6244, 26)

In [336]:
df_sentiment.columns

Index(['parent_asin', 'main_category', 'title', 'average_rating',
       'rating_number', 'price', 'store', 'features', 'description', 'images',
       'num_images', 'has_videos', 'bought_together', 'categories', 'brand',
       'item_form', 'caffeine_content', 'roast_level', 'unit_count',
       'package_dimensions', 'upc', 'manufacturer', 'flavor', 'color',
       'item_weight', 'country_of_origin'],
      dtype='object')

In [337]:
#calculate number of nans in each column
df_sentiment.isna().sum()

Unnamed: 0,0
parent_asin,0
main_category,42
title,0
average_rating,0
rating_number,0
price,3449
store,151
features,1391
description,2196
images,901


In [338]:
# Columns that are not used downstream (the reasons are listed in the markdown cell below).
unused_columns = [
    'features', 'num_images', 'has_videos', 'bought_together', 'item_form',
    'caffeine_content', 'roast_level', 'package_dimensions', 'upc',
    'item_weight', 'country_of_origin',
]

# Drop them and rename 'images' to 'image_url' in one chain.
df_sentiment = (
    df_sentiment
    .drop(columns=unused_columns)
    .rename(columns={'images': 'image_url'})
)
df_sentiment.columns


Index(['parent_asin', 'main_category', 'title', 'average_rating',
       'rating_number', 'price', 'store', 'description', 'image_url',
       'categories', 'brand', 'unit_count', 'manufacturer', 'flavor', 'color'],
      dtype='object')

In [339]:
df_sentiment['categories'].head(20)

Unnamed: 0,categories
0,Grocery & Gourmet Food > Beverages > Coffee
1,Grocery & Gourmet Food > Breads & Bakery > Coo...
2,Grocery & Gourmet Food > Pantry Staples > Cann...
3,Grocery & Gourmet Food > Snacks & Sweets > Cho...
4,
5,Grocery & Gourmet Food > Beverages > Bottled B...
6,Grocery & Gourmet Food > Pantry Staples > Soup...
7,Grocery & Gourmet Food > Pantry Staples > Cook...
8,Grocery & Gourmet Food > Home Brewing & Winema...
9,Grocery & Gourmet Food > Pantry Staples > Sauc...



### Reasons for removing:

* features --> similar to description, kept description instead
* num_images --> not needed, only need image url
* has_videos --> not needed, only need image url
* bought_together --> not needed as we are doing market basket analysis on the other dataset
* item_form --> too many missing values and not necessary
* caffeine_content --> not needed as it is only relevant for coffee
* roast_level --> not needed as it is only relevant for coffee
* package_dimensions --> too many missing values
* upc --> not needed
* item_weight --> similar to unit_count, kept unit_count instead as it has fewer missing values
* country_of_origin --> not needed


In [340]:
df_sentiment['main_category'].unique()

array(['Grocery', 'Amazon Home', 'Industrial & Scientific',
       'Health & Personal Care', nan, 'Home Audio & Theater',
       'Toys & Games', 'All Beauty', 'Pet Supplies',
       'Tools & Home Improvement', 'Arts, Crafts & Sewing',
       'AMAZON FASHION', 'Sports & Outdoors', 'Office Products',
       'Cell Phones & Accessories', 'All Electronics', 'Baby',
       'Automotive', 'Musical Instruments', 'Buy a Kindle',
       'Digital Music'], dtype=object)

In [341]:
# Keep only the rows whose main_category is exactly 'Grocery'
# (rows with any other value, or with a missing value, are dropped).
is_grocery = df_sentiment['main_category'].eq('Grocery')
df_sentiment = df_sentiment[is_grocery]
df_sentiment['main_category'].unique()

array(['Grocery'], dtype=object)

In [342]:
# Only 'Grocery' rows remain, so main_category no longer carries information.
df_sentiment = df_sentiment.drop(columns=['main_category'])
df_sentiment.columns

Index(['parent_asin', 'title', 'average_rating', 'rating_number', 'price',
       'store', 'description', 'image_url', 'categories', 'brand',
       'unit_count', 'manufacturer', 'flavor', 'color'],
      dtype='object')

In [343]:
# Split the 'A > B > C' category path once, then use its last level as 'item'
# and its second-to-last level as 'category'. The raw 'categories' column is
# dropped afterwards.
category_levels = df_sentiment['categories'].str.split('>')
df_sentiment['item'] = category_levels.str[-1].str.strip()
df_sentiment['category'] = category_levels.str[-2].str.strip()
df_sentiment = df_sentiment.drop(columns=['categories'])

df_sentiment.head(5)

Unnamed: 0,parent_asin,title,average_rating,rating_number,price,store,description,image_url,brand,unit_count,manufacturer,flavor,color,item,category
0,B00NE08WM6,Dark Roast Pure Coffee,4.7,9,,Luzianne,,https://m.media-amazon.com/images/I/81ucKSAein...,Luzianne,13.00 Ounce,,,,Coffee,Beverages
1,B084Q13Q5Q,PICARAS Galletas Peruanas Bañadas en Chocolate...,4.5,12,15.99,Winters,2 Bags of PICARAS Galletas Bañadas en Chocolat...,https://m.media-amazon.com/images/I/81-wr7u+ig...,Winters,12.00 Count,Winter's,chocolate,,Chocolate,Cookies
2,B00KBRUYVM,Chipped Beef and Gravy By Patterson's - Great ...,3.2,5,,Pattersons,Delicious corned beef gravy is ready when you ...,https://m.media-amazon.com/images/I/71MkAus3+q...,,,Patterson's,,,Tomatoes,Vegetables
3,B0BN4PW255,Asher's Sugar Free Milk Chocolate Cordial Cher...,5.0,6,29.99,Generic,,,Generic,16.0 Ounce,,Chocolate,,Fruit,Chocolate
4,B06X9DC27H,Messmer Peppermint 25 bags (6er pack),3.5,5,29.99,Messmer,,,Messmer,1.00 Count,,,,,


In [344]:
# Move the two trailing columns ('item' and 'category') to the front and keep
# every other column in its existing relative order.
all_columns = list(df_sentiment.columns)
leading_columns, remaining_columns = all_columns[-2:], all_columns[:-2]
df_sentiment = df_sentiment[leading_columns + remaining_columns]

df_sentiment.head()

Unnamed: 0,item,category,parent_asin,title,average_rating,rating_number,price,store,description,image_url,brand,unit_count,manufacturer,flavor,color
0,Coffee,Beverages,B00NE08WM6,Dark Roast Pure Coffee,4.7,9,,Luzianne,,https://m.media-amazon.com/images/I/81ucKSAein...,Luzianne,13.00 Ounce,,,
1,Chocolate,Cookies,B084Q13Q5Q,PICARAS Galletas Peruanas Bañadas en Chocolate...,4.5,12,15.99,Winters,2 Bags of PICARAS Galletas Bañadas en Chocolat...,https://m.media-amazon.com/images/I/81-wr7u+ig...,Winters,12.00 Count,Winter's,chocolate,
2,Tomatoes,Vegetables,B00KBRUYVM,Chipped Beef and Gravy By Patterson's - Great ...,3.2,5,,Pattersons,Delicious corned beef gravy is ready when you ...,https://m.media-amazon.com/images/I/71MkAus3+q...,,,Patterson's,,
3,Fruit,Chocolate,B0BN4PW255,Asher's Sugar Free Milk Chocolate Cordial Cher...,5.0,6,29.99,Generic,,,Generic,16.0 Ounce,,Chocolate,
4,,,B06X9DC27H,Messmer Peppermint 25 bags (6er pack),3.5,5,29.99,Messmer,,,Messmer,1.00 Count,,,


In [345]:
print('Number of unique items:',df_sentiment['item'].nunique(),'\n')
df_sentiment['item'].unique()


Number of unique items: 779 



array(['Coffee', 'Chocolate', 'Tomatoes', 'Fruit', nan, 'Energy Drinks',
       'Soups, Stocks & Broths', 'Hot Sauce',
       'Single-Serve Capsules & Pods', 'Oatmeal', 'White',
       'Food Coloring', 'Graham Crackers', 'Candy & Chocolate Gifts',
       'Snack Gifts', 'Sea Salt', 'Candy & Chocolate Bars', 'Vegetable',
       'Sour Candy', 'Stevia', 'Licorice', 'Dried Mixed Fruits',
       'Candy & Chocolate Assortments', 'Sea Vegetables',
       'Chips & Crisps', 'Mexican Seasoning', 'Instant Coffee',
       'Potatoes & Yams', 'Green', 'Bloody Mary', 'Gummy Candy',
       'Iced Tea', 'Ground Pepper', 'Tea Samplers', 'Hard Candy',
       'Herb, Spice & Seasoning Gifts', 'Meat, Poultry & Seafood',
       'Honey', 'Potato', 'Cake Toppers', 'Chewing & Bubble Gum',
       'Ground Coffee', 'Sandwich', 'Mixed Nuts', 'Roasted Coffee Beans',
       'Cakes', 'Mints', 'Matzo', 'Cocoa', 'Cold Cereals', 'Herbal',
       'Ice Cream', 'Cookies', 'Pasta & Noodles', 'Peanuts',
       'Candy & Chocolat

# Transactions Dataset --> for market basket analysis

In [346]:
# load the market basket dataset
# NOTE(review): the preview below shows 'Date' in mixed formats (e.g. '21-07-2015' and
# '5/1/2015'); parse it explicitly before doing any date-based analysis.
df_market_basket = pd.read_csv('/content/Groceries_dataset.csv')
df_market_basket.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,5/1/2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12/12/2015,other vegetables
4,3037,1/2/2015,whole milk


In [347]:
# unique items from the market basket dataset
print('Number of Unique Values in itemDescription', df_market_basket['itemDescription'].nunique())
print('Unique Values in itemDescription', df_market_basket['itemDescription'].unique())

Number of Unique Values in itemDescription 167
Unique Values in itemDescription ['tropical fruit' 'whole milk' 'pip fruit' 'other vegetables' 'rolls/buns'
 'pot plants' 'citrus fruit' 'beef' 'frankfurter' 'chicken' 'butter'
 'fruit/vegetable juice' 'packaged fruit/vegetables' 'chocolate'
 'specialty bar' 'butter milk' 'bottled water' 'yogurt' 'sausage'
 'brown bread' 'hamburger meat' 'root vegetables' 'pork' 'pastry'
 'canned beer' 'berries' 'coffee' 'misc. beverages' 'ham' 'turkey'
 'curd cheese' 'red/blush wine' 'frozen potato products' 'flour' 'sugar'
 'frozen meals' 'herbs' 'soda' 'detergent' 'grapes' 'processed cheese'
 'fish' 'sparkling wine' 'newspapers' 'curd' 'pasta' 'popcorn'
 'finished products' 'beverages' 'bottled beer' 'dessert' 'dog food'
 'specialty chocolate' 'condensed milk' 'cleaner' 'white wine' 'meat'
 'ice cream' 'hard cheese' 'cream cheese ' 'liquor' 'pickled vegetables'
 'liquor (appetizer)' 'UHT-milk' 'candy' 'onions' 'hair spray'
 'photo/film' 'domestic eggs' 

In [348]:
# number of rows in amazon meta dataset BEFORE mapping
df_sentiment.shape[0]

5672

In [349]:
# keeping rows from sentiment dataset which has items from the market_basket data only

# Partial matching (if "Coffee" should match "coffee" or "instant coffee")
def matches_any(item, items_list):
    """Return True if ``item`` partially matches any entry of ``items_list``.

    Matching is case-insensitive substring containment in either direction,
    so "Coffee" matches "coffee" and "Instant Coffee" matches "coffee".

    NOTE(review): plain substring matching also links unrelated labels
    (for example "Graham Crackers" contains "ham"). Matching on whole words
    would be stricter, but it would change which rows are kept.

    Args:
        item: Candidate label, normally a string. Missing values (None or a
            float NaN) and empty strings never match.
        items_list: Iterable of reference labels. Missing or empty entries
            are skipped.

    Returns:
        bool: True as soon as one entry matches, otherwise False.
    """
    def _is_missing(value):
        # NaN is the only float that is not equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    if _is_missing(item):
        # str(NaN) is "nan", which would otherwise match any label containing "nan".
        return False

    item_lower = str(item).lower()
    if not item_lower:
        # "" is a substring of every string and would match everything.
        return False

    for candidate in items_list:
        if _is_missing(candidate):
            continue
        candidate_lower = str(candidate).lower()
        if candidate_lower and (
            candidate_lower in item_lower or item_lower in candidate_lower
        ):
            return True
    return False

# Reference labels taken from the transactions data.
market_basket_items = df_market_basket['itemDescription'].unique()

# Keep only the products whose 'item' partially matches a market-basket label.
# .copy() makes the result an independent frame, so adding columns to it later
# (e.g. 'item_id') does not trigger pandas' SettingWithCopyWarning.
df_sentiment = df_sentiment[
    df_sentiment['item'].apply(lambda x: matches_any(x, market_basket_items))
].copy()

In [350]:
# number of rows in amazon meta dataset AFTER mapping
df_sentiment.shape[0]

1995

In [351]:
df_sentiment['item'].nunique()

220

### Create item_id to connect everything

In [352]:
# create item_id in both the dataset which will be used to link them

# Step 1: Number the distinct item descriptions 0..n-1 in order of first appearance
unique_items = df_market_basket['itemDescription'].unique()
item_to_id = dict(zip(unique_items, range(len(unique_items))))

# Step 2: Attach the numeric id to every transaction row
df_market_basket = df_market_basket.assign(
    item_id=df_market_basket['itemDescription'].map(item_to_id)
)

# Step 3: Apply to df_sentiment with partial matching
def get_item_id(item, item_dict, items_list):
    """Map a product label to the id of the market-basket item it matches.

    Labels are compared case-insensitively with surrounding whitespace
    removed. An exact match always wins. Otherwise the first entry of
    ``items_list`` that contains ``item``, or is contained in it, is used.

    Args:
        item: Product label, normally a string. Missing values (None or a
            float NaN) and empty strings resolve to None.
        item_dict: Mapping from each entry of ``items_list`` to its id.
        items_list: Iterable of market-basket labels, searched in order.
            Missing or empty entries are skipped.

    Returns:
        The id stored in ``item_dict`` for the matched label, or None when
        nothing matches.
    """
    def _normalise(value):
        # Missing values become "", which is treated as "no label" below.
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value).strip().lower()

    item_lower = _normalise(item)
    if not item_lower:
        return None

    first_partial = None
    for basket_item in items_list:
        basket_lower = _normalise(basket_item)
        if not basket_lower:
            continue
        if basket_lower == item_lower:
            # Exact match: prefer it over any earlier partial match
            # (e.g. "Beverages" -> "beverages", not "misc. beverages").
            return item_dict.get(basket_item)
        if first_partial is None and (
            basket_lower in item_lower or item_lower in basket_lower
        ):
            first_partial = basket_item

    if first_partial is None:
        return None
    return item_dict.get(first_partial)

# Resolve every product's 'item' label to a market-basket id; the mapping and
# the label list are forwarded to get_item_id as extra positional arguments.
df_sentiment['item_id'] = df_sentiment['item'].apply(
    get_item_id, args=(item_to_id, unique_items)
)

In [353]:
df_market_basket.head()


Unnamed: 0,Member_number,Date,itemDescription,item_id
0,1808,21-07-2015,tropical fruit,0
1,2552,5/1/2015,whole milk,1
2,2300,19-09-2015,pip fruit,2
3,1187,12/12/2015,other vegetables,3
4,3037,1/2/2015,whole milk,1


In [354]:
print('Unique Number of item_id in market_basket data: ', df_market_basket['item_id'].nunique())
print('Unique Number of item_id in sentiment data: ', df_sentiment['item_id'].nunique())


Unique Number of item_id in market_basket data:  167
Unique Number of item_id in sentiment data:  67


In [355]:
df_sentiment.head(20)

Unnamed: 0,item,category,parent_asin,title,average_rating,rating_number,price,store,description,image_url,brand,unit_count,manufacturer,flavor,color,item_id
0,Coffee,Beverages,B00NE08WM6,Dark Roast Pure Coffee,4.7,9,,Luzianne,,https://m.media-amazon.com/images/I/81ucKSAein...,Luzianne,13.00 Ounce,,,,26
1,Chocolate,Cookies,B084Q13Q5Q,PICARAS Galletas Peruanas Bañadas en Chocolate...,4.5,12,15.99,Winters,2 Bags of PICARAS Galletas Bañadas en Chocolat...,https://m.media-amazon.com/images/I/81-wr7u+ig...,Winters,12.00 Count,Winter's,chocolate,,13
3,Fruit,Chocolate,B0BN4PW255,Asher's Sugar Free Milk Chocolate Cordial Cher...,5.0,6,29.99,Generic,,,Generic,16.0 Ounce,,Chocolate,,0
6,"Soups, Stocks & Broths",Pantry Staples,B002HQF1BI,Chincoteague Seafood 90944 Vegetable Red Crab ...,5.0,2,73.57,Chincoteague Seafood,Vegetable Red Crab Soup is a colorful combinat...,,Chincoteague Seafood,,Chincoteague Seafood,Red Crab,,121
12,White,Dried Grains & Rice,B0BYLK9168,"TAJ Indian Poha Powa Flattened Rice (Thick, 4-...",4.4,203,17.99,TAJ Gourmet Foods,TAJ Gourmet Foods provides the best of quality.,https://m.media-amazon.com/images/I/81gnkkdr8b...,TAJ Gourmet Foods,1.00 Count,TAJ Gourmet Foods,,,55
14,Graham Crackers,Crackers,B01GNI4V3E,"Glendee Toasted Coconut Chips,Honey, 1.41 Ounc...",2.0,2,,Glendee,Coconut chips are a healthy snack with all the...,https://m.media-amazon.com/images/I/51E98OgptJ...,,,Glendee,,,28
16,Candy & Chocolate Gifts,Candy & Chocolate,B00LGZ6H3W,"Strawberry Delights Hard Candy, 3 lb Bag in a ...",3.9,8,14.99,Black Tie Mercantile,About This BlackTie Box: | Introducing the Bla...,https://m.media-amazon.com/images/I/81sAsNh8lY...,,,Black Tie Mercantile,,,13
18,Sea Salt,Salt & Salt Substitutes,B005GTTQMI,"Eden Organic Gomasio, Sesame Seeds and Sea Sal...",4.4,157,9.99,Eden,,https://m.media-amazon.com/images/I/71XkGm3Jpy...,Eden,,Eden Foods,Savory,,71
19,Fruit,Cookies,B01LF70OL6,Newtons Fat Free Soft & Fruit Chewy Fig Cookie...,4.5,1022,4.59,Fig Newtons,Newtons Fat Free Soft and Fruit Chewy Fig Cook...,https://m.media-amazon.com/images/I/81Tex99-ig...,,,Mondelez Int. US,,,0
20,Candy & Chocolate Bars,Chocolate,B07SFZTL7T,Godiva Chocolatier Assorted Chocolate Advent C...,4.1,88,,Godiva Chocolatier,Let the countdown begin this gorgeous chocolat...,https://m.media-amazon.com/images/I/710khhpkRz...,,,AmazonUs/GODMS,,,13


In [356]:
# all different types of coffee are under the same item_id
# The id is looked up by name instead of being hard-coded: ids follow the order in which
# items first appear in the transactions file, so a literal would silently point at a
# different item if that file changed.
coffee_item_id = item_to_id['coffee']
print(df_sentiment[df_sentiment['item_id'] == coffee_item_id])


                      item            category parent_asin  \
0                   Coffee           Beverages  B00NE08WM6   
36          Instant Coffee              Coffee  B01EM498G6   
61           Ground Coffee              Coffee  B00AI13WJW   
67    Roasted Coffee Beans  Whole Coffee Beans  B0091YJHUU   
103         Instant Coffee              Coffee  B00GAT2ZRQ   
...                    ...                 ...         ...   
6044         Ground Coffee              Coffee  B000209K22   
6059                Coffee           Beverages  B07QLYCTHC   
6138        Instant Coffee              Coffee  B0BCMBGFFM   
6185         Ground Coffee              Coffee  B00GXOQUDI   
6213         Ground Coffee              Coffee  B009LI7O2E   

                                                  title  average_rating  \
0                                Dark Roast Pure Coffee             4.7   
36                    DXN Vita Cafe 6 in 1 (Pack of 20)             4.7   
61    Coffee Fool's Ethiopian 

In [357]:
df_sentiment.columns

Index(['item', 'category', 'parent_asin', 'title', 'average_rating',
       'rating_number', 'price', 'store', 'description', 'image_url', 'brand',
       'unit_count', 'manufacturer', 'flavor', 'color', 'item_id'],
      dtype='object')

In [358]:
# creating different dfs to create tables

# product: one row per item_id (the first occurrence is kept)
df_product = (
    df_sentiment[['item_id', 'item', 'category']]
    .drop_duplicates(subset='item_id')
)

# ratings: every row of df_sentiment is kept (no de-duplication)
# add from other reviews dataset using 'parent_asin'
ratings_columns = ['item_id', 'parent_asin', 'average_rating', 'rating_number']
df_ratings = df_sentiment[ratings_columns]

# product_details: one row per item_id (the first occurrence is kept)
# NOTE(review): these are product-level fields (title, price, store, ...), so keeping a
# single row per item_id discards the details of every other product sharing that id,
# and the frame has no 'parent_asin' to join back to ratings/reviews -- confirm this is intended.
detail_columns = [
    'item_id', 'title', 'price', 'store', 'description', 'image_url',
    'brand', 'unit_count', 'manufacturer', 'flavor', 'color',
]
df_product_details = df_sentiment[detail_columns].drop_duplicates(subset='item_id')


### linking via 'item_id'
* product -- ratings
* product -- product_details
* ratings -- product_details

In [359]:
df_product.head()

Unnamed: 0,item_id,item,category
0,26,Coffee,Beverages
1,13,Chocolate,Cookies
3,0,Fruit,Chocolate
6,121,"Soups, Stocks & Broths",Pantry Staples
12,55,White,Dried Grains & Rice


In [360]:
df_ratings.head()

Unnamed: 0,item_id,parent_asin,average_rating,rating_number
0,26,B00NE08WM6,4.7,9
1,13,B084Q13Q5Q,4.5,12
3,0,B0BN4PW255,5.0,6
6,121,B002HQF1BI,5.0,2
12,55,B0BYLK9168,4.4,203


In [361]:
df_product_details.head()

Unnamed: 0,item_id,title,price,store,description,image_url,brand,unit_count,manufacturer,flavor,color
0,26,Dark Roast Pure Coffee,,Luzianne,,https://m.media-amazon.com/images/I/81ucKSAein...,Luzianne,13.00 Ounce,,,
1,13,PICARAS Galletas Peruanas Bañadas en Chocolate...,15.99,Winters,2 Bags of PICARAS Galletas Bañadas en Chocolat...,https://m.media-amazon.com/images/I/81-wr7u+ig...,Winters,12.00 Count,Winter's,chocolate,
3,0,Asher's Sugar Free Milk Chocolate Cordial Cher...,29.99,Generic,,,Generic,16.0 Ounce,,Chocolate,
6,121,Chincoteague Seafood 90944 Vegetable Red Crab ...,73.57,Chincoteague Seafood,Vegetable Red Crab Soup is a colorful combinat...,,Chincoteague Seafood,,Chincoteague Seafood,Red Crab,
12,55,"TAJ Indian Poha Powa Flattened Rice (Thick, 4-...",17.99,TAJ Gourmet Foods,TAJ Gourmet Foods provides the best of quality.,https://m.media-amazon.com/images/I/81gnkkdr8b...,TAJ Gourmet Foods,1.00 Count,TAJ Gourmet Foods,,


# Amazon Reviews Dataset --> for sentiment analysis

In [362]:
# load the reviews dataset for sentiment analysis
# (the file list at the top of this notebook names a '5000_'-prefixed sample file as an
# alternative input; point this path at it to run on the sample instead)
df_sentiment_reviews = pd.read_csv('/content/Reviews_Grocery_and_Gourmet_Food.csv')
df_sentiment_reviews.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,verified_purchase,helpful_vote
0,5.0,Excellent! Yummy!,Excellent!! Yummy! Great with other foods and...,,B00CM36GAQ,B00CM36GAQ,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1587854000000.0,True,0.0
1,5.0,Delicious!!! Yum!,Excellent! The best! I use it with my beef a...,,B074J5WVYH,B0759B7KLH,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1587854000000.0,True,0.0
2,5.0,"Extremely Delicious, but expensive imo",These are very tasty. They are extremely soft ...,,B079TRNVHX,B079TRNVHX,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1587853000000.0,True,1.0
3,5.0,Delicious!,My favorite!,,B07194LN2Z,B07194LN2Z,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1581313000000.0,True,0.0
4,5.0,Great taste,Great for making brownies and crinkle cookies.,,B005CD4196,B005CD4196,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1581313000000.0,True,7.0


In [363]:
df_sentiment_reviews.shape # full data (7082832, 10)

(20011, 10)

In [364]:
df_sentiment_reviews.isna().sum()

Unnamed: 0,0
rating,0
title,0
text,1
images,19439
asin,1
parent_asin,1
user_id,1
timestamp,1
verified_purchase,1
helpful_vote,1


In [365]:
# 'images' is null for almost every review, so the column is dropped
df_sentiment_reviews = df_sentiment_reviews.drop(columns=['images'])
df_sentiment_reviews.columns

Index(['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'verified_purchase', 'helpful_vote'],
      dtype='object')

In [366]:
df_sentiment_reviews['verified_purchase'].unique()

array([True, False, nan], dtype=object)

In [367]:
# Keep only reviews flagged as verified purchases, i.e. written by people who bought
# the product. Rows where the flag is False or missing are dropped.
is_verified = df_sentiment_reviews['verified_purchase'].eq(True)
df_sentiment_reviews = df_sentiment_reviews[is_verified]
df_sentiment_reviews['verified_purchase'].unique()

array([True], dtype=object)

In [368]:
df_sentiment_reviews.shape

(16089, 9)

In [369]:
df_sentiment_reviews.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,timestamp,verified_purchase,helpful_vote
0,5.0,Excellent! Yummy!,Excellent!! Yummy! Great with other foods and...,B00CM36GAQ,B00CM36GAQ,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1587854000000.0,True,0.0
1,5.0,Delicious!!! Yum!,Excellent! The best! I use it with my beef a...,B074J5WVYH,B0759B7KLH,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1587854000000.0,True,0.0
2,5.0,"Extremely Delicious, but expensive imo",These are very tasty. They are extremely soft ...,B079TRNVHX,B079TRNVHX,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1587853000000.0,True,1.0
3,5.0,Delicious!,My favorite!,B07194LN2Z,B07194LN2Z,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1581313000000.0,True,0.0
4,5.0,Great taste,Great for making brownies and crinkle cookies.,B005CD4196,B005CD4196,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1581313000000.0,True,7.0


In [370]:
df_sentiment_reviews.shape

(16089, 9)

In [371]:
df_sentiment_reviews[['parent_asin','asin']].nunique()

Unnamed: 0,0
parent_asin,10506
asin,12028


In [372]:
# figuring out PK for reviews --> user_id + parent_asin + asin

# df_sentiment_reviews1 = df_sentiment_reviews.copy()
# df_sentiment_reviews1['user_id_parent_asin_asin_combine'] = df_sentiment_reviews['user_id'] + '_' + df_sentiment_reviews['parent_asin']+ '_' + df_sentiment_reviews['asin']
# df_sentiment_reviews1['user_id_parent_asin_asin_combine'].nunique()
# df_sentiment_reviews1.head()

In [373]:
# Remove the 'products.db' directory under /content/spark-warehouse if it already exists in Colab
# (presumably so the tables can be rewritten from a clean state -- confirm).
# ignore_errors=True means a missing directory is silently skipped instead of raising.
import shutil
shutil.rmtree('/content/spark-warehouse/products.db', ignore_errors=True)


# Create a dataframe for customers using Faker

In [374]:
!pip install faker



In [375]:
from faker import Faker
import pandas as pd
import random

# Initialize Faker
fake = Faker()

# Fixed seeds so the synthetic customer table is identical on every run
# (Restart & Run All reproduces the same names and ages).
CUSTOMER_SEED = 42
Faker.seed(CUSTOMER_SEED)
random.seed(CUSTOMER_SEED)

# Get unique user_ids from reviews dataset; a missing id cannot identify a customer
unique_user_ids = df_sentiment_reviews['user_id'].dropna().unique()

# Generate one fake profile per unique user_id
# (age is drawn uniformly from 10..100, both ends included)
customer_records = [
    {
        'user_id': user_id,
        'first_name': fake.first_name(),
        'last_name': fake.last_name(),
        'age': random.randint(10, 100),
    }
    for user_id in unique_user_ids
]

# Create the customer DataFrame; explicit columns keep the schema even if there are no users
df_customer = pd.DataFrame(
    customer_records,
    columns=['user_id', 'first_name', 'last_name', 'age'],
)

# View the result
print(df_customer.head())

                        user_id first_name last_name  age
0  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ  Alexandra    Gibson   29
1  AGGZ357AO26RQZVRLGU4D4N52DZQ      Karen    Wilson   74
2  AG2L7H23R5LLKDKLBEF2Q3L2MVDA       Kyle    Burton   87
3  AGCI7FAH4GL5FI65HYLKWTMFZ2CQ    Jeffrey    Berger   42
4  AGXVBIUFLFGMVLATYXHJYL4A5Q7Q      Brian    Bowers   57


### Create a database with one table per dataframe built above

In [376]:
# ALL DATASETS COMBINED INTO ONE DATABASE AS INDIVIDUAL TABLES
from pyspark.sql import SparkSession

# Step 1: Start the Spark session (getOrCreate reuses one if it already exists)
spark = SparkSession.builder.appName("Creating Databases").getOrCreate()

# Step 2: Make sure the target database exists
spark.sql("CREATE DATABASE IF NOT EXISTS Products")

# Step 3: Convert pandas DataFrames to PySpark DataFrames
# Step 4 is applied inline for transactions: 'itemDescription' is renamed to 'item'
# to keep it similar to the products table
df_transaction_spark = (
    spark.createDataFrame(df_market_basket)
    .withColumnRenamed('itemDescription', 'item')
)
df_product_spark = spark.createDataFrame(df_product)
df_ratings_spark = spark.createDataFrame(df_ratings)
df_sentiment_reviews_spark = spark.createDataFrame(df_sentiment_reviews)
df_product_details_spark = spark.createDataFrame(df_product_details)
df_customer_spark = spark.createDataFrame(df_customer)

# # Merge df_sentiment_reviews with df_ratings on 'parent_asin'
# df_reviews_combined = df_ratings.merge(
#     df_sentiment_reviews,
#     on='parent_asin',
#     how='left'  # Use 'inner' if you only want matching rows
# )

# Step 5: Save tables to the Products database (existing tables are overwritten)
tables_to_save = [
    ("Products.transactions", df_transaction_spark),
    ("Products.product", df_product_spark),
    ("Products.ratings", df_ratings_spark),
    ("Products.reviews", df_sentiment_reviews_spark),
    ("Products.product_details", df_product_details_spark),
    ("Products.customer", df_customer_spark),
]
for qualified_name, spark_frame in tables_to_save:
    spark_frame.write.mode("overwrite").saveAsTable(qualified_name)

# Step 6: Verify tables were created
spark.sql("SHOW TABLES IN Products").show()

# Step 7: Query the tables -- print a heading, then the first 10 rows of each table
table_previews = [
    ("=== CUSTOMER TABLE ===", "SELECT * FROM Products.customer LIMIT 10"),
    ("=== PRODUCT TABLE ===", "SELECT * FROM Products.product LIMIT 10"),
    ("\n=== PRODUCT DETAILS TABLE ===", "SELECT * FROM Products.product_details LIMIT 10"),
    ("\n=== RATINGS TABLE ===", "SELECT * FROM Products.ratings LIMIT 10"),
    ("\n=== REVIEWS TABLE ===", "SELECT * FROM Products.reviews LIMIT 10"),
    ("\n=== TRANSACTIONS TABLE ===", "SELECT * FROM Products.transactions LIMIT 10"),
]
for heading, query in table_previews:
    print(heading)
    spark.sql(query).show()


+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
| products|            customer|      false|
| products|             product|      false|
| products|     product_details|      false|
| products|             ratings|      false|
| products|             reviews|      false|
| products|        transactions|      false|
|         |       temp_customer|       true|
|         |        temp_product|       true|
|         |temp_product_details|       true|
|         |        temp_ratings|       true|
|         |        temp_reviews|       true|
|         |   temp_transactions|       true|
+---------+--------------------+-----------+

=== CUSTOMER TABLE ===
+--------------------+----------+---------+---+
|             user_id|first_name|last_name|age|
+--------------------+----------+---------+---+
|AHNTBBOCVJSXTQ7SB...|     Linda|   Keller| 64|
|AGM3YP753ITBWIEJ7...|    Angela|  Mcgrath| 35|
|AEIMSJ6H563BKNO

### CREATE .db file of the created database using sqlite3

In [377]:
import sqlite3
from contextlib import closing

tables = ['customer', 'product', 'product_details', 'ratings', 'reviews', 'transactions']

# Create SQLite database.
# closing() guarantees conn.close() even if an export fails part-way; sqlite3's own
# context manager only commits or rolls back and does not close the connection.
with closing(sqlite3.connect('/content/products.db')) as conn:
    for table in tables:
        # Table names come from the fixed list above, so building the query with an
        # f-string is not an injection risk.
        df_pandas = spark.sql(f"SELECT * FROM Products.{table}").toPandas()
        # if_exists='replace' overwrites a table left over from an earlier export.
        df_pandas.to_sql(table, conn, if_exists='replace', index=False)
        print(f"Exported {table} to products.db")

# Download the database file (in Colab)
from google.colab import files
files.download('/content/products.db')

Exported customer to products.db
Exported product to products.db
Exported product_details to products.db
Exported ratings to products.db
Exported reviews to products.db
Exported transactions to products.db


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>