In [3]:
# package imports
import numpy as np
import matplotlib.pyplot as plt
import requests
import pandas as pd
import os, json


In [17]:
# Load files

# Paths are built from the current working directory; the data folder is
# expected to sit directly underneath it.
base_path = os.getcwd()
print(base_path)

# Single place to change if the data folder is renamed or moved.
data_dir = os.path.join(base_path, 'JCPenneyFile-work-1')

product_csv_path = os.path.join(data_dir, 'products.csv')
users_csv_path = os.path.join(data_dir, 'users.csv')
reviews_csv_path = os.path.join(data_dir, 'reviews.csv')
products_json_path = os.path.join(data_dir, 'jcpenney_products.json')
reviewers_json_path = os.path.join(data_dir, 'jcpenney_reviewers.json')

# The three CSV inputs are read directly into DataFrames.
products_df = pd.read_csv(product_csv_path)
reviews_df = pd.read_csv(reviews_csv_path)
users_df = pd.read_csv(users_csv_path)

# JSON loader as pandas DataFrame *starts*.
# Reads the file line by line; lines that are not valid JSON are reported and skipped (they are not repaired).
def load_json_df(path):
    """Load a line-delimited JSON file into a pandas DataFrame.

    The file is read one line at a time and each line is parsed as its own
    JSON document. Lines that fail to parse are reported on stdout and
    skipped; they are not repaired. Blank lines are ignored silently.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    pandas.DataFrame
        Frame built from the successfully parsed records (empty when no
        line could be parsed).
    """
    data = []
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank separator or trailing newline is not a malformed record.
                continue
            try:
                data.append(json.loads(stripped))
            except json.JSONDecodeError:
                print(f"catching invalid JSON line: {stripped}")
    # `data` is always a list at this point, so one DataFrame construction is
    # enough; the former list/dict branch sat after an unconditional return
    # and could never execute.
    return pd.DataFrame(data)
# *Ends* JSON loader as pandas DataFrame.

reviewers_json_df = load_json_df(reviewers_json_path)
products_json_df = load_json_df(products_json_path)
# Legacy alias: this cell used to rebind `products_json_path` (a file path) to
# the loaded DataFrame. The old binding is kept so any later cell that relies
# on it keeps working; new code should use `products_json_df`.
# TODO: remove once nothing reads the DataFrame through the path name.
products_json_path = products_json_df

# File structure (shapes and columns) lookup
# A label -> frame mapping keeps every label attached to its own DataFrame.
# The previous zip() paired five labels with six frames (reviews_df listed
# twice), so every label after 'reviews' described the wrong frame and the
# products JSON was never printed.
frames_by_name = {
    'products': products_df,
    'reviews': reviews_df,
    'users': users_df,
    'reviewers': reviewers_json_df,
    'jc_products': products_json_df,
}
for name, df in frames_by_name.items():
    print(f"\n{name} DataFrame: shape={df.shape}")
    print("Columns:", df.columns.tolist())
    print("Head:\n", df.head(13))


/Users/rinold/Documents/assignment-works/3079158-ITNPBD2-python

products DataFrame: shape=(7982, 6)
Columns: ['Uniq_id', 'SKU', 'Name', 'Description', 'Price', 'Av_Score']
Head:
                              Uniq_id           SKU  \
0   b6c0b6bea69c722939585baeac73c13d  pp5006380337   
1   93e5272c51d8cce02597e3ce67b7ad0a  pp5006380337   
2   013e320f2f2ec0cf5b3ff5418d688528  pp5006380337   
3   505e6633d81f2cb7400c0cfa0394c427  pp5006380337   
4   d969a8542122e1331e304b09f81a83f6  pp5006380337   
5   cf73bb2bd93bbd6e1bdf48d399992270  pp5006380337   
6   8ffd0ef4fcaf1a82fb514aba5d20e05b  pp5006790247   
7   4d9337e3c8f974d3c420cdc5c58b3fc3  pp5007090172   
8   44f8f8f108c6856acf9630dd1d78516d  pp5007080134   
9   8d1d057f5f808c10ce243c222ab0ef6e  pp5007080134   
10  f3e02c48f16b56e8c1f126c8fe762812  pp5007080134   
11  5abf9d28e9e0404369ece10807d99d0e  pp5006790242   
12  82d8a9a627e55ba97a1051068c9823e7  pp5006790242   

                                                 Name  \
0     

# Deep dive into NumPy



In [61]:
# 1. Minimal robust loader: lists files and reads CSV/JSON safely
import os, json
import pandas as pd

cwd = os.getcwd()
print(cwd)

# Candidate input files, all expected inside the JCPenneyFile-work-1 folder
# under the current working directory.
data_file_names = ["products.csv","reviews.csv","users.csv","jcpenney_products.json","jcpenney_reviewers.json"]
file_list = [cwd + "/JCPenneyFile-work-1/" + file_name for file_name in data_file_names]

# Partition the candidates, preserving their order, into those present on
# disk and those that are not.
available_file_list = []
missing_file_list = []
for candidate in file_list:
    if os.path.exists(candidate):
        available_file_list.append(candidate)
    else:
        missing_file_list.append(candidate)

print("available_file_list files:", available_file_list)
print("missing_file_list files:", missing_file_list)

def safe_read(path):
    """Read a CSV or line-delimited JSON file into a DataFrame.

    Parameters
    ----------
    path : str
        File to read. The format is chosen from the suffix: ``.csv`` is read
        with ``pd.read_csv``; ``.json`` is parsed one line at a time, each
        line being its own JSON document.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` (after printing a notice) when `path`
        does not exist.

    Raises
    ------
    ValueError
        If `path` exists but ends in neither ``.csv`` nor ``.json``.
    """
    # Existence is checked before the format, so a missing file with an
    # unsupported suffix still yields None rather than ValueError.
    if not os.path.exists(path):
        print(f"File not found: {path}")
        return None
    if path.endswith(".csv"):
        return pd.read_csv(path)
    if path.endswith(".json"):
        records = []
        with open(path, "r", encoding="utf-8") as fh:
            for line in fh:
                text = line.strip()
                if not text:
                    # A blank separator or trailing newline is not a bad record.
                    continue
                try:
                    records.append(json.loads(text))
                except json.JSONDecodeError:
                    print(f"Skipping invalid JSON line: {text}")
        # `records` is always a list, so the former list/dict branch (which sat
        # after an unconditional return) was unreachable and has been dropped.
        return pd.DataFrame(records)
    raise ValueError("Unsupported format: " + path)

# Load datasets
# Index `file_list` (fixed order) rather than `available_file_list`: the
# latter shrinks when a file is missing, which shifts every position and
# loads the wrong file (or raises IndexError). safe_read() prints a notice
# and returns None for a path that does not exist.
products = safe_read(file_list[0])
reviews = safe_read(file_list[1])
users = safe_read(file_list[2])
products_extra = safe_read(file_list[3])
# The reviewers JSON gets its own name. It was previously assigned to
# `reviews`, which silently discarded the reviews.csv frame loaded above.
reviewers = safe_read(file_list[4])
print(reviewers)





/Users/rinold/Documents/assignment-works/3079158-ITNPBD2-python
available_file_list files: ['/Users/rinold/Documents/assignment-works/3079158-ITNPBD2-python/JCPenneyFile-work-1/products.csv', '/Users/rinold/Documents/assignment-works/3079158-ITNPBD2-python/JCPenneyFile-work-1/reviews.csv', '/Users/rinold/Documents/assignment-works/3079158-ITNPBD2-python/JCPenneyFile-work-1/users.csv', '/Users/rinold/Documents/assignment-works/3079158-ITNPBD2-python/JCPenneyFile-work-1/jcpenney_products.json', '/Users/rinold/Documents/assignment-works/3079158-ITNPBD2-python/JCPenneyFile-work-1/jcpenney_reviewers.json']
missing_file_list files: []
      Username         DOB          State  \
0     bkpn1412  31.07.1983         Oregon   
1     gqjs4414  27.07.1998  Massachusetts   
2     eehe1434  08.08.1950          Idaho   
3     hkxj1334  03.08.1969        Florida   
4     jjbd1412  26.07.2001        Georgia   
...        ...         ...            ...   
4995  mfnn1212  27.07.1997       Delaware   
499