# SpecialTopic_HM01
Final notebook with dataset downloader and **all 15 completed visualization scenarios**.

In [None]:

import io
import tempfile
import zipfile

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

# Download the Yelp archive, unpack it, and load the three tables every
# scenario below relies on (`business`, `review`, `user`).
url = "https://business.yelp.com/external-assets/files/Yelp-JSON.zip"
headers = {'User-Agent': 'Mozilla/5.0'}

print("Downloading Yelp dataset... (This may take 1–3 minutes)")
# Spool the archive to a temporary file in 1 MiB chunks. Reading
# `resp.content` would hold the whole archive in memory (and BytesIO would
# copy it again), which defeats `stream=True`.
with tempfile.TemporaryFile() as archive:
    # timeout=(connect, read) so a stalled connection fails instead of hanging.
    with requests.get(url, stream=True, headers=headers, timeout=(10, 300)) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=1 << 20):
            archive.write(chunk)
    archive.seek(0)
    with zipfile.ZipFile(archive) as z:
        z.extractall("yelp_dataset")
print("Extracted ✓")

# NOTE(review): the reads below assume the JSON files sit directly inside
# "yelp_dataset/" after extraction — confirm against the archive layout.
path="yelp_dataset/"

# Each file is newline-delimited JSON (one record per line), hence lines=True.
business = pd.read_json(path+"yelp_academic_dataset_business.json", lines=True)
review = pd.read_json(path+"yelp_academic_dataset_review.json", lines=True)
user = pd.read_json(path+"yelp_academic_dataset_user.json", lines=True)

sns.set(style="whitegrid")


## Scenario 1

In [None]:
# Scenario 1 — Line Plot
# Count reviews per calendar month and plot the series over time.
review['date'] = pd.to_datetime(review['date'])
monthly = (
    review.groupby(review['date'].dt.to_period('M'))
    .size()
    .reset_index(name='count')
)
# Keep x as real timestamps (first day of each month) rather than strings:
# string labels become a categorical axis with one tick per month, which
# overlap; a datetime axis lets matplotlib pick sparse date ticks.
monthly['date'] = monthly['date'].dt.to_timestamp()

plt.figure(figsize=(12,5))
sns.lineplot(data=monthly, x='date', y='count')
plt.xticks(rotation=45)
plt.title("Monthly Review Count")
plt.show()

## Scenario 2

In [None]:
# Scenario 2 — Scatter Plot
# Star rating against review count, with review counts on a log axis.
df = business[['review_count','stars']].dropna()
fig, ax = plt.subplots(figsize=(8,5))
sns.scatterplot(data=df, x='review_count', y='stars', ax=ax)
ax.set_xscale("log")
ax.set_title("Review Count vs Stars")
plt.show()

## Scenario 3

In [None]:
# Scenario 3 — Rel Plot
# One scatter panel per city, comparing review count with star rating.
# NOTE(review): confirm both city names occur in the downloaded dataset.
selected_cities = ['Phoenix','Charlotte']
in_selected_city = business['city'].isin(selected_cities)
cities = business[in_selected_city]
sns.relplot(kind='scatter', data=cities, x='review_count', y='stars', col='city')
plt.show()

## Scenario 4

In [None]:
# Scenario 4 — Bar Plot
# Mean star rating per price level, where the price level is read from the
# 'RestaurantsPriceRange2' entry of each business's attributes dict.
business_price = business[['stars','attributes']].dropna().copy()
business_price['price'] = business_price['attributes'].apply(
    lambda x: x.get('RestaurantsPriceRange2') if isinstance(x, dict) else None
)
# Coerce instead of casting directly: astype(int) raises on any value that is
# not an integer literal (for example a textual "None" placeholder —
# presumably present in this dataset; verify). Unparseable values become NaN
# and are dropped together with the missing ones.
business_price['price'] = pd.to_numeric(business_price['price'], errors='coerce')
business_price = business_price.dropna(subset=['price'])
business_price['price'] = business_price['price'].astype(int)

plt.figure(figsize=(7,5))
sns.barplot(data=business_price, x='price', y='stars')
plt.title("Mean Stars by Price Level")
plt.show()

## Scenario 5

In [None]:
# Scenario 5 — Count Plot
# Number of reviews at each star value.
fig, ax = plt.subplots(figsize=(8,5))
sns.countplot(data=review, x='stars', ax=ax)
ax.set_title("Distribution of Review Stars")
plt.show()

## Scenario 6

In [None]:
# Scenario 6 — Catplot
# Review-star counts, one panel per cuisine. A business is labelled with the
# first of the listed cuisines that appears in its category string.
cats = ["Italian","Mexican","Chinese"]
cat_pattern = "|".join(cats)

is_selected = business['categories'].astype(str).str.contains(cat_pattern)
# Label each business once, before the merge, so the regex runs over the
# business rows instead of over every merged review row. `assign` returns a
# new frame, so the filtered slice of `business` is never written to.
biz_cat = business[is_selected].assign(
    cat=lambda frame: frame['categories'].str.extract(f"({cat_pattern})", expand=False)
)

# Only the key and the two label columns are merged in, so the review's own
# 'stars' column is not duplicated by the business-level 'stars'.
merged = review.merge(biz_cat[['business_id','categories','cat']], on='business_id')

sns.catplot(data=merged, x='stars', col='cat', kind='count')
plt.show()

## Scenario 7

In [None]:
# Scenario 7 — Distplot (Histogram + KDE)
# Distribution of per-user review counts inside the displayed 0–200 window.
review_count_range = (0, 200)

# Restrict the data to the displayed window before plotting. Otherwise the
# bins and the KDE grid are laid out over the full data range and only then
# cropped by xlim, so any values far above 200 (presumably present — the axis
# is cropped for a reason) leave the visible part coarse.
visible_counts = user.loc[
    user['review_count'].between(*review_count_range), 'review_count'
]

plt.figure(figsize=(8,5))
# 40 bins over 0–200 gives a fixed bin width of 5 reviews.
sns.histplot(visible_counts, kde=True, bins=40, binrange=review_count_range)
plt.xlim(*review_count_range)
plt.title("User Review Count Distribution")
plt.show()

## Scenario 8

In [None]:
# Scenario 8 — KDE Plot
# Filled kernel density estimate of business star ratings.
fig, ax = plt.subplots(figsize=(7,5))
sns.kdeplot(data=business, x='stars', fill=True, ax=ax)
ax.set_title("KDE of Business Stars")
plt.show()

## Scenario 9

In [None]:
# Scenario 9 — Swarm Plot
# Star ratings per category. A business is labelled with the first of the
# listed categories that appears in its category string.
cats = ['Restaurants','Shopping','Hotels']
cat_pattern = "|".join(cats)

# .copy() so the new column is written to an independent frame, not to a
# filtered slice of `business` (which triggers SettingWithCopyWarning).
biz_sel = business[business['categories'].astype(str).str.contains(cat_pattern)].copy()
biz_sel['cat'] = biz_sel['categories'].str.extract(f"({cat_pattern})", expand=False)

# A swarm plot positions every point so that none overlap, which does not
# scale to large inputs. Plot a fixed-seed random sample so the figure renders
# in reasonable time and is reproducible.
swarm_max_points = 1000
swarm_sample = biz_sel.sample(n=min(len(biz_sel), swarm_max_points), random_state=0)

plt.figure(figsize=(10,6))
sns.swarmplot(data=swarm_sample, x='cat', y='stars')
plt.title("Stars by Business Category")
plt.show()

## Scenario 10

In [None]:
# Scenario 10 — Strip Plot
# Star ratings of one city's businesses, grouped by postal code.
city="Phoenix"
biz_ph = business[business['city']==city].dropna(subset=['postal_code'])

plt.figure(figsize=(14,5))
sns.stripplot(data=biz_ph, x='postal_code', y='stars', jitter=True)
plt.xticks(rotation=90)
# Build the title from `city` so it cannot drift from the filter above.
plt.title(f"Stars by Postal Code in {city}")
plt.show()

## Scenario 11

In [None]:
# Scenario 11 — Box Plot
# Review-count distribution per category on a log scale. A business is
# labelled with the first of the listed categories found in its category
# string.
cats = ['Restaurants','Shopping','Hotels','Beauty & Spas']
cat_pattern = "|".join(cats)

# .copy() so the new column is written to an independent frame, not to a
# filtered slice of `business` (which triggers SettingWithCopyWarning).
biz_sel = business[business['categories'].astype(str).str.contains(cat_pattern)].copy()
biz_sel['cat'] = biz_sel['categories'].str.extract(f"({cat_pattern})", expand=False)

plt.figure(figsize=(10,6))
sns.boxplot(data=biz_sel, x='cat', y='review_count')
plt.yscale("log")
plt.title("Review Count by Category")
plt.show()

## Scenario 12

In [None]:
# Scenario 12 — Pair Plot
# Pairwise relationships between four user-level columns; rows with a missing
# value in any of them are dropped first.
pair_columns = ['review_count','useful','fans','average_stars']
df = user[pair_columns].dropna()
sns.pairplot(df)
plt.show()

## Scenario 13

In [None]:
# Scenario 13 — Reg Plot
# Regression of star rating on review count, shown on a log x-axis.
df = business[['review_count','stars']].dropna()
# logx needs strictly positive x values, as does the log axis itself.
df = df[df['review_count'] > 0]

plt.figure(figsize=(8,5))
# logx=True fits stars ~ log(review_count). A plain linear-in-x fit drawn on
# a log-scaled axis appears as a curve and does not match the view being shown.
sns.regplot(data=df, x='review_count', y='stars', logx=True, scatter_kws={'s':5})
plt.xscale("log")
plt.title("Regression: Review Count vs Stars")
plt.show()

## Scenario 14

In [None]:
# Scenario 14 — Joint Plot
# Joint scatter of review count and stars with marginal distributions.
joint_columns = ['review_count','stars']
df = business[joint_columns]
sns.jointplot(kind='scatter', data=df, x='review_count', y='stars')
plt.show()

## Scenario 15

In [None]:
# Scenario 15 — Heatmap
# Correlation matrix of five user-level columns, annotated, with the colour
# scale pinned to [-1, 1].
feature_columns = ['review_count','useful','funny','cool','fans']
df = user[feature_columns].dropna()
corr = df.corr()

fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("User Feature Correlation Heatmap")
plt.show()