In [1]:
# Data_analysis

import pandas as pd
import numpy as np
from scipy import stats

# Books Data Analysis
# NOTE: books_df is not created in this cell; it has to exist in the kernel already.

print("\n Books Data Analysis:")

# Translate the spelled-out star ratings ("One" .. "Five") into the integers 1-5
if "rating" in books_df.columns:
    rating_map = dict(zip(("One", "Two", "Three", "Four", "Five"), range(1, 6)))
    books_df["rating_num"] = books_df["rating"].map(rating_map)

# Convert stock to numeric
def parse_stock(stock_value):
    """Convert a raw stock/availability value into a number.

    Rules, applied in this order:

    * missing value (``pd.isna``)                      -> ``np.nan``
    * non-string value                                 -> returned unchanged
    * text containing "out of stock"                   -> ``0``
    * text containing an integer quantity, e.g.
      "In stock (22 available)" or "5 left"            -> that integer
    * text containing "in stock" without a quantity    -> ``1``
    * anything else                                    -> ``np.nan``

    Parameters
    ----------
    stock_value : object
        A single cell of the raw stock column.

    Returns
    -------
    int, float, or the input itself
        Numeric stock level as described above.
    """
    if pd.isna(stock_value):
        return np.nan
    if not isinstance(stock_value, str):
        return stock_value

    text = stock_value.lower()
    if "out of stock" in text:
        return 0

    # Look for an explicit quantity before falling back to the plain
    # "in stock" flag; otherwise the quantity would never be reached for
    # strings such as "In stock (22 available)".
    for token in text.split():
        # Strip surrounding punctuation so that "(22" or "22," still counts.
        candidate = token.strip("()[]{}<>,.;:")
        if candidate.isdecimal():
            return int(candidate)

    if "in stock" in text:
        return 1
    return np.nan

# Derive the numeric stock column only when the raw "stock" column is present
if "stock" in books_df:
    books_df["stock_num"] = books_df["stock"].map(parse_stock)

# Price descriptive stats
if "price" in books_df.columns:
    col = books_df["price"].dropna()
    # mode() can return several values on ties (or none for an empty column);
    # compute it once and report the first entry.
    price_modes = col.mode()
    # Cast to built-in float so the dict prints as plain numbers instead of
    # NumPy scalar reprs such as np.float64(35.07) under NumPy 2.
    desc_stats = {
        "mean": round(float(col.mean()), 2),
        "median": round(float(col.median()), 2),
        "mode": round(float(price_modes.iloc[0]), 2) if not price_modes.empty else None,
        "std": round(float(col.std()), 2)
    }
    print(" Descriptive stats for price:\n", desc_stats, "\n")

    # Outlier detection (Tukey fences: 1.5 * IQR beyond the quartiles)
    if col.empty:
        # Percentiles have no meaningful value for an empty sample, so
        # report an empty outlier table instead of calling np.percentile.
        outliers_books = books_df.iloc[0:0]
    else:
        q1, q3 = np.percentile(col, [25, 75])
        iqr = q3 - q1
        lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
        outliers_books = books_df[(books_df["price"] < lower) | (books_df["price"] > upper)]
    print(" Outliers in books prices:\n", outliers_books[["price"]].head(), "\n")

# How often each (textual) rating occurs
if "rating" in books_df.columns:
    freq_rating = books_df.loc[:, "rating"].value_counts()
    print(" Frequency distribution for rating:\n", freq_rating, "\n")

# Pairwise correlations between whichever numeric columns are available
num_cols = [
    name
    for name in ("price", "rating_num", "stock_num")
    if name in books_df.columns
]
if len(num_cols) >= 2:
    corr_matrix = books_df.loc[:, num_cols].corr().round(decimals=4)
    print("Correlation matrix:\n", corr_matrix, "\n")

# Hypothesis test: Fiction vs Nonfiction
# Both columns are needed; checking only "category" would raise KeyError
# on "price" when that column is absent.
if {"category", "price"}.issubset(books_df.columns):
    g1 = books_df.loc[books_df["category"] == "Fiction", "price"].dropna()
    g2 = books_df.loc[books_df["category"] == "Nonfiction", "price"].dropna()

    # Check if we have enough data points (need at least 2 per group)
    if len(g1) > 1 and len(g2) > 1:
        # Welch's t-test: equal_var=False does not assume the two groups
        # share the same variance.
        t_stat, p_val = stats.ttest_ind(g1, g2, equal_var=False)

        # float() keeps the printed dict free of NumPy scalar reprs.
        print("Hypothesis Test (Fiction vs Non-Fiction):",
              {"t_stat": round(float(t_stat), 4), "p_val": round(float(p_val), 4)}, "\n")
    else:
        print(" Not enough data for hypothesis test between Fiction and Nonfiction\n")


#  E-commerce site Data Analysis
# NOTE: shop_df is not created in this cell; it has to exist in the kernel already.

print("\n E-commerce site Data Analysis:")

# Price descriptive stats
if "price" in shop_df.columns:
    col = shop_df["price"].dropna()
    # mode() can return several values on ties (or none for an empty column);
    # compute it once and report the first entry.
    price_modes = col.mode()
    # Cast to built-in float so the dict prints as plain numbers instead of
    # NumPy scalar reprs such as np.float64(35.07) under NumPy 2.
    desc_stats = {
        "mean": round(float(col.mean()), 2),
        "median": round(float(col.median()), 2),
        "mode": round(float(price_modes.iloc[0]), 2) if not price_modes.empty else None,
        "std": round(float(col.std()), 2)
    }
    print(" Descriptive stats for price:\n", desc_stats, "\n")

    # Outlier detection (Tukey fences: 1.5 * IQR beyond the quartiles)
    if col.empty:
        # Percentiles have no meaningful value for an empty sample, so
        # report an empty outlier table instead of calling np.percentile.
        outliers_shop = shop_df.iloc[0:0]
    else:
        q1, q3 = np.percentile(col, [25, 75])
        iqr = q3 - q1
        lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
        outliers_shop = shop_df[(shop_df["price"] < lower) | (shop_df["price"] > upper)]
    print(" Outliers in e-commerce site product prices:\n", outliers_shop[["price"]].head(), "\n")

# Product counts per category, then per subcategory inside each category
if "category" in shop_df.columns:
    freq_cat = shop_df["category"].value_counts()
    print(" Frequency distribution for category:\n", freq_cat, "\n")

    # Subcategory breakdown, listed in the same order as the category counts
    if "subcategory" in shop_df.columns:
        for category in freq_cat.index:
            in_category = shop_df["category"] == category
            subcats = shop_df.loc[in_category, "subcategory"].value_counts()
            lines = [f"{category}:"]
            lines.extend(f"  {subcat}: {count}" for subcat, count in subcats.items())
            # The doubled newline reproduces the blank separator line between categories
            print("\n".join(lines), end="\n\n")

# RSS Data Analysis
# NOTE: rss_df is not created in this cell; it has to exist in the kernel already.

print("\n RSS Data Analysis:")

if "date" in rss_df.columns:
    # Unparseable dates become NaT (errors="coerce") and the column is overwritten in place
    parsed_dates = pd.to_datetime(rss_df["date"], errors="coerce")
    rss_df["date"] = parsed_dates
    # Count entries per calendar day, in chronological order
    per_day = rss_df["date"].dt.date.value_counts()
    freq_date = per_day.sort_index()
    print(" Frequency distribution by date:\n", freq_date, "\n")


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Ensure rating_num exists
if 'rating' in books_df.columns:
    rating_map = {"One":1, "Two":2, "Three":3, "Four":4, "Five":5}
    books_df['rating_num'] = books_df['rating'].map(rating_map)

# Prepare data: keep only rows where BOTH the predictor and the target are
# present. Filtering on rating_num alone can leave NaN prices in y, which
# LinearRegression.fit rejects.
reg_cols = ['rating_num', 'price']
if all(c in books_df.columns for c in reg_cols):
    reg_data = books_df[reg_cols].dropna()
else:
    reg_data = books_df.iloc[0:0]

# train_test_split needs at least one row on each side of the split.
if len(reg_data) >= 2:
    X = reg_data[['rating_num']]
    y = reg_data['price']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from newer scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    print(" Linear Regression: Price ~ Rating")
    print(f"Coefficient: {model.coef_[0]:.4f}")
    print(f"Intercept: {model.intercept_:.4f}")
    print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
    print(f"RMSE: {rmse:.4f}")
else:
    print(" Not enough data for linear regression of price on rating\n")

# Mean book price for every category, highest average first
avg_price_cat = (
    books_df.groupby("category")["price"]
    .mean()
    .sort_values(ascending=False)
)
print("\n Average Price per Category:", avg_price_cat, sep="\n")

# Boxplot for category pricing patterns
import matplotlib.pyplot as plt
import seaborn as sns

# One box per category showing the spread of book prices
plt.figure(figsize=(10, 6))
sns.boxplot(x="category", y="price", data=books_df).set(
    title="Book Prices by Category",
    xlabel="Category",
    ylabel="Price",
)
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

# Simple similarity: Recommend books with similar category and rating
def recommend_books(title, n=5, books=None):
    """Recommend books from the same category with the closest rating.

    Parameters
    ----------
    title : str
        Exact title of the reference book.
    n : int, default 5
        Maximum number of recommendations to return.
    books : pandas.DataFrame, optional
        Catalogue to search. Must provide the columns ``title``,
        ``category``, ``price``, ``rating`` and ``rating_num``.
        Defaults to the module-level ``books_df``.

    Returns
    -------
    pandas.DataFrame or str
        Up to ``n`` rows (columns ``title``, ``category``, ``price``,
        ``rating``) ordered by increasing rating distance, or the message
        ``"Book '<title>' not found."`` when the title is not in the catalogue.
    """
    # The title comes from the caller; prompting with input() here would
    # discard the argument and block non-interactive runs.
    catalog = books_df if books is None else books
    if title not in catalog['title'].values:
        return f"Book '{title}' not found."

    book = catalog[catalog['title'] == title].iloc[0]

    # Drop the reference book BEFORE ranking so that exactly n other books
    # can be returned, regardless of how ties in rating distance are ordered.
    same_category = catalog['category'] == book['category']
    candidates = catalog[same_category & (catalog['title'] != title)].copy()
    candidates['rating_diff'] = (candidates['rating_num'] - book['rating_num']).abs()

    # A stable sort keeps catalogue order among equally distant books.
    recommendations = candidates.sort_values('rating_diff', kind='stable').head(n)
    return recommendations[['title', 'category', 'price', 'rating']]

# Demo call: pass the first title in books_df to recommend_books and print the result
print("\n Recommendations for a book:")
print(recommend_books(title=books_df["title"].iat[0]))


# Ensure stock_num exists
def parse_stock(stock_value):
    """Convert a raw stock/availability value into a number.

    Rules, applied in this order:

    * missing value (``pd.isna``)                      -> ``np.nan``
    * non-string value                                 -> returned unchanged
    * text containing "out of stock"                   -> ``0``
    * text containing an integer quantity, e.g.
      "In stock (22 available)" or "5 left"            -> that integer
    * text containing "in stock" without a quantity    -> ``1``
    * anything else                                    -> ``np.nan``

    Parameters
    ----------
    stock_value : object
        A single cell of the raw stock column.

    Returns
    -------
    int, float, or the input itself
        Numeric stock level as described above.
    """
    if pd.isna(stock_value):
        return np.nan
    if not isinstance(stock_value, str):
        return stock_value

    text = stock_value.lower()
    if "out of stock" in text:
        return 0

    # Look for an explicit quantity before falling back to the plain
    # "in stock" flag; otherwise the quantity would never be reached for
    # strings such as "In stock (22 available)".
    for token in text.split():
        # Strip surrounding punctuation so that "(22" or "22," still counts.
        candidate = token.strip("()[]{}<>,.;:")
        if candidate.isdecimal():
            return int(candidate)

    if "in stock" in text:
        return 1
    return np.nan

# (Re)compute the numeric stock column only when the raw column is present;
# indexing books_df['stock'] unconditionally raises KeyError otherwise.
if 'stock' in books_df.columns:
    books_df['stock_num'] = books_df['stock'].apply(parse_stock)

# The column check has to come before the plot, not after it, because the
# scatter plot already needs both columns.
if {'stock_num', 'price'}.issubset(books_df.columns):
    # Scatter plot: stock vs price
    plt.figure(figsize=(8,6))
    sns.scatterplot(
        data=books_df, x='stock_num', y='price',
        # Colour by category only when that column is available.
        hue='category' if 'category' in books_df.columns else None,
    )
    plt.title("Stock Availability vs Price")
    plt.xlabel("Stock (numeric)")
    plt.ylabel("Price")
    plt.show()

    # Correlation
    corr_stock_price = books_df[['stock_num','price']].corr().iloc[0,1]
    if pd.isna(corr_stock_price):
        # Pearson correlation is NaN when a column is constant or there are
        # too few complete rows; say so instead of printing "nan".
        print("\n Correlation between Stock and Price: undefined (too few rows or no variation)")
    else:
        print(f"\n Correlation between Stock and Price: {corr_stock_price:.4f}")
else:
    print("\n Stock/price columns not available; skipping stock vs price analysis")
    


 Books Data Analysis:


NameError: name 'books_df' is not defined