In [1]:
import subprocess
import sys
import warnings
import os
import zipfile
import shutil


def silent_install(package):
    """Run ``pip install <package>`` with the current interpreter, hiding all output.

    Both stdout and stderr of pip are discarded and the exit status is not
    inspected, so a failed installation goes unnoticed here.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.run(
        pip_command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

# Third-party packages installed before the imports further down run.
REQUIRED_PACKAGES = ("wordcloud", "nltk", "vaderSentiment", "textblob", "kaggle")
for pkg in REQUIRED_PACKAGES:
    silent_install(pkg)

# Ignore every FutureWarning raised from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)
    
import nltk
# Fetch the NLTK stop-word corpus (quiet=True suppresses the download log);
# stopwords.words('english') is used by clean_text further down.
nltk.download('stopwords', quiet=True)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import string
from collections import Counter
import streamlit as st


# Function that loads a DataFrame from Kaggle
def load_kaggle_dataset(dataset_slug: str, file_name: str, extract_to="data", force_download=False):
    """Download a Kaggle dataset archive, unpack it and load one file as a DataFrame.

    The archive is looked up (and downloaded) in the current working directory
    as ``<last part of dataset_slug>.zip``. Download and extraction are skipped
    when their results are already present, unless ``force_download`` is set.

    Args:
        dataset_slug: Kaggle dataset identifier such as ``owner/dataset``.
        file_name: Path of the file to load, relative to ``extract_to``.
            Must end in ``.csv`` or ``.xlsx`` (checked case-insensitively).
        extract_to: Directory the archive is extracted into.
        force_download: When true, remove the existing archive and the
            extraction directory first so both steps run again.

    Returns:
        The requested file parsed with ``pd.read_csv`` or ``pd.read_excel``.

    Raises:
        ValueError: ``file_name`` has an unsupported extension. Raised before
            anything is deleted, downloaded or extracted.
        subprocess.CalledProcessError: the ``kaggle`` command exited with a
            non-zero status.
    """
    # Validate first so a bad file name fails before any side effect happens.
    lowered_name = file_name.lower()
    if lowered_name.endswith(".xlsx"):
        reader = pd.read_excel
    elif lowered_name.endswith(".csv"):
        reader = pd.read_csv
    else:
        raise ValueError("–§–∞–π–ª –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å .csv –∏–ª–∏ .xlsx")

    zip_path = f"{dataset_slug.split('/')[-1]}.zip"
    full_path = os.path.join(extract_to, file_name)

    if force_download:
        if os.path.exists(zip_path):
            os.remove(zip_path)
            print("‚ôªÔ∏è –£–¥–∞–ª—ë–Ω —Å—Ç–∞—Ä—ã–π ZIP-—Ñ–∞–π–ª.")
        if os.path.exists(extract_to):
            shutil.rmtree(extract_to)
            print("‚ôªÔ∏è –£–¥–∞–ª–µ–Ω–∞ —Å—Ç–∞—Ä–∞—è —Ä–∞—Å–ø–∞–∫–æ–≤–∞–Ω–Ω–∞—è –ø–∞–ø–∫–∞.")

    if not os.path.exists(zip_path):
        print("‚¨áÔ∏è Downloading dataset from Kaggle...")
        # Argument list instead of a shell string: the slug is never parsed by
        # a shell, and check=True surfaces a failed download right here rather
        # than as a missing-ZIP error during extraction.
        subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset_slug],
            check=True,
        )
    else:
        print("‚úÖ ZIP-—Ñ–∞–π–ª —É–∂–µ —Å—É—â–µ—Å—Ç–≤—É–µ—Ç, –ø—Ä–æ–ø—É—Å–∫–∞–µ–º —Å–∫–∞—á–∏–≤–∞–Ω–∏–µ.")

    # Extract whenever the requested file is absent, not only when the folder
    # is: a stale or partially filled folder would otherwise make the read fail.
    if not os.path.exists(full_path):
        print("üì¶ Extracting dataset...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_to)
    else:
        print("üìÅ –ü–∞–ø–∫–∞ —É–∂–µ —Ä–∞—Å–ø–∞–∫–æ–≤–∞–Ω–∞, –ø—Ä–æ–ø—É—Å–∫–∞–µ–º.")

    # Load the Excel/CSV file
    df = reader(full_path)

    print("‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –≤ DataFrame.")
    return df

# Load the dataset inside the Jupyter Notebook
# force_download=True removes any cached archive/folder and fetches again.
df = load_kaggle_dataset(
    "summerburd/stephen-king-books-and-adaptations",
    "primary.xlsx",
    force_download=True,
)


# Prepare the annotations for the word cloud
# Built once instead of on every call: clean_text runs once per DataFrame row,
# and neither the stop-word set nor the punctuation table depends on the input.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize *text* for word-frequency analysis.

    The text is lower-cased, ASCII punctuation (``string.punctuation``) is
    removed, and the result is split on whitespace. Words that are English
    stop words or shorter than three characters are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining words joined by single spaces.
    """
    tokens = text.lower().translate(_PUNCTUATION_TABLE).split()
    return " ".join(
        token
        for token in tokens
        if token not in _ENGLISH_STOP_WORDS and len(token) > 2
    )
# Clean every annotation (missing values are stringified by astype first),
# then keep only the rows whose original annotation is present.
annotation_as_text = df["Annotation"].astype(str)
df["Cleaned_Annotation"] = annotation_as_text.apply(clean_text)
df = df.dropna(subset=["Annotation"])
text_annotation = " ".join(df["Cleaned_Annotation"])

# Word-frequency analysis of the annotations
cleaned_annotations = df["Cleaned_Annotation"].dropna().astype(str)
text_annotation = " ".join(cleaned_annotations)
words = text_annotation.split()
filtered_words = list(filter(lambda word: len(word) > 2, words))
annotation_word_counts = Counter(filtered_words)
common_words = annotation_word_counts.most_common(20)
df_common = pd.DataFrame(common_words, columns=["Word", "Frequency"])

# Prepare the genres for the word cloud
# NOTE: this re-defines clean_text with the same logic as the definition
# earlier in this file.
# The stop-word set and punctuation table are built once rather than on every
# call, because clean_text runs once per DataFrame row.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize *text* for word-frequency analysis.

    Lower-cases the text, strips ASCII punctuation (``string.punctuation``),
    splits on whitespace, and discards English stop words as well as words
    shorter than three characters.

    Args:
        text: The string to clean.

    Returns:
        The kept words joined by single spaces.
    """
    tokens = text.lower().translate(_PUNCTUATION_TABLE).split()
    kept = [
        token
        for token in tokens
        if token not in _ENGLISH_STOP_WORDS and len(token) > 2
    ]
    return " ".join(kept)
# Clean every genre string (missing values are stringified by astype first),
# then keep only the rows whose original genre is present.
genre_as_text = df["Genre"].astype(str)
df["Cleaned_Genre"] = genre_as_text.apply(clean_text)
df = df.dropna(subset=["Genre"])
text_genre = " ".join(df["Cleaned_Genre"])

# Word-frequency analysis of the genres
all_words = text_genre.split()
genre_word_counts = Counter(all_words)
common_words = genre_word_counts.most_common(20)
df_common = pd.DataFrame(common_words, columns=["Word", "Frequency"])

# Fixed colors
# One fixed hex color per sentiment label.
sentiment_colors = dict(
    Negative="#1f77b4",  # blue
    Neutral="#ff7f0e",   # orange
    Positive="#2ca02c",  # green
)

# Sentiment analysis with TextBlob
def _textblob_polarity(text):
    """Return the TextBlob polarity score of *text*."""
    return TextBlob(text).sentiment.polarity


df["Polarity"] = df["Cleaned_Annotation"].apply(_textblob_polarity)
def classify_textblob(score, threshold=0.2):
    """Turn a polarity score into a sentiment label.

    Args:
        score: Numeric polarity value.
        threshold: Magnitude at which a score stops being neutral. The
            default of 0.2 is the cut-off this function previously hard-coded.

    Returns:
        ``"Positive"`` when ``score >= threshold``, ``"Negative"`` when
        ``score <= -threshold``, otherwise ``"Neutral"``. NaN fails both
        comparisons and is therefore labelled ``"Neutral"``.
    """
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"
# Label every TextBlob polarity score.
textblob_scores = df["Polarity"]
df["Sentiment_TextBlob"] = textblob_scores.apply(classify_textblob)

# Sentiment analysis with VADER
analyzer = SentimentIntensityAnalyzer()


def _vader_compound(text):
    """Return the "compound" entry of VADER's polarity scores for *text*."""
    return analyzer.polarity_scores(text)["compound"]


df["Sentiment_vader"] = df["Cleaned_Annotation"].apply(_vader_compound)
def classify_sentiment(score, threshold=0.05):
    """Turn a VADER compound score into a sentiment label.

    Args:
        score: Numeric compound score.
        threshold: Magnitude at which a score stops being neutral. The
            default of 0.05 is the cut-off this function previously hard-coded.

    Returns:
        ``"Positive"`` when ``score >= threshold``, ``"Negative"`` when
        ``score <= -threshold``, otherwise ``"Neutral"``. NaN fails both
        comparisons and is therefore labelled ``"Neutral"``.
    """
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"
# Label every VADER compound score.
vader_scores = df["Sentiment_vader"]
df["Sentiment_Label"] = vader_scores.apply(classify_sentiment)


# Find each book's rarest/unique genre
# Flatten the whitespace-separated genre tokens of every book into one list.
# A nested comprehension replaces ``.str.split().sum()``, which concatenated
# the per-row lists one by one (quadratic copying) and evaluated to the
# integer 0 - not a list - when there were no rows to sum.
genre_token_lists = df["Genre"].dropna().astype(str).str.split()
all_genres = [genre for tokens in genre_token_lists for genre in tokens]
# Tokens produced by split() carry no surrounding whitespace, so lower-casing
# is the only normalization needed before counting.
genre_counts = Counter(genre.lower() for genre in all_genres)
def rarest_genre(genre_str):
    """Return the token of *genre_str* with the lowest count in ``genre_counts``.

    The string is lower-cased and split on whitespace. Tokens absent from
    ``genre_counts`` count as 0, and ties go to the earliest token.

    Raises:
        ValueError: *genre_str* contains no tokens (``min`` of an empty list).
    """
    def corpus_frequency(token):
        return genre_counts.get(token, 0)

    tokens = genre_str.lower().split()
    return min(tokens, key=corpus_frequency)
# Pick the rarest genre token of every book that has a genre.
genre_text = df["Genre"].dropna().astype(str)
df["Main_Genre"] = genre_text.apply(rarest_genre)


# Group by genre and sentiment (VADER)
# Number of books per (main genre, VADER label) pair.
genre_sentiment = (
    df.groupby(["Main_Genre", "Sentiment_Label"])
    .size()
    .reset_index(name="Count")
)


def _join_unique_genres(genres):
    """Join the distinct values of *genres* with ", " in order of appearance."""
    return ', '.join(genres.unique())


# Group by year and sentiment
books_by_year_and_label = df.groupby(["Year", "Sentiment_Label"])
annot_df = books_by_year_and_label["Main_Genre"].agg(_join_unique_genres).reset_index()
trend_df = df.groupby(["Year", "Sentiment_Label"]).size().reset_index(name="Trend")


# Dashboard with book covers

import html

df_sorted = df.sort_values(by="Year")


def _render_book_item(row):
    """Return the HTML card (star rating, cover, title, year) for one book row.

    Title and cover URL are HTML-escaped, because both come straight from the
    dataset and characters such as ``&``, ``<`` or ``"`` would otherwise break
    the markup. A missing rating renders as empty stars and a missing year as
    an empty label instead of producing ``nan%`` or raising in ``int()``.
    """
    rating = row['goodreads_rating']
    # Convert the rating (divided by 5) into a percentage for the star overlay.
    percentage = 0.0 if pd.isna(rating) else (rating / 5) * 100
    year = "" if pd.isna(row['Year']) else int(row['Year'])
    cover_url = html.escape(str(row['Cover_URL']), quote=True)
    title = html.escape(str(row['Book_Title']), quote=False)

    # NOTE(review): the annotation was read here but never rendered, so that
    # unused local is gone; the stylesheet's .tooltiptext rule has no matching
    # element in this markup.
    return f"""
    <div class="book-item">
     <div class="book-cover-wrapper">
        <div class="star-rating">
            <div class="stars-outer">
                <div class="stars-inner" style="width: {percentage:.2f}%;"></div>
            </div>
        </div>
        <div class="tooltip">
            <div class="cover-container">
                <img src="{cover_url}" class="book-cover">
            </div>      
         </div> 
            <div class="book-title">{title}</div>
            <div class="book-year">{year}</div>
            </div>
        
    </div>
    """


# Build the HTML: one card per book, in ascending year order.
html_blocks = "".join(_render_book_item(row) for _, row in df_sorted.iterrows())

# Styles and container block.
# timeline_html is an f-string: literal CSS braces are doubled ({{ }}) and the
# only interpolated value is html_blocks, placed inside the horizontally
# scrollable .timeline-container element.
timeline_html = f"""
<style>
    .timeline-container {{
        display: flex;
        flex-wrap: nowrap;
        overflow-x: auto;
        gap: 20px;
        padding: 20px 0;
    }}
    .book-item {{
        display: flex;
        flex: 0 0 auto;
        flex-direction: column;
        text-align: center;
        align-items: center;
        height: 350px;
        max-width: 160px;
        position: relative;
        font-family: sans-serif;
    }}
    .book-cover {{
        width: 100%;
        border-radius: 8px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.3);
        transition: transform 0.3s ease;
    }}
    .cover-container {{
        height: 290px; /* –∏–ª–∏ —Å–∫–æ–ª—å–∫–æ —Ç–µ–±–µ –ø–æ–¥—Ö–æ–¥–∏—Ç */
        display: flex;
        align-items: flex-start; /* —á—Ç–æ–±—ã –æ–±–ª–æ–∂–∫–∏ –Ω–∞—á–∏–Ω–∞–ª–∏—Å—å —Å –æ–¥–Ω–æ–π –ª–∏–Ω–∏–∏ —Å–≤–µ—Ä—Ö—É */
        justify-content: center;
    }}
    .book-cover:hover {{
        transform: scale(1.2);
        z-index: 2;
    }}
    .tooltip {{
        position: relative;
        display: inline-block;
    }}
    .tooltiptext {{
        visibility: hidden;
        width: 180px;
        background-color: #222;
        color: #fff;
        text-align: left;
        border-radius: 6px;
        padding: 8px;
        position: absolute;
        z-index: 10;
        bottom: 110%;
        left: 50%;
        transform: translateX(-50%);
        opacity: 0;
        transition: opacity 0.3s;
        font-size: 12px;
    }}
    .tooltip:hover .tooltiptext {{
        visibility: visible;
        opacity: 1;
    }}
    .book-title {{
        font-weight: bold;
        margin-top: 5px;
    }}
    .book-year, .book-genre {{
        font-size: 12px;
        color: #666;
    }}

   .star-rating {{
        display: block;
        text-align: center;
        margin-bottom: 20px;
        font-size: 14px;
        position: relative;
        unicode-bidi: bidi-override;
}}

    .stars-outer {{
        color: #ccc;
        position: relative;
        display: inline-block;
        font-size: 18px;
}}

    .stars-outer::before {{
        content: "‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ";
}}

    .stars-inner {{
        color: #f39c12;
        position: absolute;
        top: 0;
        left: 0;
        white-space: nowrap;
        overflow: hidden;
        width: 0;
}}

    .stars-inner::before {{
        content: "‚òÖ‚òÖ‚òÖ‚òÖ‚òÖ";
}}
</style>

<div class="timeline-container">
    {html_blocks}
</div>
"""

# Show in the notebook cell
from IPython.display import display, HTML

# Wrap the markup so the notebook renders it as HTML rather than plain text.
timeline_view = HTML(timeline_html)
display(timeline_view)

‚ôªÔ∏è –£–¥–∞–ª—ë–Ω —Å—Ç–∞—Ä—ã–π ZIP-—Ñ–∞–π–ª.
‚ôªÔ∏è –£–¥–∞–ª–µ–Ω–∞ —Å—Ç–∞—Ä–∞—è —Ä–∞—Å–ø–∞–∫–æ–≤–∞–Ω–Ω–∞—è –ø–∞–ø–∫–∞.
‚¨áÔ∏è Downloading dataset from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/summerburd/stephen-king-books-and-adaptations
License(s): unknown
üì¶ Extracting dataset...
‚úÖ –î–∞—Ç–∞—Å–µ—Ç –∑–∞–≥—Ä—É–∂–µ–Ω –≤ DataFrame.
