In [1]:
import subprocess
import sys
import warnings
import os
import zipfile
import shutil


def silent_install(package):
    """Install *package* with pip for the running interpreter, hiding pip's output.

    Args:
        package: Requirement string passed to ``pip install``.

    Returns:
        True when pip exited with status 0, False otherwise. pip's own
        output is discarded, so this flag is the only way for a caller to
        notice a failed installation.
    """
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", package],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=False,  # failure is reported through the return value
    )
    return result.returncode == 0

# Install the third-party packages this file relies on (pip output is
# suppressed by silent_install).
for pkg in ("wordcloud", "nltk", "vaderSentiment", "textblob", "kaggle"):
    silent_install(pkg)

# Hide FutureWarning messages from here on.
warnings.simplefilter("ignore", category=FutureWarning)
    
import nltk
nltk.download('stopwords', quiet=True)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import string
from collections import Counter
import streamlit as st


# Download (when needed), unpack, and load a Kaggle dataset into a DataFrame
def load_kaggle_dataset(dataset_slug: str, file_name: str, extract_to="data", force_download=False):
    """Fetch a Kaggle dataset archive, unpack it, and load one file from it.

    The archive is looked for in the current working directory under the
    name ``<last component of dataset_slug>.zip``. An existing archive and an
    existing extraction folder are reused unless ``force_download`` is set.

    Args:
        dataset_slug: Kaggle dataset identifier passed to
            ``kaggle datasets download -d``.
        file_name: Path of the ``.csv`` or ``.xlsx`` file to load, relative
            to ``extract_to``. The extension check ignores letter case.
        extract_to: Folder the archive is extracted into.
        force_download: When true, delete any existing archive and
            extraction folder first so everything is fetched again.

    Returns:
        The file's contents as a pandas DataFrame.

    Raises:
        ValueError: If ``file_name`` is neither ``.csv`` nor ``.xlsx``.
        RuntimeError: If the download fails and there is no previously
            extracted folder to fall back on.
    """
    # Pick the reader first so an unsupported file name fails before any
    # deletion or download happens.
    lowered_name = file_name.lower()
    if lowered_name.endswith(".xlsx"):
        reader = pd.read_excel
    elif lowered_name.endswith(".csv"):
        reader = pd.read_csv
    else:
        raise ValueError("Файл должен быть .csv или .xlsx")

    zip_path = f"{dataset_slug.split('/')[-1]}.zip"

    if force_download:
        if os.path.exists(zip_path):
            os.remove(zip_path)
            print("♻️ Удалён старый ZIP-файл.")
        if os.path.exists(extract_to):
            shutil.rmtree(extract_to)
            print("♻️ Удалена старая распакованная папка.")

    if not os.path.exists(zip_path):
        print("⬇️ Downloading dataset from Kaggle...")
        # An argument list keeps the slug away from any shell, and the exit
        # status is inspected instead of being ignored.
        try:
            exit_code = subprocess.run(
                ["kaggle", "datasets", "download", "-d", dataset_slug],
                check=False,
            ).returncode
        except OSError:
            # The kaggle executable is missing or could not be started.
            exit_code = None
        download_ok = exit_code == 0 and os.path.exists(zip_path)
        # A folder extracted earlier is still usable, so a failed download
        # is fatal only when the archive is needed for extraction below.
        if not download_ok and not os.path.exists(extract_to):
            raise RuntimeError(
                f"Kaggle download failed for dataset {dataset_slug!r} "
                f"(exit code: {exit_code}); expected archive {zip_path!r}"
            )
    else:
        print("✅ ZIP-файл уже существует, пропускаем скачивание.")

    if not os.path.exists(extract_to):
        print("📦 Extracting dataset...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_to)
    else:
        print("📁 Папка уже распакована, пропускаем.")

    # Load the Excel/CSV file
    full_path = os.path.join(extract_to, file_name)
    df = reader(full_path)

    print("✅ Датасет загружен в DataFrame.")
    return df

# Load the dataset into the notebook. force_download=True removes any
# existing archive and extracted folder before fetching again.
df = load_kaggle_dataset(
    "summerburd/stephen-king-books-and-adaptations",
    "primary.xlsx",
    force_download=True,
)


# Text normalisation used to prepare the annotations for the word cloud
_ENGLISH_STOP_WORDS = None  # filled on first use by clean_text


def clean_text(text, stop_words=None):
    """Lower-case *text*, strip ASCII punctuation, and drop stop words and short words.

    Args:
        text: String to normalise.
        stop_words: Optional collection of words to discard. When omitted,
            NLTK's English stop-word list is used; it is loaded once and
            reused on later calls.

    Returns:
        The remaining words (longer than two characters and not stop words)
        joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            # The function is applied row by row; reading the corpus on
            # every call would repeat the same work each time.
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    stripped = text.lower().translate(str.maketrans('', '', string.punctuation))
    return " ".join(w for w in stripped.split() if len(w) > 2 and w not in stop_words)
# Clean every annotation, then discard rows without one and merge the
# cleaned texts into a single corpus string. Missing annotations are
# stringified before cleaning, but those rows are removed right after.
df["Cleaned_Annotation"] = df["Annotation"].astype(str).map(clean_text)
df = df.dropna(subset=["Annotation"])
text_annotation = " ".join(df["Cleaned_Annotation"])

# Word-frequency analysis of the annotations: the 20 most common words.
# NOTE: common_words and df_common are assigned again later in this file by
# the genre frequency analysis.
text_annotation = " ".join(df["Cleaned_Annotation"].dropna().astype(str))
words = text_annotation.split()
filtered_words = list(filter(lambda token: len(token) > 2, words))
common_words = Counter(filtered_words).most_common(n=20)
df_common = pd.DataFrame(data=common_words, columns=["Word", "Frequency"])

# Text normalisation used to prepare the genres for the word cloud.
# clean_text is also defined earlier in this file; this later definition is
# the one in effect from here on.
_ENGLISH_STOP_WORDS = None  # filled on first use by clean_text


def clean_text(text, stop_words=None):
    """Lower-case *text*, strip ASCII punctuation, and drop stop words and short words.

    Args:
        text: String to normalise.
        stop_words: Optional collection of words to discard. When omitted,
            NLTK's English stop-word list is used; it is loaded once and
            reused on later calls.

    Returns:
        The remaining words (longer than two characters and not stop words)
        joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            # The function is applied row by row; reading the corpus on
            # every call would repeat the same work each time.
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    stripped = text.lower().translate(str.maketrans('', '', string.punctuation))
    return " ".join(w for w in stripped.split() if len(w) > 2 and w not in stop_words)
# Clean every genre string, then discard rows without a genre and merge the
# cleaned values into a single corpus string.
df["Cleaned_Genre"] = df["Genre"].astype(str).map(clean_text)
df = df.dropna(subset=["Genre"])
text_genre = " ".join(df["Cleaned_Genre"])

# Word-frequency analysis of the genres: the 20 most common words.
# NOTE: this rebinds common_words and df_common, which earlier in the file
# held the annotation-based results.
all_words = text_genre.split()
common_words = Counter(all_words).most_common(n=20)
df_common = pd.DataFrame(data=common_words, columns=["Word", "Frequency"])

# Fixed colour assigned to each sentiment label
sentiment_colors = dict(
    Negative="#1f77b4",  # blue
    Neutral="#ff7f0e",   # orange
    Positive="#2ca02c",  # green
)

# Sentiment analysis with TextBlob: polarity score of each cleaned annotation
def _textblob_polarity(text):
    """Return TextBlob's polarity score for *text*."""
    return TextBlob(text).sentiment.polarity


df["Polarity"] = df["Cleaned_Annotation"].apply(_textblob_polarity)
def classify_textblob(score, threshold=0.2):
    """Map a polarity score to a sentiment label.

    Args:
        score: Polarity value to classify.
        threshold: Cut-off, expected to be non-negative. Scores at or above
            it are positive; scores at or below its negation are negative.
            Defaults to 0.2.

    Returns:
        "Positive", "Negative" or "Neutral". A NaN score compares false
        against both bounds and is therefore labelled "Neutral".
    """
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"
# Label every row from its TextBlob polarity score
df["Sentiment_TextBlob"] = df["Polarity"].map(classify_textblob)

# Sentiment analysis with VADER: compound score of each cleaned annotation
analyzer = SentimentIntensityAnalyzer()


def _vader_compound(text):
    """Return the "compound" entry of VADER's polarity scores for *text*."""
    return analyzer.polarity_scores(text)["compound"]


df["Sentiment_vader"] = df["Cleaned_Annotation"].apply(_vader_compound)
def classify_sentiment(score, threshold=0.05):
    """Map a compound sentiment score to a sentiment label.

    Args:
        score: Compound score to classify.
        threshold: Cut-off, expected to be non-negative. Scores at or above
            it are positive; scores at or below its negation are negative.
            Defaults to 0.05.

    Returns:
        "Positive", "Negative" or "Neutral". A NaN score compares false
        against both bounds and is therefore labelled "Neutral".
    """
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"
# Label every row from its VADER compound score
df["Sentiment_Label"] = df["Sentiment_vader"].map(classify_sentiment)


# Finding each book's rare/unique genre: first count how often every genre
# word occurs across all books.
# The word lists are flattened with a comprehension instead of Series.sum():
# summing lists copies the growing list for every row, and on an empty
# column it yields 0 rather than a list.
all_genres = [
    genre
    for genre_list in df["Genre"].dropna().astype(str).str.split()
    for genre in genre_list
]
genre_counts = Counter(g.strip().lower() for g in all_genres)
def rarest_genre(genre_str, counts=None):
    """Return the least frequent genre word in *genre_str*.

    Args:
        genre_str: Whitespace-separated genre words.
        counts: Optional mapping from lower-cased genre word to its
            frequency. Defaults to the module-level ``genre_counts``.

    Returns:
        The lower-cased word with the smallest count; words missing from
        the mapping count as 0 and the first word wins a tie. Returns ""
        when *genre_str* contains no words.
    """
    if counts is None:
        counts = genre_counts
    genres = genre_str.strip().lower().split()
    # default="" keeps a blank genre string from raising ValueError in min().
    return min(genres, key=lambda g: counts.get(g, 0), default="")
# Main genre of each book: the rarest word of its genre string
df["Main_Genre"] = df["Genre"].dropna().astype(str).map(rarest_genre)


# Grouping by genre and tone (VADER): number of rows per
# (main genre, sentiment label) pair
genre_sentiment = (
    df.groupby(["Main_Genre", "Sentiment_Label"])
    .size()
    .reset_index(name="Count")
)


# Grouping by year and tone
def _join_unique(values):
    """Join the distinct values of a group with ', '."""
    return ', '.join(values.unique())


annot_df = (
    df.groupby(["Year", "Sentiment_Label"])["Main_Genre"]
    .agg(_join_unique)
    .reset_index()
)
trend_df = (
    df.groupby(["Year", "Sentiment_Label"])
    .size()
    .reset_index(name="Trend")
)


# Dashboard with book covers

from html import escape

df_sorted = df.sort_values(by="Year")


def _book_card_html(row):
    """Render one book row as an HTML card: star rating, cover, title, year.

    Reads the 'goodreads_rating', 'Cover_URL', 'Book_Title' and 'Year'
    fields of *row*. Title and cover URL come from the dataset, so they are
    HTML-escaped before being placed into the markup.
    """
    rating = row['goodreads_rating']
    if pd.isna(rating):
        # A missing rating would otherwise be rendered as "width: nan%".
        percentage = 0.0
    else:
        # Rating out of 5 converted to a percentage, kept within 0-100.
        percentage = min(max((rating / 5) * 100, 0.0), 100.0)
    # int() cannot convert NaN; show an empty year instead of failing.
    year = "" if pd.isna(row['Year']) else int(row['Year'])
    cover_url = escape(str(row['Cover_URL']), quote=True)
    title = escape(str(row['Book_Title']))

    return f"""
    <div class="book-item">
     <div class="book-cover-wrapper">
        <div class="star-rating">
            <div class="stars-outer">
                <div class="stars-inner" style="width: {percentage:.2f}%;"></div>
            </div>
        </div>
        <div class="tooltip">
            <div class="cover-container">
                <img src="{cover_url}" class="book-cover">
            </div>
         </div>
            <div class="book-title">{title}</div>
            <div class="book-year">{year}</div>
            </div>

    </div>
    """


# Assemble the HTML: one card per book, in order of year. Joining once
# avoids repeatedly re-copying a growing string.
html_blocks = "".join(_book_card_html(row) for _, row in df_sorted.iterrows())

# Styles and container: the CSS for the timeline and the wrapper <div> that
# receives the per-book cards held in html_blocks. This is an f-string, so
# literal CSS braces are doubled ({{ }}); the only interpolated value is
# {html_blocks}.
timeline_html = f"""
<style>
    .timeline-container {{
        display: flex;
        flex-wrap: nowrap;
        overflow-x: auto;
        gap: 20px;
        padding: 20px 0;
    }}
    .book-item {{
        display: flex;
        flex: 0 0 auto;
        flex-direction: column;
        text-align: center;
        align-items: center;
        height: 350px;
        max-width: 160px;
        position: relative;
        font-family: sans-serif;
    }}
    .book-cover {{
        width: 100%;
        border-radius: 8px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.3);
        transition: transform 0.3s ease;
    }}
    .cover-container {{
        height: 290px; /* или сколько тебе подходит */
        display: flex;
        align-items: flex-start; /* чтобы обложки начинались с одной линии сверху */
        justify-content: center;
    }}
    .book-cover:hover {{
        transform: scale(1.2);
        z-index: 2;
    }}
    .tooltip {{
        position: relative;
        display: inline-block;
    }}
    .tooltiptext {{
        visibility: hidden;
        width: 180px;
        background-color: #222;
        color: #fff;
        text-align: left;
        border-radius: 6px;
        padding: 8px;
        position: absolute;
        z-index: 10;
        bottom: 110%;
        left: 50%;
        transform: translateX(-50%);
        opacity: 0;
        transition: opacity 0.3s;
        font-size: 12px;
    }}
    .tooltip:hover .tooltiptext {{
        visibility: visible;
        opacity: 1;
    }}
    .book-title {{
        font-weight: bold;
        margin-top: 5px;
    }}
    .book-year, .book-genre {{
        font-size: 12px;
        color: #666;
    }}

   .star-rating {{
        display: block;
        text-align: center;
        margin-bottom: 20px;
        font-size: 14px;
        position: relative;
        unicode-bidi: bidi-override;
}}

    .stars-outer {{
        color: #ccc;
        position: relative;
        display: inline-block;
        font-size: 18px;
}}

    .stars-outer::before {{
        content: "★★★★★";
}}

    .stars-inner {{
        color: #f39c12;
        position: absolute;
        top: 0;
        left: 0;
        white-space: nowrap;
        overflow: hidden;
        width: 0;
}}

    .stars-inner::before {{
        content: "★★★★★";
}}
</style>

<div class="timeline-container">
    {html_blocks}
</div>
"""

# Show the timeline in the cell output
from IPython.display import HTML, display

display(HTML(data=timeline_html))

♻️ Удалён старый ZIP-файл.
♻️ Удалена старая распакованная папка.
⬇️ Downloading dataset from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/summerburd/stephen-king-books-and-adaptations
License(s): unknown
📦 Extracting dataset...
✅ Датасет загружен в DataFrame.
