In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Apply seaborn's white-grid theme to every figure drawn in this notebook.
sns.set_theme(style="whitegrid")

# Load the dataset from the working directory and preview its first rows.
df = pd.read_csv("dataset.csv")
df.head()


In [None]:
# Print the column dtypes and non-null counts.
df.info()

# Number of missing values per column (displayed as the cell output).
df.isna().sum()

In [None]:
# Distribution of the target column: 50-bin histogram with a KDE curve on top.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x="popularity", bins=50, kde=True, color="skyblue", ax=ax)
ax.set_title("Распределение популярности треков")
ax.set_xlabel("Популярность (popularity)")
ax.set_ylabel("Количество треков")
plt.show()


In [None]:
# Pearson correlation of every int64/float64/bool column with the target.
num_cols = df.select_dtypes(include=["int64", "float64", "bool"]).columns
correlations = (
    df[num_cols]
    .corr()["popularity"]
    # The target's correlation with itself is always 1.0; keeping it would add an
    # uninformative bar that stretches the x-axis and flattens the real features.
    .drop("popularity")
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 10))
sns.barplot(
    x=correlations.values,
    y=correlations.index,
    hue=correlations.index,  # one colour per bar; the legend would only repeat the y labels
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Корреляция признаков с popularity")
ax.set_xlabel("Коэффициент корреляции")
ax.set_ylabel("Признаки")
plt.show()



In [None]:
# Engineered features built from existing columns.
df["duration_min"] = df["duration_ms"].div(60000)  # milliseconds -> minutes
df["energy_dance"] = df["energy"].mul(df["danceability"])
df["speech_instrument"] = df["speechiness"].mul(df["instrumentalness"])

# Check how the new features correlate with the target.
engineered_cols = ["duration_min", "energy_dance", "speech_instrument"]
df[engineered_cols + ["popularity"]].corr()["popularity"]


In [None]:
# Candidate features: every int64/float64/bool column except the target.
# NOTE(review): if the CSV carries a saved index column (e.g. "Unnamed: 0") it is
# selected here as a feature too — confirm with df.columns and drop it if present.
features = df.select_dtypes(include=["int64", "float64", "bool"]).drop(columns=["popularity"]).columns
X = df[features]
y = df["popularity"]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# n_jobs=-1 fits the trees in parallel on all available cores.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Score the forest on the held-out rows so the split actually measures
# generalisation instead of only limiting the training data.
rf_test_r2 = r2_score(y_test, rf_model.predict(X_test))
print(f"Random Forest: R² (test) = {rf_test_r2:.4f}")

importances = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)

# barh draws the first row at the bottom, so the top 10 are re-sorted ascending
# to put the most important feature at the top of the chart.
ax = importances.head(10).sort_values().plot(
    kind="barh", figsize=(10, 6), title="Топ-10 признаков по важности"
)
ax.set_xlabel("Важность признака")
plt.show()


In [None]:
# Models to compare with 5-fold cross-validation on the full X, y.
# The MLP is wrapped in a pipeline with StandardScaler: gradient-trained networks
# fit poorly when features are on very different scales (e.g. duration_ms is in
# milliseconds). Inside a pipeline the scaler is refit on each training fold, so
# no statistics leak from the validation fold.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Neural Network": make_pipeline(
        StandardScaler(),
        MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42),
    ),
}

# An integer cv on a regressor means KFold without shuffling, so folds would
# follow the row order of the file. Shuffle with a fixed seed so every fold is a
# random, reproducible sample and all models are scored on the same splits.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=cv, scoring="r2")
    print(f"{name}: R² (mean) = {scores.mean():.4f} | std = {scores.std():.4f}")
