# Subiectul 1 – Scraping Metacritic
Script Python care extrage, din 10 pagini Metacritic, următoarele câmpuri pentru fiecare film:
- regizorul
- scorul criticilor (**Metascore**)
- scorul utilizatorilor (**User Score**)
- durata filmului
- titlul și URL‑ul paginii

Rulează celula de mai jos după ce completezi lista `URLS` cu 10 link‑uri valide de film.

In [None]:

import time, requests, pandas as pd
from bs4 import BeautifulSoup

# Request headers carrying a desktop-browser style User-Agent string.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (X11; Linux x86_64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/123.0 Safari/537.36",
        )
    ),
}

# Replace with 10 Metacritic movie URLs (two samples are given, add 8 more).
URLS = [
    "https://www.metacritic.com/movie/inside-out-2",
    "https://www.metacritic.com/movie/dune-part-two",
]

def parse_movie(url: str) -> dict:
    """Download one movie page and extract its metadata.

    Args:
        url: Address of the movie page to fetch.

    Returns:
        A dict with the keys ``title``, ``director``, ``critic_score``,
        ``user_score``, ``runtime`` and ``url``. A field is ``None`` when its
        tag is not found on the page or, for the two scores, when the tag
        text is not a number.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            response carries an HTTP error status.
    """
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    title_tag = soup.find("h1")
    if title_tag:
        title = title_tag.get_text(strip=True)
    else:
        # Fall back to the last URL segment; drop a trailing slash first so
        # the fallback is not an empty string.
        title = url.rstrip("/").split("/")[-1]

    m_tag = (
        soup.select_one("span.metascore_w.larger.movie")
        or soup.select_one("span.metascore_w.xlarge.movie")
    )
    # Guard the conversion the same way as the user score below: a
    # non-numeric score text must not abort parsing of the whole page.
    try:
        critic = int(m_tag.text.strip()) if m_tag else None
    except ValueError:
        critic = None

    u_tag = soup.select_one("div.metascore_w.user")
    try:
        user = float(u_tag.text.strip()) if u_tag else None
    except ValueError:
        user = None

    run_tag = soup.find("li", class_="runtime")
    runtime = run_tag.get_text(strip=True) if run_tag else None

    dir_tag = soup.find("li", class_="director")
    if dir_tag:
        director = " ".join(dir_tag.stripped_strings).replace("Director:", "").strip()
    else:
        # Alternative layout: a <span> labelled "Director" followed by a
        # <span> holding the name. Either of the two may be absent.
        lbl = soup.find("span", string=lambda t: t and "Director" in t)
        name_tag = lbl.find_next("span") if lbl else None
        director = name_tag.get_text(strip=True) if name_tag else None

    return {
        "title": title,
        "director": director,
        "critic_score": critic,
        "user_score": user,
        "runtime": runtime,
        "url": url,
    }

# Scrape every URL in turn; a page that fails is reported and skipped so a
# single bad link does not stop the run.
rows = []
for link in URLS:
    try:
        record = parse_movie(link)
    except Exception as exc:
        print("✗", link, exc)
    else:
        rows.append(record)
        print("✓", link)
    time.sleep(2)  # pause between requests, whatever the outcome

# Gather the results in a table, write it to CSV and preview the first rows.
df = pd.DataFrame(rows)
df.to_csv("metacritic_films.csv", index=False)
df.head()


# Subiectul 2 – Arbore de decizie (Congressional Voting Records)
Se folosește setul de date **Congressional Voting Records**. După eliminarea rândurilor cu valori lipsă (`?`), ultimele 10 % din rândurile rămase devin set de test, iar restul – set de antrenare.
Celula de mai jos descarcă datele (dacă nu există local), antrenează un arbore de decizie, afișează acuratețea și salvează o imagine cu arborele.

In [None]:

import pandas as pd, matplotlib.pyplot as plt, urllib.request
from pathlib import Path
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report

DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"
LOCAL = Path("house-votes-84.data")

# Download the raw file only when no local copy exists yet.
if not LOCAL.exists():
    urllib.request.urlretrieve(DATA_URL, LOCAL)

COLS = ["class"] + [f"vote_{i}" for i in range(16)]
VOTE_COLS = COLS[1:]

# "?" marks a missing vote: read it as NaN and drop incomplete rows.
df = pd.read_csv(LOCAL, header=None, names=COLS, na_values="?").dropna()
# Encode y/n as 1/0 in the vote columns only (the label column is left
# untouched) and cast to int so the features are numeric rather than object.
# Any unexpected value maps to NaN and makes the cast fail loudly.
df = df.assign(
    **{col: df[col].map({"y": 1, "n": 0}).astype(int) for col in VOTE_COLS}
)
print(f"Rânduri după curățare: {len(df)}")

# Positional split: the last 10 % of the cleaned rows form the test set.
split = int(len(df) * 0.9)
train, test = df.iloc[:split], df.iloc[split:]

X_train, y_train = train.drop("class", axis=1), train["class"]
X_test , y_test  = test.drop("class", axis=1),  test["class"]

tree = DecisionTreeClassifier(criterion="entropy", random_state=0)
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)
print("Acuratețe:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

# Feature importances, largest first.
imp = pd.Series(tree.feature_importances_, index=X_train.columns).sort_values(ascending=False)
display(imp.to_frame("Feature importance").head(10))

# Draw the tree. Names are passed as plain lists because some scikit-learn
# releases reject a pandas Index / NumPy array for these parameters.
fig = plt.figure(figsize=(20, 10))
plot_tree(
    tree,
    feature_names=list(X_train.columns),
    class_names=tree.classes_.tolist(),
    filled=True,
)
plt.title("Decision Tree – Congressional Voting")

# Save BEFORE plt.show(): once the figure has been shown it is released, and
# a later plt.savefig() would write an empty canvas.
fig.savefig("voting_tree.png", dpi=150, bbox_inches="tight")
plt.show()
print("Grafic salvat -> voting_tree.png")


# Subiectul 3 – Clustering pe setul **Iris**
Aplicăm `KMeans` cu *k = 3*, apoi evaluăm cât de bine corespund clusterele speciilor reale folosind **Adjusted Rand Index** și cât de compacte și bine separate sunt clusterele folosind **Silhouette Score** (care nu folosește etichetele reale).

In [None]:

from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, silhouette_score
import pandas as pd

# Load Iris as pandas objects: the feature table and the species labels.
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target
species_names = iris.target_names

# Standardize the features before clustering.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Cluster into k = 3 groups; the fixed seed makes the run repeatable.
kmeans = KMeans(n_clusters=3, n_init="auto", random_state=0)
labels = kmeans.fit_predict(X_std)

# ARI compares the clusters with the true species; the silhouette score is
# computed from the standardized features and the cluster labels only.
ari, sil = adjusted_rand_score(y, labels), silhouette_score(X_std, labels)

print(
    f"Adjusted Rand Index: {ari:.3f}",
    f"Silhouette Score   : {sil:.3f}\n",
    sep="\n",
)

# Contingency table: true species (rows) against assigned cluster (columns).
cross = pd.crosstab(y, labels, rownames=["Specie reală"], colnames=["Cluster"])
display(cross)
