In [1]:
import os
import sys
import sqlite3
import pandas as pd
import spacy
import joblib
import argparse
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from gensim.models import FastText


In [2]:
DB_PATH         = 'data/songs.db'
CSV_PATH        = 'data/genredata.csv'
VECTORIZER_FP   = 'models/tfidf_vectorizer.pkl'
FT_MODEL_FP     = 'models/fasttext.model'
SVM_MODEL_FP    = 'models/svm_genre.pkl'
RF_MODEL_FP     = 'models/rf_genre.pkl'

In [5]:
def init_db():
    os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.executescript("""
      CREATE TABLE IF NOT EXISTS songs (
        id           INTEGER PRIMARY KEY,
        title        TEXT    NOT NULL COLLATE NOCASE,
        artist       TEXT    NOT NULL COLLATE NOCASE,
        year         INTEGER,
        genre        TEXT    NOT NULL COLLATE NOCASE,
        lyrics_raw   TEXT    NOT NULL,
        lyrics_clean TEXT
      );
      CREATE TABLE IF NOT EXISTS features (
        song_id   INTEGER PRIMARY KEY REFERENCES songs(id) ON DELETE CASCADE,
        tfidf     BLOB    NOT NULL,
        embedding BLOB    NOT NULL
      );
      CREATE TABLE IF NOT EXISTS results (
        song_id         INTEGER REFERENCES songs(id) ON DELETE CASCADE,
        predicted_genre TEXT,
        probability     REAL,
        timestamp       TEXT    DEFAULT (datetime('now','localtime')),
        PRIMARY KEY(song_id, timestamp)
      );
    """)
    conn.commit()
    conn.close()
    print("✔ База даних готова:", DB_PATH)

In [6]:
def import_csv():
    df = pd.read_csv(CSV_PATH)
    conn = sqlite3.connect(DB_PATH)
    df[['id','title','artist','year','genre','lyrics_raw']].to_sql(
        'songs', conn, if_exists='append', index=False
    )
    conn.close()
    print(f"✔ Імпортували {len(df)} рядків у songs")

In [7]:
# ——— Препроцесінг тексту —————————————————————————————
class Preprocessor:
    def __init__(self):
        self.nlp = spacy.load('uk_core_news_lg', disable=['parser','ner','textcat'])
    def clean(self, text):
        return text.replace('\n',' ').strip()
    def lemmatize(self, text):
        doc = self.nlp(text)
        return ' '.join(tok.lemma_ for tok in doc if not tok.is_punct and not tok.is_space)
    def run_all(self):
        conn = sqlite3.connect(DB_PATH)
        cur = conn.cursor()
        cur.execute("SELECT id, lyrics_raw FROM songs WHERE lyrics_clean IS NULL;")
        rows = cur.fetchall()
        for sid, raw in rows:
            lem = self.lemmatize(self.clean(raw))
            cur.execute("UPDATE songs SET lyrics_clean = ? WHERE id = ?;", (lem, sid))
        conn.commit()
        conn.close()
        print(f"✔ Препроцесінг застосовано до {len(rows)} пісень")

# ——— Екстракція ознак ———————————————————————————————
class FeatureBuilder:
    def __init__(self):
        self.tfidf = None
        self.fasttext = None

    def fit(self):
        conn = sqlite3.connect(DB_PATH)
        df = pd.read_sql("SELECT id, lyrics_clean FROM songs;", conn)
        conn.close()
        corpus = df['lyrics_clean'].tolist()
        # TF-IDF
        self.tfidf = TfidfVectorizer(min_df=2, max_df=0.8, ngram_range=(1,2))
        tfidf_matrix = self.tfidf.fit_transform(corpus)
        # FastText
        tokenized = [txt.split() for txt in corpus]
        self.fasttext = FastText(vector_size=100, window=5, min_count=2)
        self.fasttext.build_vocab(tokenized)
        self.fasttext.train(tokenized, total_examples=len(tokenized), epochs=5)
        # Збереження
        os.makedirs(os.path.dirname(VECTORIZER_FP), exist_ok=True)
        joblib.dump(self.tfidf, VECTORIZER_FP)
        self.fasttext.save(FT_MODEL_FP)
        # Запис у БД
        conn = sqlite3.connect(DB_PATH)
        cur = conn.cursor()
        for i, sid in enumerate(df['id']):
            tfidf_vec = tfidf_matrix[i].toarray()[0]
            ft_vec = np.mean([self.fasttext.wv[w] for w in tokenized[i]], axis=0)
            cur.execute("INSERT OR REPLACE INTO features VALUES (?, ?, ?);",
                        (sid,
                         sqlite3.Binary(joblib.dumps(tfidf_vec)),
                         sqlite3.Binary(joblib.dumps(ft_vec))))
        conn.commit()
        conn.close()
        print("✔ Ознаки збережено в features")

In [8]:
class GenreClassifier:
    def __init__(self):
        self.tfidf = joblib.load(VECTORIZER_FP)
        self.fasttext = FastText.load(FT_MODEL_FP)
        self.svm = None
        self.rf = None

    def load_features(self):
        conn = sqlite3.connect(DB_PATH)
        df_f = pd.read_sql("SELECT song_id, tfidf, embedding FROM features;", conn)
        df_s = pd.read_sql("SELECT id, genre FROM songs;", conn)
        conn.close()
        X_tfidf = np.vstack(df_f['tfidf'].apply(joblib.loads).values)
        X_ft = np.vstack(df_f['embedding'].apply(joblib.loads).values)
        X = np.hstack([X_tfidf, X_ft])
        y = df_s.set_index('id').loc[df_f['song_id'], 'genre'].values
        return train_test_split(X, y, test_size=0.2, random_state=42)

    def train_and_compare(self):
        X_train, X_test, y_train, y_test = self.load_features()
        # SVM
        self.svm = LinearSVC(C=1.0)
        self.svm.fit(X_train, y_train)
        y_pred_svm = self.svm.predict(X_test)
        print("=== SVM Results ===")
        print(classification_report(y_test, y_pred_svm))
        # RandomForest
        self.rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
        self.rf.fit(X_train, y_train)
        y_pred_rf = self.rf.predict(X_test)
        print("=== RandomForest Results ===")
        print(classification_report(y_test, y_pred_rf))
        # Збереження моделей
        joblib.dump(self.svm, SVM_MODEL_FP)
        joblib.dump(self.rf, RF_MODEL_FP)
        print("✔ Моделі збережено:", SVM_MODEL_FP, RF_MODEL_FP)

    def predict(self, text, model='svm'):
        lem = Preprocessor().lemmatize(Preprocessor().clean(text))
        v1 = self.tfidf.transform([lem]).toarray()[0]
        v2 = np.mean([self.fasttext.wv[w] for w in lem.split()], axis=0)
        vec = np.hstack([v1, v2])
        if model == 'svm':
            clf = joblib.load(SVM_MODEL_FP)
            prob = max(clf.decision_function([vec])[0])
        else:
            clf = joblib.load(RF_MODEL_FP)
            proba = clf.predict_proba([vec])[0]
            prob = float(max(proba))
        genre = clf.predict([vec])[0]
        return genre, prob


In [9]:
def main():
    p = argparse.ArgumentParser(description="Genre Classifier Pipeline")
    p.add_argument('cmd', choices=['init','import','preproc','features','train','predict'])
    p.add_argument('--text', help='Текст для predict')
    p.add_argument('--model', choices=['svm','rf'], default='svm',
                   help='Модель для прогнозу: svm або rf (за замовчуванням svm)')
    args = p.parse_args()

    if args.cmd == 'init':
        init_db()
    elif args.cmd == 'import':
        import_csv()
    elif args.cmd == 'preproc':
        Preprocessor().run_all()
    elif args.cmd == 'features':
        FeatureBuilder().fit()
    elif args.cmd == 'train':
        GenreClassifier().train_and_compare()
    elif args.cmd == 'predict':
        if not args.text:
            print("Помилка: вкажіть --text 'ваш текст'")
            sys.exit(1)
        genre, prob = GenreClassifier().predict(args.text, model=args.model)
        print(f"🎯 Модель={args.model} → Жанр: {genre}, ймовірність: {prob:.2f}")
    else:
        p.print_help()

if __name__ == '__main__':
    main()

usage: ipykernel_launcher.py [-h] [--text TEXT] [--model {svm,rf}]
                             {init,import,preproc,features,train,predict}
ipykernel_launcher.py: error: the following arguments are required: cmd


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
