*FINAL PIPELINE*

In [3]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack 
from textblob import TextBlob

In [4]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

if project_root not in sys.path:
    sys.path.append(project_root)

from src import config
PROCESSED_DATA_PATH = config.PROCESSED_DATA_PATH
MODEL_DIR = config.MODEL_PATH
VECTORIZER_PATH = config.VECTORIZER_PATH
SCALER_PATH = config.SCALER_PATH

In [5]:
try:
    df = pd.read_csv(PROCESSED_DATA_PATH)
except FileNotFoundError:
    print("File not found.")
    exit

In [6]:
df["description"] = df["description"].fillna("")
df["rating"] = df["rating"].fillna("")

In [7]:
df["combined_text"] = df["description"] + " " + df["tags"]

In [8]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000, ngram_range=(1, 2))

In [9]:
tfidf_matrix = vectorizer.fit_transform(df["combined_text"])
tfidf_matrix.shape

(70948, 10000)

In [10]:
target_genres = ["Romance", "Comedy", "Drama", "Fantasy", "Action", "School Life", "Seinen", "Shoujo", "Shounen", "Josei"]

In [11]:
numerical_features = ["sentiment_score", "age"] + [f"is_{g}" for g in target_genres]

In [12]:
scaler = MinMaxScaler()

In [13]:
num_matrix = scaler.fit_transform(df[numerical_features])
num_matrix.shape

(70948, 12)

In [14]:
final_matrix = hstack([tfidf_matrix, num_matrix])
final_matrix.shape

(70948, 10012)

In [15]:
joblib.dump(final_matrix, VECTORIZER_PATH)

['c:\\Users\\aleyna nur\\Desktop\\Manga_Manhwa_Manhua_Rec\\models\\vectorizer.pkl']

In [16]:
joblib.dump(vectorizer, VECTORIZER_PATH)
joblib.dump(scaler, SCALER_PATH)

['c:\\Users\\aleyna nur\\Desktop\\Manga_Manhwa_Manhua_Rec\\models\\scaler.pkl']