In [None]:
import pandas as pd

# 1. CARGAR EL DATASET

file_path = 'mal_anime_2025.csv'

try:
    df = pd.read_csv(file_path)
    print("¬°Archivo cargado con √©xito!")
except FileNotFoundError:
    print(f"Error: No encontr√© el archivo '{file_path}'. Aseg√∫rate de que est√© en la misma carpeta.")
    exit()

# 2. INSPECCI√ìN B√ÅSICA
print("\n--- Informaci√≥n General ---")
print(f"Total de Animes: {df.shape[0]}")
print(f"Columnas disponibles: {list(df.columns)}")

# 3. LIMPIEZA R√ÅPIDA
df = df.drop_duplicates(subset=['title'])
df = df.dropna(subset=['title'])

# 4. TOP 10 ANIMES POR PUNTUACI√ìN (Score)
if 'Score' in df.columns:
    top_10 = df.sort_values(by='Score', ascending=False).head(10)
    print("\n--- Top 10 Animes Mejor Calificados ---")
    print(top_10[['title', 'Score']])
else:
    print("\nLa columna 'score' no existe. Aqu√≠ est√°n las primeras 10 filas:")
    print(df[['title']].head(10))

ModuleNotFoundError: No module named 'pandas'

In [None]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

#CLEANING FUNCTION
def clean_description(text):
    if pd.isna(text) or text == "":
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    cleaned_words = [w for w in words if w not in ENGLISH_STOP_WORDS]
    return " ".join(cleaned_words)

# 2. CLEANING
df['clean_desc'] = df['description'].apply(clean_description)

# 3. PREVIEW THE DIFFERENCE
print("\n--- BEFORE ---")
print(df['description'].iloc[0][:1500])
print("\n--- AFTER ---")
print(df['clean_desc'].iloc[0][:1500])

print("\nText is now 'Machine Readable'!")

üßπ Cleaning descriptions... this takes about 10-20 seconds.

--- BEFORE ---
Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Eart

--- AFTER ---
crime timeless year humanity expanded galaxy filling surface planets settlements like earth new societies plagued murder drug use theft intergalactic 

‚úÖ Text is now 'Machine Readable'!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. INITIALIZE THE VECTORIZER
# max_features=5000: We only care about the top 5000 most important words
# ngram_range=(1,2): This catches single words ("ninja") AND pairs ("high school")
tfidf = TfidfVectorizer(max_features=8000, ngram_range=(1, 3))

# 2. TRANSFORM THE TEXT INTO MATH
print("Transforming text into a mathematical matrix...")
# We use the 'clean_desc' column we created in the previous cell
tfidf_matrix = tfidf.fit_transform(df['clean_desc'])

# 3. CHECK THE SHAPE
print(f"Matrix Shape: {tfidf_matrix.shape}")
print("Every anime is now represented by a vector of 8,000 numbers!")

üî¢ Transforming text into a mathematical matrix...
Matrix Shape: (19930, 8000)
‚úÖ Every anime is now represented by a vector of 5,000 numbers!


In [7]:
# Get the feature names (the 8000 words/phrases)
feature_names = tfidf.get_feature_names_out()

# Print a small sample of the words it found
print("--- Sample of the 8,000 Features ---")
print(feature_names[1000:1020]) # Look at a slice in the middle

--- Sample of the 8,000 Features ---
['change fate' 'change life' 'changed' 'changes' 'changesource' 'changing'
 'channel' 'chao' 'chaos' 'chaotic' 'chapter' 'chapters' 'char'
 'character' 'character designs' 'characters' 'characters posted' 'charge'
 'charged' 'charismatic']


In [8]:
# Look at the first anime in your dataframe
anime_title = df['title'].iloc[0]
first_vector = tfidf_matrix[0]

# Convert the sparse row to a dense format and sort by score
df_tfidf = pd.DataFrame(first_vector.T.todense(), index=feature_names, columns=["tfidf_score"])
top_keywords = df_tfidf.sort_values(by="tfidf_score", ascending=False).head(10)

print(f"\n--- Top Keywords for: {anime_title} ---")
print(top_keywords)


--- Top Keywords for: Cowboy Bebop ---
               tfidf_score
jet               0.276636
criminals         0.235346
past              0.161415
expanded          0.147836
edward            0.146733
iv                0.146733
theft             0.143777
disrupted         0.142889
taking care       0.140460
intergalactic     0.138318


In [13]:
print(f"\n--- Top Keywords for: {anime_title} ---")
print(top_keywords)


--- Top Keywords for: Cowboy Bebop ---
               tfidf_score
jet               0.276636
criminals         0.235346
past              0.161415
expanded          0.147836
edward            0.146733
iv                0.146733
theft             0.143777
disrupted         0.142889
taking care       0.140460
intergalactic     0.138318


In [None]:
import re

def purge_mal_metadata(text):
    # 1. Handle empty data
    if pd.isna(text) or text == "":
        return ""
    
    # 2. Cut off the top "website junk"
    # Everything before the word 'Synopsis' is usually MAL UI text
    if "Synopsis" in text:
        text = text.split("Synopsis")[-1]
    
    # 3. Remove the voting buttons/scale
    # These words appear in the text because of how the data was scraped
    bad_words = ["Masterpiece", "Great", "Very Good", "Good", "Fine", 
                 "Average", "Bad", "Very Bad", "Horrible", "Appalling"]
    for word in bad_words:
        text = text.replace(word, "")
        
    # 4. Remove technical UI strings
    # Removes patterns like 'Episodes: /26' or 'PV 1 English dub'
    text = re.sub(r'Episodes:\s?/\d+', '', text)
    text = re.sub(r'PV \d+ English dub version', '', text)
    text = text.replace("Add to My ListSelect", "")
    text = text.replace("playMore videosEdit", "")
    
    return text.strip()

# Apply the function to the 'description' column
print("Purging website metadata from descriptions...")
df['description'] = df['description'].apply(purge_mal_metadata)

# Verify the result for the first anime
print("\n--- Cleaned Description Preview ---")
print(df['description'].iloc[0][:2000])

üßπ Purging website metadata from descriptions...

--- Cleaned Description Preview ---
Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.Spike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, odd child Edward Wong Hau Pepelu Tivrusky IV, and Ein, a bioengineered Welsh corgi.While developing bonds and working to catch a colorful cast of criminals, the Bebop crew's lives are disrupted by a menace from Spike's past. As a rival's maniacal plot continues to unravel

In [None]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def final_text_cleaning(text):
    if pd.isna(text) or text == "":
        return ""
    # 1. Lowercase everything
    text = text.lower()
    # 2. Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # 3. Split into words and remove Stop Words (the, is, at, etc.)
    words = text.split()
    cleaned_words = [w for w in words if w not in ENGLISH_STOP_WORDS]
    # 4. Join back into a string
    return " ".join(cleaned_words)

print("Updating keywords with the new clean descriptions...")
df['clean_desc'] = df['description'].apply(final_text_cleaning)

# Preview to ensure the metadata junk is gone from the keywords too
print("\n--- Final Keywords for Cowboy Bebop ---")
print(df['clean_desc'].iloc[0][:2000])

üîÑ Updating keywords with the new clean descriptions...

--- Final Keywords for Cowboy Bebop ---
crime timeless year humanity expanded galaxy filling surface planets settlements like earth new societies plagued murder drug use theft intergalactic outlaws hunted growing number tough bounty huntersspike spiegel jet black pursue criminals space make humble living beneath goofy aloof demeanor spike haunted weight violent past jet manages troubled memories taking care spike bebop ship duo joined beautiful artist faye valentine odd child edward wong hau pepelu tivrusky iv ein bioengineered welsh corgiwhile developing bonds working catch colorful cast criminals bebop crews lives disrupted menace spikes past rivals maniacal plot continues unravel spike choose life newfound family revenge old woundswritten mal rewrite


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import umap

# 1. RE-VECTORIZE (Turning keywords into math)
print("üî¢ Step 1: Transforming 8,000 keywords into math matrix...")
tfidf = TfidfVectorizer(max_features=8000, ngram_range=(1, 3))
tfidf_matrix = tfidf.fit_transform(df['clean_desc'])

# 2. RE-RUN UMAP (Squashing 8,000 dimensions into 2)
print("üöÄ Step 2: Generating new Map Coordinates... (Wait 1-3 mins)")
reducer = umap.UMAP(
    n_neighbors=15, 
    min_dist=0.1, 
    metric='cosine', 
    random_state=42
)
embedding = reducer.fit_transform(tfidf_matrix)

# 3. SAVE TO DATAFRAME
df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

print("\n‚úÖ Success! The Anime Universe has been mapped.")
print(df[['title', 'x', 'y']].head())

  from .autonotebook import tqdm as notebook_tqdm


üî¢ Step 1: Transforming 8,000 keywords into math matrix...
üöÄ Step 2: Generating new Map Coordinates... (Wait 1-3 mins)


  warn(



‚úÖ Success! The Anime Universe has been mapped.
                             title         x         y
0                     Cowboy Bebop  7.123980  2.583071
1  Cowboy Bebop: Tengoku no Tobira  9.437322  0.294572
2                           Trigun  7.163458  2.999876
3               Witch Hunter Robin  8.598219  1.820902
4                   Bouken Ou Beet  9.515733  2.111956


In [15]:
# 1. Ensure the dataframe is clean and indexed properly
df = df.drop_duplicates(subset=['title']).reset_index(drop=True)

# 2. Re-run the Vectorizer on the CURRENT clean_desc
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=8000, ngram_range=(1, 3))
tfidf_matrix = tfidf.fit_transform(df['clean_desc'])
feature_names = tfidf.get_feature_names_out()

# 3. Verification Test for Row 0
anime_title = df['title'].iloc[0]
first_vector = tfidf_matrix[0]

# Convert and show
df_tfidf = pd.DataFrame(first_vector.T.todense(), index=feature_names, columns=["tfidf_score"])
top_keywords = df_tfidf.sort_values(by="tfidf_score", ascending=False).head(10)

print(f"Index 0 Title: {anime_title}")
print("\n--- Verified Top Keywords ---")
print(top_keywords)

Index 0 Title: Cowboy Bebop

--- Verified Top Keywords ---
               tfidf_score
jet               0.276633
criminals         0.235344
past              0.161414
expanded          0.147834
iv                0.146731
edward            0.146731
theft             0.143775
disrupted         0.142888
taking care       0.140459
intergalactic     0.138317
