<a href="https://colab.research.google.com/github/rafsanalhad/machine_learning_jobsheet/blob/main/Skripsi_No_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ========================
# 0. Install & import
# ========================
!pip install kagglehub nltk scikit-learn --quiet

import kagglehub
import pandas as pd
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
# Fetch the NLTK stop-word corpus; must run before the corpus is read below.
# (When the package is already present NLTK just reports it is up to date.)
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words held in a set; preprocess() drops any token found in it.
STOPWORDS = set(stopwords.words('english'))

# ========================
# 1. Download the Steam dataset
# ========================
# dataset_download returns a local path; the CSV is looked up inside it.
path = kagglehub.dataset_download("trolukovich/steam-games-complete-dataset")
csv_file = "steam_games.csv"
csv_path = os.path.join(path, csv_file)
df = pd.read_csv(csv_path)
# Rows lacking either a description snippet or a genre are discarded.
df = df.dropna(subset=['desc_snippet','genre'])
# Keep only the columns used downstream, capped at the first 40,000 rows.
# The explicit .copy() makes `games` an independent frame, so the later
# column additions do not write into a slice of `df` (the pattern that can
# raise pandas' SettingWithCopyWarning).
games = df[['name','genre','desc_snippet']].head(40000).copy()
# Non-inplace rename: returns a new frame instead of mutating a possible view.
games = games.rename(columns={'desc_snippet':'about_the_game'})

# ========================
# 2. Text preprocessing
# ========================
def preprocess(text):
    """Normalise a piece of text for TF-IDF vectorisation.

    The input is coerced to ``str`` and lower-cased, every character other
    than ``a-z``, ``0-9`` or whitespace is replaced by a space, and tokens
    found in ``STOPWORDS`` are removed.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = str(text).lower()
    # Punctuation and non-ASCII characters become spaces so they act as
    # token separators rather than gluing neighbouring words together.
    stripped = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    kept = []
    for word in stripped.split():
        if word in STOPWORDS:
            continue
        kept.append(word)
    return ' '.join(kept)

# Clean every description once, in row order, and store it alongside the raw text.
games['clean_text'] = [preprocess(description) for description in games['about_the_game']]

# ========================
# 3. TF-IDF + cosine similarity for recommendations
# ========================
# Unigrams and bigrams, with the vocabulary capped at 10,000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
# One TF-IDF row per game; recommend() compares queries against this matrix.
tfidf_matrix = vectorizer.fit_transform(games['clean_text'])

def recommend(query, topk=5):
    """Return the games whose descriptions best match a free-text query.

    The query is cleaned with ``preprocess``, transformed with the fitted
    ``vectorizer`` and compared against every row of ``tfidf_matrix`` by
    cosine similarity.

    Args:
        query: Free-text search string.
        topk: Maximum number of results to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of ``(name, score)`` tuples ordered by descending
        similarity. Games with zero similarity are left out, so the list
        may hold fewer than ``topk`` entries (or none at all).
    """
    # A negative topk would otherwise slice as [:-k] and return almost the
    # whole catalogue; treat every non-positive value as "no results".
    if topk <= 0:
        return []

    query_clean = preprocess(query)
    q_vec = vectorizer.transform([query_clean])
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]
    top_idx = sims.argsort()[::-1][:topk]

    # Look the column up once instead of materialising a full row per hit.
    names = games['name']
    # A zero score means the query shares no vocabulary term with the game,
    # so such rows are arbitrary filler rather than recommendations.
    return [(names.iloc[i], sims[i]) for i in top_idx if sims[i] > 0]

# ========================
# 4. Example usage
# ========================
# Sample free-text query passed to the recommender.
query = "genshin impact"
print("\n=== Rekomendasi Game Mirip Query ===")
# recommend() is called with its default topk; each result is a (name, score) pair.
for name, score in recommend(query):
    # Similarity score is printed with four decimal places.
    print(f"{name} (score: {score:.4f})")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using Colab cache for faster access to the 'steam-games-complete-dataset' dataset.

=== Rekomendasi Game Mirip Query ===
Stellar Impact Bundle (score: 0.5441)
Stellar Impact (score: 0.3596)
The Fruitless Flower 雾雨中的徒花 (score: 0.3504)
Iron Impact (score: 0.3477)
Dodgeball Simulator VR (score: 0.3352)
