# Sistem Rekomendasi Sepatu Dengan Content-Based Filtering Menggunakan TF-IDF dan Cosine SImilarity

In [212]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import random
import nltk

# Membaca file csv dataset yang akan digunakan
df_sepatu = pd.read_csv("E:/ML/project3.csv")

# Menampilkan isi dataset project3.csv
df_sepatu.head() 

Unnamed: 0.1,Unnamed: 0,Brands,Styles,Prices,Reduced_Prices,Available_in_Multiple_Colors
0,0,adidas,Nemeziz 19.3 TF,399,399 SAR,0
1,1,adidas,Pro Next 2019,375,375 SAR,0
2,2,Fila,Drifter Haze Slides,155,155 SAR,0
3,3,Robert Wood,Leather Cross Strap Sandals,159,159 SAR,0
4,4,Nike,Air Versitile IV,399,399 SAR,0


In [213]:
# Menampilkan informasi dari dataframe misalkan ada berapa data dan apa saja kolom yang ada
df_sepatu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1973 entries, 0 to 1972
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Unnamed: 0                    1973 non-null   int64 
 1   Brands                        1973 non-null   object
 2   Styles                        1973 non-null   object
 3   Prices                        1973 non-null   object
 4   Reduced_Prices                1973 non-null   object
 5   Available_in_Multiple_Colors  1973 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 92.6+ KB


In [214]:
# Menampilkan ringkasan statistik data
df_sepatu.describe()

Unnamed: 0.1,Unnamed: 0,Available_in_Multiple_Colors
count,1973.0,1973.0
mean,1544.992904,0.194627
std,846.214605,0.396014
min,0.0,0.0
25%,826.0,0.0
50%,1606.0,0.0
75%,2254.0,0.0
max,2941.0,1.0


In [215]:
# Untuk mengetahui isi informasi dari sepatu yang ada di dalam dataframe dengan menginputkan index yang kemudian ditampilkan
def print_datasepatu(index):
    hasil = df_sepatu[df_sepatu.index == index][['Brands', 'Styles', 'Prices', 'Available_in_Multiple_Colors']].values[0]
    if len(hasil) > 0:
        print('Nama Brand:', hasil[0])
        print('Model:', hasil[1]) 
        print('Harga:', hasil[2])
        if hasil[3] == 0:
            print('Tersedia Berbagai Warna: Tidak')
        else:
            print('Tersedia Berbagai Warna: Ya')

In [216]:
# Memanggil fungsi print_datasepatu untuk menampilkan sepatu sesuai index yang di-inputkan
# Menampilkan data sepatu yang memiliki index 24
print_datasepatu(24) 

Nama Brand: adidas
Model: Galaxy 4
Harga: 275 
Tersedia Berbagai Warna: Tidak


In [217]:
# Text processing pada kolom Brands dan Styles agar nanti dapat diproses menjadi angka dengan TF-IDF dan Cosine Similarity
clean_spcl = re.compile('[/(){}\[\]\|@,;]')
clean_symbol = re.compile('[^0-9a-z #+_]')
stopworda = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = clean_spcl.sub(' ', text)
    text = clean_symbol.sub('', text)
    text = ' '.join(word for word in text.split() if word not in stopworda) # hapus stopword dari kolom Styles
    return text

# Buat kolom tambahan untuk data Brands dan Styles yang telah dibersihkan   
df_sepatu['brands'] = df_sepatu['Brands'].apply(clean_text)
df_sepatu['styles'] = df_sepatu['Styles'].apply(clean_text)

# Menggabungkan kolom Brands dan Styles
def sepatu(x):
    return ''.join(x['brands']) + ' '  + ''.join(x['styles'])

# Menyimpan ke variabel baru gabungan kolom Brands dan Styles
df_sepatu['desc_clean'] = df_sepatu.apply(sepatu, axis=1)
df_sepatu.head()

Unnamed: 0.1,Unnamed: 0,Brands,Styles,Prices,Reduced_Prices,Available_in_Multiple_Colors,brands,styles,desc_clean
0,0,adidas,Nemeziz 19.3 TF,399,399 SAR,0,adidas,nemeziz 193 tf,adidas nemeziz 193 tf
1,1,adidas,Pro Next 2019,375,375 SAR,0,adidas,pro next 2019,adidas pro next 2019
2,2,Fila,Drifter Haze Slides,155,155 SAR,0,fila,drifter haze slides,fila drifter haze slides
3,3,Robert Wood,Leather Cross Strap Sandals,159,159 SAR,0,robert wood,leather cross strap sandals,robert wood leather cross strap sandals
4,4,Nike,Air Versitile IV,399,399 SAR,0,nike,air versitile iv,nike air versitile iv


In [218]:
# Untuk mengetahui isi informasi dari sepatu setelah data dibersihkan dengan menginputkan index yang kemudian ditampilkan
def print_datasepatu_clean(index):
    hasil = df_sepatu[df_sepatu.index == index][['Brands', 'Styles', 'Prices', 'Available_in_Multiple_Colors']].values[0]
    if len(hasil) > 0:
        print('Nama Brand:', hasil[0])
        print('Model:', hasil[1]) 
        print('Harga:', hasil[2])
        if hasil[3] == 0:
            print('Tersedia Berbagai Warna: Tidak')
        else:
            print('Tersedia Berbagai Warna: Ya')

In [219]:
# Memanggil fungsi print_datasepatu_clean untuk menampilkan sepatu sesuai index yang di-inputkan
# Menampilkan data sepatu yang memiliki index 21
print_datasepatu_clean(21)

Nama Brand: Seventy Five
Model: Graze Sneakers
Harga: 119 
Tersedia Berbagai Warna: Ya


# 2 Feature : Brands dan Styles

In [220]:
# Perhitungan TF-IDF
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df_sepatu['desc_clean'])
tfidf_matrix.shape

(1973, 6925)

In [221]:
# Perhitungan Cosine Similarity untuk diubah ke matriks
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim

array([[1.        , 0.0312039 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0312039 , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.07941065,
        0.09740071],
       [0.        , 0.        , 0.        , ..., 0.07941065, 1.        ,
        0.14731238],
       [0.        , 0.        , 0.        , ..., 0.09740071, 0.14731238,
        1.        ]])

In [222]:
# Untuk membuat set index utama di kolom Styles
indices = pd.Series(df_sepatu.Styles)

# Untuk menampilkan isi dari indices namun hanya 25 yang ditampilkan
indices[:25]

0                 Nemeziz 19.3 TF
1                   Pro Next 2019
2             Drifter Haze Slides
3     Leather Cross Strap Sandals
4                Air Versitile IV
5                     Md Runner 2
6                   Alex Sneakers
7             Espadrille Slip Ons
8                    Duramo Slide
9                          Zubazz
10                Eezay Flip Flop
11        Go Walk Evolution Ultra
12                 Trent Sneakers
13                    Go Walk Max
14                         Harsen
15                Revolution 4 EU
16                    Renew Arena
17            SB Check Solar Cnvs
18                Padded Sneakers
19                   Oxford Boots
20                   Rogue X Knit
21                 Graze Sneakers
22            Old School Sneakers
23                  Bane Sneakers
24                       Galaxy 4
Name: Styles, dtype: object

In [223]:
# Fungsi untuk merekomendasikan sepatu sesuai hasil dari TF-IDF dan cosine similarity
def rekomendasi_sepatu(masukan, cos_sim = cos_sim):
    
    sepatu_rekomendasi = []
    
    # Mengambil nama style sepatu berdasarkan variabel indicies
    idx = indices[indices == masukan].index[0]

    # Membuat series berdasarkan skor kesamaan
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    # Mengambil index dan dibuat 30 baris rekomendasi terbaik
    top_30_indexes = list(score_series.iloc[1:31].index)
    
    for i in top_30_indexes:
        sepatu_rekomendasi.append(list("Nama Brands:" + df_sepatu.Brands)[i])
        sepatu_rekomendasi.append(list("Nama Style :" + df_sepatu.Styles)[i])
        #sepatu_rekomendasi.append(list(df_sepatu.Prices)[i])
        
    return sepatu_rekomendasi

In [224]:
rekomendasi_sepatu('Zubazz')

['Nama Brands:Skechers',
 'Nama Style :Go Walk 4',
 'Nama Brands:Skechers',
 'Nama Style :Go Walk 4',
 'Nama Brands:Skechers',
 'Nama Style :Go Walk 5',
 'Nama Brands:Skechers',
 'Nama Style :Equalizer',
 'Nama Brands:Skechers',
 'Nama Style :Bounder',
 'Nama Brands:Skechers',
 'Nama Style :Bounder',
 'Nama Brands:Skechers',
 'Nama Style :Bounder',
 'Nama Brands:Skechers',
 'Nama Style :Bounder',
 'Nama Brands:Skechers',
 'Nama Style :Matera',
 'Nama Brands:Skechers',
 'Nama Style :Overhaul',
 'Nama Brands:Skechers',
 'Nama Style :Matera',
 'Nama Brands:Skechers',
 "Nama Style :D'Lites",
 'Nama Brands:Skechers',
 'Nama Style :Matera',
 'Nama Brands:Skechers',
 'Nama Style :Harper',
 'Nama Brands:Skechers',
 'Nama Style :Harper',
 'Nama Brands:Skechers',
 'Nama Style :Harper',
 'Nama Brands:Skechers',
 'Nama Style :Marauder',
 'Nama Brands:Skechers',
 'Nama Style :Quillin',
 'Nama Brands:Skechers',
 'Nama Style :Drafter',
 'Nama Brands:Skechers',
 'Nama Style :Terrabite',
 'Nama Brands:

In [225]:
rekomendasi_sepatu('Eezay Flip Flop')

['Nama Brands:Puma',
 'Nama Style :Cozy Flip',
 'Nama Brands:Havaianas',
 'Nama Style :Classic Flip Flops',
 'Nama Brands:Havaianas',
 'Nama Style :Classic Flip Flops',
 'Nama Brands:Havaianas',
 'Nama Style :Classic Flip Flops',
 'Nama Brands:Havaianas',
 'Nama Style :Classic Flip Flops',
 'Nama Brands:Hype',
 'Nama Style :Casual Flip Flops',
 'Nama Brands:Havaianas',
 'Nama Style :Logo Flip Flops',
 'Nama Brands:Reserved',
 'Nama Style :Essential Flip Flops',
 'Nama Brands:Aldo',
 'Nama Style :Qyllan Flip Flops',
 'Nama Brands:Aldo',
 'Nama Style :Qyllan Flip Flops',
 'Nama Brands:Reebok',
 'Nama Style :Cash Flip Flops',
 'Nama Brands:Reebok',
 'Nama Style :Cash Flip Flops',
 'Nama Brands:Call It Spring',
 'Nama Style :Jickson Flip Flops',
 'Nama Brands:Call It Spring',
 'Nama Style :Jickson Flip Flops',
 'Nama Brands:Only & Sons',
 'Nama Style :Essential Flip Flops',
 'Nama Brands:Only & Sons',
 'Nama Style :Essential Flip Flops',
 'Nama Brands:Hype',
 'Nama Style :Script Flip Flops

In [226]:
rekomendasi_sepatu('Oxford Boots')

['Nama Brands:Robert Wood',
 'Nama Style :Casual Boots',
 'Nama Brands:Robert Wood',
 'Nama Style :Casual Boots',
 'Nama Brands:Robert Wood',
 'Nama Style :Formal Boots',
 'Nama Brands:Robert Wood',
 'Nama Style :Classic Oxford Lace Ups',
 'Nama Brands:Robert Wood',
 'Nama Style :Classic Oxford Lace Ups',
 'Nama Brands:Robert Wood',
 'Nama Style :Chukka Boots',
 'Nama Brands:Robert Wood',
 'Nama Style :Classic Brogue Oxford Lace Ups',
 'Nama Brands:Robert Wood',
 'Nama Style :Classic Brogue Oxford Lace Ups',
 'Nama Brands:Robert Wood',
 'Nama Style :Perforated Toe Cap Oxford Lace Ups',
 'Nama Brands:Robert Wood',
 'Nama Style :Classic Moccasins',
 'Nama Brands:Robert Wood',
 'Nama Style :Classic Moccasins',
 'Nama Brands:Robert Wood',
 'Nama Style :Classic Moccasins',
 'Nama Brands:Robert Wood',
 'Nama Style :Classic Moccasins',
 'Nama Brands:Robert Wood',
 'Nama Style :Faux Suede and Leather Chelsea Boots',
 'Nama Brands:Robert Wood',
 'Nama Style :Casual Loafers',
 'Nama Brands:Rober

# 1 Feature : Styles

In [227]:
# Perhitungan TF-IDF
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix1 = tf1.fit_transform(df_sepatu['styles'])
tfidf_matrix1.shape

(1973, 4298)

In [228]:
# Perhitungan Cosine Similarity untuk diubah ke matriks
cos_sim1 = cosine_similarity(tfidf_matrix1, tfidf_matrix1)
cos_sim1

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.07629403],
       [0.        , 0.        , 0.        , ..., 0.        , 0.07629403,
        1.        ]])

In [229]:
# Untuk membuat set index utama di kolom Styles
indices1 = pd.Series(df_sepatu.Styles)

In [230]:
# Untuk menampilkan isi dari indices namun hanya 25 yang ditampilkan
indices1[:25]

0                 Nemeziz 19.3 TF
1                   Pro Next 2019
2             Drifter Haze Slides
3     Leather Cross Strap Sandals
4                Air Versitile IV
5                     Md Runner 2
6                   Alex Sneakers
7             Espadrille Slip Ons
8                    Duramo Slide
9                          Zubazz
10                Eezay Flip Flop
11        Go Walk Evolution Ultra
12                 Trent Sneakers
13                    Go Walk Max
14                         Harsen
15                Revolution 4 EU
16                    Renew Arena
17            SB Check Solar Cnvs
18                Padded Sneakers
19                   Oxford Boots
20                   Rogue X Knit
21                 Graze Sneakers
22            Old School Sneakers
23                  Bane Sneakers
24                       Galaxy 4
Name: Styles, dtype: object

In [231]:
# Fungsi untuk merekomendasikan sepatu sesuai hasil dari TF-IDF dan cosine similarity
def rekomendasi_sepatu_1(masukan, cos_sim1 = cos_sim1):
    
    sepatu_rekomendasi_1 = []
    
    # Mengambil nama style sepatu berdasarkan variabel indicies
    idx1 = indices1[indices1 == masukan].index[0]

    # Membuat series berdasarkan skor kesamaan
    score_series1 = pd.Series(cos_sim1[idx1]).sort_values(ascending = False)

    # Mengambil index dan dibuat 30 baris rekomendasi terbaik
    top_30_indexes1 = list(score_series1.iloc[1:31].index)
    
    for i in top_30_indexes1:
        sepatu_rekomendasi_1.append(list("Nama Brands :" + df_sepatu.Brands)[i])
        sepatu_rekomendasi_1.append(list("Nama Style :" + df_sepatu.Styles)[i])
        #sepatu_rekomendasi_1.append(list(df_sepatu.Prices)[i])
        
    return sepatu_rekomendasi_1

In [232]:
rekomendasi_sepatu_1('Zubazz')

['Nama Brands :FitFlop',
 'Nama Style :Ryker Sandals',
 'Nama Brands :Skechers',
 'Nama Style :Relven',
 'Nama Brands :Seventy Five',
 'Nama Style :Check Print Soft Bedroom Slippers',
 'Nama Brands :Tommy Hilfiger',
 'Nama Style :Lightweight Knit Flag Sneakers',
 'Nama Brands :Lacoste',
 'Nama Style :Maricle Slipons',
 'Nama Brands :Tommy Jeans',
 'Nama Style :Plimsolls  Slip Ons',
 'Nama Brands :Burton',
 'Nama Style :Casual Slip Ons',
 'Nama Brands :Burton',
 'Nama Style :Casual Slip Ons',
 'Nama Brands :Superdry',
 'Nama Style :Trophy 2.0 Flip Flops',
 'Nama Brands :Call It Spring',
 'Nama Style :Ybiari Slip Ons',
 'Nama Brands :Ecco',
 'Nama Style :Dress Moccasins',
 'Nama Brands :Skechers',
 'Nama Style :Go Run 600',
 'Nama Brands :Jack & Jones',
 'Nama Style :Thai Sneakers',
 'Nama Brands :Jack & Jones',
 'Nama Style :Nimbus Canvas Mix Sneakers',
 'Nama Brands :Skechers',
 "Nama Style :D'Lites",
 'Nama Brands :adidas Originals',
 'Nama Style :Swift Run',
 'Nama Brands :Brave Soul

In [233]:
rekomendasi_sepatu_1('Eezay Flip Flop')

['Nama Brands :Puma',
 'Nama Style :Cozy Flip',
 'Nama Brands :Hype',
 'Nama Style :Casual Flip Flops',
 'Nama Brands :Emporio Armani',
 'Nama Style :Casual Flip Flops',
 'Nama Brands :Jack & Jones',
 'Nama Style :Logo Flip Flops',
 'Nama Brands :Havaianas',
 'Nama Style :Logo Flip Flops',
 'Nama Brands :Jack & Jones',
 'Nama Style :Logo Flip Flops',
 'Nama Brands :Havaianas',
 'Nama Style :Classic Flip Flops',
 'Nama Brands :Havaianas',
 'Nama Style :Classic Flip Flops',
 'Nama Brands :Havaianas',
 'Nama Style :Classic Flip Flops',
 'Nama Brands :Havaianas',
 'Nama Style :Classic Flip Flops',
 'Nama Brands :Only & Sons',
 'Nama Style :Essential Flip Flops',
 'Nama Brands :Reserved',
 'Nama Style :Essential Flip Flops',
 'Nama Brands :Only & Sons',
 'Nama Style :Essential Flip Flops',
 'Nama Brands :Brave Soul',
 'Nama Style :Striped Flip Flops',
 'Nama Brands :Hype',
 'Nama Style :Script Flip Flops',
 'Nama Brands :Hype',
 'Nama Style :Script Flip Flops',
 'Nama Brands :Havaianas',
 '

In [234]:
rekomendasi_sepatu_1('Oxford Boots')

['Nama Brands :Seventy Five',
 'Nama Style :Oxford Sneakers',
 'Nama Brands :Seventy Five',
 'Nama Style :Oxford Sneakers',
 'Nama Brands :Robert Wood',
 'Nama Style :Casual Boots',
 'Nama Brands :J.Crew',
 'Nama Style :Casual Boots',
 'Nama Brands :Robert Wood',
 'Nama Style :Casual Boots',
 'Nama Brands :Tommy Hilfiger',
 'Nama Style :Casual Boots',
 'Nama Brands :Robert Wood',
 'Nama Style :Formal Boots',
 'Nama Brands :Burton',
 'Nama Style :Formal Boots',
 'Nama Brands :Jack & Jones',
 'Nama Style :Nubuck Boots',
 'Nama Brands :Robert Wood',
 'Nama Style :Chukka Boots',
 'Nama Brands :Timberland',
 'Nama Style :Killington 6" Boots',
 'Nama Brands :Timberland',
 'Nama Style :Killington 6 In Boots',
 'Nama Brands :Jack & Jones',
 'Nama Style :Stoke  Boots',
 'Nama Brands :Call It Spring',
 'Nama Style :Malsburg Boots',
 'Nama Brands :Jack & Jones',
 'Nama Style :Albany Boots',
 'Nama Brands :Kurt Geiger London',
 'Nama Style :Billington Boots',
 'Nama Brands :Aldo',
 'Nama Style :Bo