<a href="https://colab.research.google.com/github/robbieadam/sistemrekomendasi/blob/main/content_based_filtering_magang.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Contoh Content-Based Filtering: Rekomendasi Modul Pembelajaran Magang**

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import random

# Location of the internship-programme dataset (CSV hosted on GitHub).
DATA_URL = "https://raw.githubusercontent.com/robbieadam/sistemrekomendasi/main/dataset/program_magang.csv"

# Load the dataset into the notebook-level frame and preview the first rows.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,mitra,alamat,modul
0,Robbie,Banyumanik Semarang Jawa Tengah,Text Mining Machine Learning Basis Data ...
1,PT. IlmuKomputerCom Braindevs Sistema,"Menara Bidakara 1 Lantai 2,\r\nJl. Gatot Subr...",Software Engineering Time Management Commu...
2,PT Mitra Integrasi Informatika,"APL Tower Lantai 37, Jl. Letjen S. Parman Kav....",Documentation Skills Coding Analytic Skill...
3,PT Semesta Integrasi Digital,"Jl. Jeruk Purut Dalam No.33, RT.6/RW.3, \r\nCi...",Software Engineering Software Testing Rise...
4,PT Telkom Indonesia (Persero) Tbk,"Jl Japati 1, Bandung",Software Development Component Integration ...


**1. Ikhtisar**

In [3]:
# Summary statistics per column (count / unique / top / freq for these text columns).
df.describe()

Unnamed: 0,mitra,alamat,modul
count,11,11,11
unique,10,11,11
top,PT Mitra Integrasi Informatika,Banyumanik Semarang Jawa Tengah,Text Mining Machine Learning Basis Data ...
freq,2,1,1


In [4]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   mitra   11 non-null     object
 1   alamat  11 non-null     object
 2   modul   11 non-null     object
dtypes: object(3)
memory usage: 392.0+ bytes


**2. Tampilan Modul Pembelajaran (Sebelum Preprocessing)**

In [5]:
def print_description(index, data=None):
    """Print the raw learning-module text, partner and address for one row.

    Parameters
    ----------
    index : hashable
        Index label of the row to show (compared against ``data.index``).
    data : pandas.DataFrame, optional
        Frame providing the 'modul', 'mitra' and 'alamat' columns. When
        omitted, the notebook-level ``df`` is used, looked up at call time.

    Returns
    -------
    None
        The three fields are printed. Nothing is printed when no row
        carries the requested label.

    Notes
    -----
    'mitra' must be a regular column of the frame, so this cannot be used on
    a frame that has been re-indexed by 'mitra' (the column is then gone).
    """
    frame = df if data is None else data
    rows = frame.loc[frame.index == index, ['modul', 'mitra', 'alamat']]

    # Test for a match *before* touching the first row: an unknown label
    # yields an empty selection, and indexing into it would raise IndexError.
    if rows.empty:
        return

    modul, mitra, alamat = rows.values[0]
    print(modul)
    print('mitra:', mitra)
    print('alamat:', alamat)

In [6]:
# Show the raw (uncleaned) record stored at row 1.
print_description(1)

Software Engineering   Time Management   Communication Skills   Team Work   System Analysis dan Design
mitra: PT. IlmuKomputerCom Braindevs Sistema
alamat: Menara Bidakara 1  Lantai 2,
Jl. Gatot Subroto Kav. 71-73 Pancoran
Jakarta Selatan 12870


In [7]:
# Show the raw (uncleaned) record stored at row 3.
print_description(3)

Software Engineering   Software Testing   Riset dan Analaisis   Koordinasi   Pengembangan Perangkat
mitra: PT Semesta Integrasi Digital
alamat: Jl. Jeruk Purut Dalam No.33, RT.6/RW.3, 
Cilandak Tim., Kec. Ps. Minggu, 
Kota Jakarta Selatan, 
Daerah Khusus Ibukota Jakarta 12560.


In [8]:
# Show the raw (uncleaned) record stored at row 5.
print_description(5)

Quality Control   Database   API   Deployment
mitra: PT Abbauf Mulia Konsultan Teknologi
alamat: APL Tower Central Park 
Lantai 19 Unit T7, Jl. Letjen S. Parman 
No.Kavling 28, RT.12/RW.6, 
Tj. Duren Sel., Kec. Grogol petamburan, 
Kota Jakarta Barat, 
Daerah Khusus Ibukota Jakarta 11470


**3. Text Preprocessing**

In [9]:
import nltk
# Fetch the NLTK stopword corpus (network access; writes to the local nltk_data directory).
# NOTE(review): no visible code in this notebook actively uses the stopword list —
# confirm whether this download is still needed.
nltk.download('stopwords')
# Separator punctuation: each of / ( ) { } [ ] | @ , ; is replaced by one space.
# Raw strings keep the regex escapes (\[ \] \|) from being read as (invalid)
# Python string escapes, which newer interpreters warn about.
clean_spcl = re.compile(r'[/(){}\[\]\|@,;]')
# Everything that is not a digit, a lowercase ASCII letter, a space, '#', '+'
# or '_' is removed outright (this includes tabs, newlines and accented letters).
clean_symbol = re.compile(r'[^0-9a-z #+_]')


def clean_text(text):
    """Normalise a free-text field for vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. replace separator punctuation (``clean_spcl``) with a space;
      3. delete every remaining character outside ``[0-9a-z #+_]``
         (``clean_symbol``).

    Whitespace is not collapsed, so runs of spaces in the input (or created
    by step 2) are preserved. No stopword removal is done here.

    Parameters
    ----------
    text : str or None or float('nan')
        Value to clean. Other non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text; an empty string for missing values (None / NaN).
    """
    # Missing cells reach this function as None or float NaN when it is
    # applied to a DataFrame column; treat them as empty text rather than
    # failing on ``.lower()``. ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = clean_spcl.sub(' ', text)
    return clean_symbol.sub('', text)
    
# Derive the cleaned module text column, one value per row of 'modul'.
df['modul_pembelajaran_clean'] = df['modul'].map(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Preview the frame, now including the 'modul_pembelajaran_clean' column.
df.head()

Unnamed: 0,mitra,alamat,modul,modul_pembelajaran_clean
0,Robbie,Banyumanik Semarang Jawa Tengah,Text Mining Machine Learning Basis Data ...,text mining machine learning basis data ...
1,PT. IlmuKomputerCom Braindevs Sistema,"Menara Bidakara 1 Lantai 2,\r\nJl. Gatot Subr...",Software Engineering Time Management Commu...,software engineering time management commu...
2,PT Mitra Integrasi Informatika,"APL Tower Lantai 37, Jl. Letjen S. Parman Kav....",Documentation Skills Coding Analytic Skill...,documentation skills coding analytic skill...
3,PT Semesta Integrasi Digital,"Jl. Jeruk Purut Dalam No.33, RT.6/RW.3, \r\nCi...",Software Engineering Software Testing Rise...,software engineering software testing rise...
4,PT Telkom Indonesia (Persero) Tbk,"Jl Japati 1, Bandung",Software Development Component Integration ...,software development component integration ...


**4. Tampilan Modul Pembelajaran (Setelah Preprocessing)**

In [11]:
# Second view (after preprocessing)
def print_description_clean(index, data=None):
    """Print the cleaned learning-module text, partner and address for one row.

    Parameters
    ----------
    index : hashable
        Index label of the row to show (compared against ``data.index``).
    data : pandas.DataFrame, optional
        Frame providing the 'modul_pembelajaran_clean', 'mitra' and 'alamat'
        columns. When omitted, the notebook-level ``df`` is used, looked up
        at call time.

    Returns
    -------
    None
        The three fields are printed. Nothing is printed when no row
        carries the requested label.

    Notes
    -----
    'mitra' must be a regular column of the frame, so this cannot be used on
    a frame that has been re-indexed by 'mitra' (the column is then gone).
    """
    frame = df if data is None else data
    rows = frame.loc[
        frame.index == index,
        ['modul_pembelajaran_clean', 'mitra', 'alamat'],
    ]

    # Test for a match *before* touching the first row: an unknown label
    # yields an empty selection, and indexing into it would raise IndexError.
    if rows.empty:
        return

    modul_clean, mitra, alamat = rows.values[0]
    print(modul_clean)
    print('mitra:', mitra)
    print('alamat:', alamat)

In [12]:
# Show the cleaned record stored at row 1.
print_description_clean(1)

software engineering   time management   communication skills   team work   system analysis dan design
mitra: PT. IlmuKomputerCom Braindevs Sistema
alamat: Menara Bidakara 1  Lantai 2,
Jl. Gatot Subroto Kav. 71-73 Pancoran
Jakarta Selatan 12870


In [13]:
# Show the cleaned record stored at row 2.
print_description_clean(2)

documentation skills   coding   analytic skills   communication skills   presentation skills
mitra: PT Mitra Integrasi Informatika
alamat: APL Tower Lantai 37, Jl. Letjen S. Parman Kav. 
28, RT.12/RW.6, Tj. Duren Sel., Jakarta Barat, 
Kota Jakarta Barat, 
Daerah Khusus Ibukota Jakarta 11470


In [14]:
# Show the cleaned record stored at row 5.
print_description_clean(5)

quality control   database   api   deployment
mitra: PT Abbauf Mulia Konsultan Teknologi
alamat: APL Tower Central Park 
Lantai 19 Unit T7, Jl. Letjen S. Parman 
No.Kavling 28, RT.12/RW.6, 
Tj. Duren Sel., Kec. Grogol petamburan, 
Kota Jakarta Barat, 
Daerah Khusus Ibukota Jakarta 11470


**5. TF-IDF & Cosine Similarity**

In [15]:
# Index the frame by partner name so rows of the similarity matrix can be
# mapped back to 'mitra' labels. The guard makes the cell safe to re-run:
# once 'mitra' has become the index it is no longer a column, and calling
# set_index('mitra') a second time would raise KeyError.
if 'mitra' in df.columns:
    df.set_index('mitra', inplace=True)

# Word-level TF-IDF over unigrams, bigrams and trigrams of the cleaned text.
# min_df=1 keeps every term (each vocabulary term occurs in at least one
# document), which is what min_df=0 was meant to express; recent scikit-learn
# releases reject an integer min_df below 1 during parameter validation.
# NOTE(review): stop_words='english' only filters English words, while the
# module text also contains Indonesian words (e.g. "dan") — confirm intent.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['modul_pembelajaran_clean'])

# Pairwise cosine similarity between all documents (square, symmetric,
# 1.0 on the diagonal).
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim

array([[1.        , 0.        , 0.        , 0.01990878, 0.020046  ,
        0.        , 0.0303574 , 0.02980632, 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.15171958, 0.12016205, 0.07791703,
        0.        , 0.        , 0.        , 0.02577011, 0.        ,
        0.0484071 ],
       [0.        , 0.15171958, 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.01990878, 0.12016205, 0.        , 1.        , 0.0441957 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.05810861],
       [0.020046  , 0.07791703, 0.        , 0.0441957 , 1.        ,
        0.        , 0.        , 0.        , 0.03822683, 0.        ,
        0.05135387],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.06367502, 0.        , 0.        ,
        0.        ],
       [0.0303574 , 0.        , 0.        , 0.        , 0.

In [16]:
# Lookup table from row position (0..n-1) to the 'mitra' name held in the frame's index.
indices = pd.Series(df.index)
indices.head(50)

0                                    Robbie
1     PT. IlmuKomputerCom Braindevs Sistema
2            PT Mitra Integrasi Informatika
3              PT Semesta Integrasi Digital
4         PT Telkom Indonesia (Persero) Tbk
5       PT Abbauf Mulia Konsultan Teknologi
6                PT Orbit Ventura Indonesia
7              PT Nurul Fikri Cipta Inovasi
8       PT MariBelajar Indonesia \r\nCerdas
9            PT Mitra Integrasi Informatika
10         PT. Impactbyte Teknologi Edukasi
Name: mitra, dtype: object

**6. Modelling**

In [17]:
def _current_cos_sim():
    """Return the notebook-level ``cos_sim`` matrix as it exists at call time."""
    return cos_sim


def recommendations(name, cos_sim=None, top_n=3, names=None):
    """Return the partners whose learning modules are most similar to ``name``.

    Parameters
    ----------
    name : str
        Partner ('mitra') name to look up. If the name occurs more than once,
        the first occurrence is used as the query row.
    cos_sim : 2-D array-like, optional
        Square similarity matrix; row ``i`` holds the scores of item ``i``
        against every item. When omitted, the notebook-level ``cos_sim`` is
        read at call time, so a recomputed matrix is picked up without
        redefining this function.
    top_n : int, optional
        Maximum number of recommendations to return (default 3).
    names : sequence of str, optional
        Item names in the same order as the rows of ``cos_sim``. When
        omitted, the notebook-level ``indices`` series is used.

    Returns
    -------
    list
        Up to ``top_n`` names ordered from most to least similar. The query
        row itself is never included.

    Raises
    ------
    IndexError
        If ``name`` is not present in ``names``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    sim_matrix = _current_cos_sim() if cos_sim is None else cos_sim
    name_list = list(indices if names is None else names)

    # Locate the row position of the requested name.
    matches = [pos for pos, label in enumerate(name_list) if label == name]
    if not matches:
        raise IndexError(f"mitra {name!r} not found")
    idx = matches[0]

    # Rank every *other* row by similarity. The query row is dropped
    # explicitly rather than assumed to sort first: with duplicate or tied
    # documents another row can also score 1.0 and take the first position.
    # A stable sort keeps tied rows in their original order.
    scores = (
        pd.Series(sim_matrix[idx])
        .drop(idx)
        .sort_values(ascending=False, kind='stable')
    )

    return [name_list[pos] for pos in scores.index[:top_n]]

**7. Prediksi**

In [18]:
# Partners whose learning modules are most similar to those of 'Robbie'.
recommendations('Robbie')

['PT Orbit Ventura Indonesia',
 'PT Nurul Fikri Cipta Inovasi',
 'PT Telkom Indonesia (Persero) Tbk']

In [19]:
# Partners whose learning modules are most similar to those of PT Telkom Indonesia.
recommendations('PT Telkom Indonesia (Persero) Tbk')

['PT. IlmuKomputerCom Braindevs Sistema',
 'PT. Impactbyte Teknologi Edukasi',
 'PT Semesta Integrasi Digital']

In [20]:
# Partners whose learning modules are most similar to those of PT Nurul Fikri Cipta Inovasi.
recommendations('PT Nurul Fikri Cipta Inovasi')

['PT Abbauf Mulia Konsultan Teknologi',
 'PT. Impactbyte Teknologi Edukasi',
 'Robbie']