In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Data dari csv

In [None]:
# Load the Udemy course catalogue from the Kaggle input directory.
dataset = pd.read_csv('../input/udemy-courses/udemy_courses.csv')

## Check Data

In [None]:
# Preview the first five rows of the raw data.
dataset.head(5)

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
dataset.info()

## Pembersihan Data

Cek kolom yang memiliki data null.

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

Mengonversi kolom published_timestamp ke format datetime dan membuat dua kolom baru, yaitu time dan year.

In [None]:
# Parse the 'published_timestamp' strings (format '%Y-%m-%dT%H:%M:%SZ') in one
# vectorized call instead of a row-by-row strptime loop. The trailing 'Z' is
# matched as a literal character, so the resulting datetimes are timezone-naive,
# exactly as datetime.strptime produced them.
dataset['time'] = pd.to_datetime(
    dataset['published_timestamp'], format='%Y-%m-%dT%H:%M:%SZ'
)

# Derive the publication year from the parsed datetime column.
dataset['year'] = dataset['time'].dt.year
dataset.head()

Menghapus kolom published_timestamp karena sudah digantikan oleh kolom time dan year.

In [None]:
# Drop the raw timestamp string column. Re-binding the result (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError for the already-removed column.
dataset = dataset.drop(columns='published_timestamp', errors='ignore')
dataset.head()

## Initial Observation

Melihat ringkasan statistik data yang bisa dijadikan informasi.

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
dataset.describe().T

* Harga course mulai dari 0-200 dolar
* Ada course yang punya 0 subscriber
* Kebanyakan course memiliki 2500 subscriber
* Jumlah maksimal review sekitar 10 kali lebih kecil dari jumlah maksimal subscriber
* Ada course yang punya durasi 0 jam

# Content Based Filtering

# Cara 1

## Content : Judul

In [None]:
# Import TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the TF-IDF vectorizer object, dropping English stop words such as 'the' and 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Build the TF-IDF matrix by fitting the vectorizer on the course titles and transforming them
tfidf_matrix = tfidf.fit_transform(dataset['course_title'])

# Shape of tfidf_matrix: (number of courses, number of vocabulary terms)
tfidf_matrix.shape

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the pairwise similarity matrix between all course titles.
# linear_kernel is a plain dot product; it equals cosine similarity here only
# because TfidfVectorizer L2-normalises every row by default (norm='l2').
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Build a reverse map from course title to row index.
# Series.drop_duplicates() compares the *values* (the row numbers, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# index labels instead, keeping the first occurrence, so that indices[title]
# always yields a single row number rather than a Series.
indices = pd.Series(dataset.index, index=dataset['course_title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
# Takes a course title and returns the courses most similar to it.
def get_recommendations(title='none', cosine_sim='none', top_n=5):
    """Return the titles of the courses most similar to ``title``.

    Parameters
    ----------
    title : str
        Course title to look up in the notebook-level ``indices`` map.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of course ``i``
        against every course. The ``'none'`` default is only a placeholder:
        a real matrix must be passed.
    top_n : int, default 5
        Number of recommendations to return (the original code always
        returned 5, although its comments said 10).

    Returns
    -------
    pandas.Series
        ``course_title`` values of the ``top_n`` highest-scoring courses,
        best match first, taken from the notebook-level ``dataset``.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Row index of the requested course.
    idx = indices[title]

    # A title that occurs more than once makes the lookup return a Series;
    # use its first occurrence so that cosine_sim[idx] is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every *other* course with its similarity score. The query course is
    # excluded explicitly instead of assuming it sorts into first place, which
    # is not guaranteed when scores tie or the query vector is all zeros.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Highest similarity first; sorted() is stable, so ties keep row order.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Row indices of the top_n most similar courses.
    course_indices = [i for i, _ in sim_scores[:top_n]]

    return dataset['course_title'].iloc[course_indices]

Mengimplementasikan fungsi rekomendasi di atas

In [None]:
# Recommend courses for this title using the title-only TF-IDF similarities.
get_recommendations('Ultimate Investment Banking Course', cosine_sim=cosine_sim)

### Kesimpulan : Hasil rekomendasi yang diberikan masih kurang pas (karena course yang direkomendasikan ada yang tidak memiliki keterkaitan)

# Cara 2 

## Content : Judul, level dan subject (Hasil Rekomendasi sudah pas)

Fungsi untuk menghilangkan spasi dan mengubah teks menjadi huruf kecil pada fitur

In [None]:
def clean_data(x):
    """Strip spaces from, and lowercase, a string or a list of strings.

    A string is returned with every space removed and all letters lowercased.
    A list is returned as a new list with the same treatment applied to each
    item. Any other value (for example a missing value) yields an empty string.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    # Neither text nor a list of text: nothing usable to normalise.
    return ''

Implementasi fungsi di atas untuk membersihkan fitur url, level, dan subject

In [None]:
# Normalise the selected text columns element by element with clean_data.
features = ['url', 'level', 'subject']

for feature in features:
    dataset[feature] = dataset[feature].map(clean_data)

Fungsi untuk menggabungkan isi dari course_title, level dan subject yang dijadikan metadata dan nantinya dipakai untuk perhitungan kemiripannya

In [None]:
def create_soup(x):
    """Build the metadata string for one course row.

    Concatenates the row's ``course_title``, ``level`` and ``subject`` values,
    in that order, separated by single spaces. Each value is passed through
    ``''.join`` first, so a list of strings is collapsed into one token while
    a plain string is left unchanged.
    """
    parts = (''.join(x[key]) for key in ('course_title', 'level', 'subject'))
    return ' '.join(parts)
# Build the metadata string for every row, then preview the first ten rows.
dataset['soup'] = dataset.apply(create_soup, axis='columns')
dataset.head(n=10)

In [None]:
# Import CountVectorizer and build the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the vectorizer. This line was commented out, which left `count`
# undefined and made the fit below raise NameError on a fresh kernel.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(dataset['soup'])
count_matrix

# Alternative kept for reference: TF-IDF weighting on the same 'soup' column.
# tfidf = TfidfVectorizer(stop_words='english')
# tfidf_matrix2 = tfidf.fit_transform(dataset['soup'])
# tfidf_matrix2.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the count matrix.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
cosine_sim2

In [None]:
# Reset the DataFrame index and build the title -> row index reverse map.
dataset2 = dataset.reset_index()
indices = pd.Series(dataset2.index, index=dataset2['course_title'])

# Keep only the first row for each repeated title: a duplicated label would
# make indices[title] return a Series instead of a single row number.
indices = indices[~indices.index.duplicated(keep='first')]

## Hasil Rekomendasi

In [None]:
# Recommendations based on the title + level + subject similarities.
rec = get_recommendations(
    'Ultimate Investment Banking Course',
    cosine_sim=cosine_sim2,
)
rec

In [None]:
# Recommendations based on the title + level + subject similarities.
rec = get_recommendations(
    'The Ultimate Web Development Course',
    cosine_sim=cosine_sim2,
)
rec