##### BUILDING A CONTENT-BASED RECOMMENDER SYSTEM USING K-NEAREST NEIGHBORS AND A NAIVE BAYES CLASSIFIER ON A CUSTOM
##### DATASET.

In [1]:
### IMPORTING THE LIBRARIES
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
# Suppress every warning raised after this point (all categories, all modules).
# NOTE(review): this also hides any pandas / scikit-learn warnings emitted by later cells --
# confirm that a blanket filter is really wanted.
warnings.filterwarnings('ignore')

In [2]:
### DEFINING THE DATA MANUALLY
# One row per song: six binary keyword indicators plus the user's rating.
# Songs 7 and 8 have no rating (NaN); every other song is rated LIKE or DISLIKE.
song_ids = [1, 2, 3, 4, 5, 6, 7, 8]

data = dict(
    DRUMS=[1, 1, 0, 0, 0, 0, 0, 1],
    GUITAR=[1, 1, 1, 0, 1, 0, 0, 0],
    BEAT=[1, 0, 1, 0, 0, 0, 0, 1],
    CLASSICAL=[0, 0, 0, 1, 1, 1, 1, 0],
    SYMPHONY=[0, 0, 0, 1, 0, 1, 0, 0],
    ORCHESTRA=[0, 1, 0, 1, 1, 0, 0, 0],
    RATING=['DISLIKE', 'DISLIKE', 'DISLIKE', 'LIKE', 'LIKE', 'LIKE', np.nan, np.nan],
)

df = pd.DataFrame(data, index=song_ids)
df.index.name = 'SONG_ID'
df


Unnamed: 0_level_0,DRUMS,GUITAR,BEAT,CLASSICAL,SYMPHONY,ORCHESTRA,RATING
SONG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,1,1,0,0,0,DISLIKE
2,1,1,0,0,0,1,DISLIKE
3,0,1,1,0,0,0,DISLIKE
4,0,0,0,1,1,1,LIKE
5,0,1,0,1,0,1,LIKE
6,0,0,0,1,1,0,LIKE
7,0,0,0,1,0,0,
8,1,0,1,0,0,0,


In [3]:
### FEATURE SCALING: LOG-NORMALISED FREQUENCY [f(x) = log(x + 1)] FOR THE TF MATRIX
# Every column except the last one ('RATING') is replaced by log(value + 1);
# the original frame `df` is left untouched.
feature_columns = df.columns[:-1]
df_scaled = df.assign(**{name: np.log(df[name] + 1) for name in feature_columns})
df_scaled

Unnamed: 0_level_0,DRUMS,GUITAR,BEAT,CLASSICAL,SYMPHONY,ORCHESTRA,RATING
SONG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.693147,0.693147,0.693147,0.0,0.0,0.0,DISLIKE
2,0.693147,0.693147,0.0,0.0,0.0,0.693147,DISLIKE
3,0.0,0.693147,0.693147,0.0,0.0,0.0,DISLIKE
4,0.0,0.0,0.0,0.693147,0.693147,0.693147,LIKE
5,0.0,0.693147,0.0,0.693147,0.0,0.693147,LIKE
6,0.0,0.0,0.0,0.693147,0.693147,0.0,LIKE
7,0.0,0.0,0.0,0.693147,0.0,0.0,
8,0.693147,0.0,0.693147,0.0,0.0,0.0,


In [4]:
#### COMPUTING THE SMOOTHED IDF VALUES [idf_i = log(n/n_i + 1)] FOR EACH KEYWORD OR FEATURE
# n is the number of songs and n_<keyword> the number of songs containing that keyword.
# NOTE(review): n = 6 equals the number of rated songs, while the counts for DRUMS, BEAT and
# CLASSICAL below equal the number of 1s over all eight songs in `data` (only 2, 2 and 3 of
# those fall in songs 1-6) -- confirm which document set is intended. The numbers are kept
# as they were so that every downstream value stays the same.
n = 6
n_drums = 3
n_guitar = 4
n_beat = 3
n_classical = 4
n_symphony = 2
n_orch = 3

# Keyword -> document count, listed in the same order as the feature columns of the TF matrix.
keyword_counts = {
    'DRUMS': n_drums,
    'GUITAR': n_guitar,
    'BEAT': n_beat,
    'CLASSICAL': n_classical,
    'SYMPHONY': n_symphony,
    'ORCHESTRA': n_orch,
}
idf_by_keyword = {
    keyword: np.log((n / count) + 1) for keyword, count in keyword_counts.items()
}

# Only the first line is printed without a leading blank line.
for position, (keyword, idf) in enumerate(idf_by_keyword.items()):
    prefix = "" if position == 0 else "\n"
    print(prefix + "IDF VALUE FOR " + keyword + ":", idf)

# Individual names kept for any later cell that refers to them.
idf_drums = idf_by_keyword['DRUMS']
idf_guitar = idf_by_keyword['GUITAR']
idf_beat = idf_by_keyword['BEAT']
idf_classical = idf_by_keyword['CLASSICAL']
idf_symphony = idf_by_keyword['SYMPHONY']
idf_orch = idf_by_keyword['ORCHESTRA']

# Built from the computed values instead of a hand-copied literal array,
# so it cannot drift from the counts above.
idf_values = np.array(list(idf_by_keyword.values()))
print("\nIDF VALUES ARE:", idf_values)

IDF VALUE FOR DRUMS: 1.0986122886681098

IDF VALUE FOR GUITAR: 0.9162907318741551

IDF VALUE FOR BEAT: 1.0986122886681098

IDF VALUE FOR CLASSICAL: 0.9162907318741551

IDF VALUE FOR SYMPHONY: 1.3862943611198906

IDF VALUE FOR ORCHESTRA: 1.0986122886681098

IDF VALUES ARE: [1.09861229 0.91629073 1.09861229 0.91629073 1.38629436 1.09861229]


In [5]:
#### KEEPING ONLY THE FEATURE COLUMNS FOR PREPROCESSING
X = df_scaled.drop('RATING', axis=1)

# Printed side by side so the column count of X can be compared with the number of IDF weights.
feature_shape = X.shape
idf_count = len(idf_values)
print(feature_shape)
print(idf_count)

(8, 6)
6


In [6]:
#### OBTAINING THE FULLY PREPROCESSED TF-IDF MATRIX
# Each feature column of X is multiplied by the IDF weight at the same position in idf_values.
tf_idf = X.mul(idf_values, axis='columns')
tf_idf

Unnamed: 0_level_0,DRUMS,GUITAR,BEAT,CLASSICAL,SYMPHONY,ORCHESTRA
SONG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.7615,0.635124,0.7615,0.0,0.0,0.0
2,0.7615,0.635124,0.0,0.0,0.0,0.7615
3,0.0,0.635124,0.7615,0.0,0.0,0.0
4,0.0,0.0,0.0,0.635124,0.960906,0.7615
5,0.0,0.635124,0.0,0.635124,0.0,0.7615
6,0.0,0.0,0.0,0.635124,0.960906,0.0
7,0.0,0.0,0.0,0.635124,0.0,0.0
8,0.7615,0.0,0.7615,0.0,0.0,0.0


In [7]:
#### ATTACHING THE RATING COLUMN TO THE SCALED TF-IDF MATRIX
# The rating series is placed next to the TF-IDF features as the last column.
target_df = df_scaled.loc[:, 'RATING']
final_tfidf = pd.concat([tf_idf, target_df], axis='columns')
final_tfidf


Unnamed: 0_level_0,DRUMS,GUITAR,BEAT,CLASSICAL,SYMPHONY,ORCHESTRA,RATING
SONG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.7615,0.635124,0.7615,0.0,0.0,0.0,DISLIKE
2,0.7615,0.635124,0.0,0.0,0.0,0.7615,DISLIKE
3,0.0,0.635124,0.7615,0.0,0.0,0.0,DISLIKE
4,0.0,0.0,0.0,0.635124,0.960906,0.7615,LIKE
5,0.0,0.635124,0.0,0.635124,0.0,0.7615,LIKE
6,0.0,0.0,0.0,0.635124,0.960906,0.0,LIKE
7,0.0,0.0,0.0,0.635124,0.0,0.0,
8,0.7615,0.0,0.7615,0.0,0.0,0.0,


In [8]:
### FOR TRAINING, KEEP ONLY THE ROWS THAT ARE COMPLETE
# Any row containing a missing value (here: a missing RATING) is dropped.
df_full = final_tfidf.dropna(axis='index', how='any')
df_full

Unnamed: 0_level_0,DRUMS,GUITAR,BEAT,CLASSICAL,SYMPHONY,ORCHESTRA,RATING
SONG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.7615,0.635124,0.7615,0.0,0.0,0.0,DISLIKE
2,0.7615,0.635124,0.0,0.0,0.0,0.7615,DISLIKE
3,0.0,0.635124,0.7615,0.0,0.0,0.0,DISLIKE
4,0.0,0.0,0.0,0.635124,0.960906,0.7615,LIKE
5,0.0,0.635124,0.0,0.635124,0.0,0.7615,LIKE
6,0.0,0.0,0.0,0.635124,0.960906,0.0,LIKE


In [9]:
### PREPROCESSING, THAT IS, ENCODING THE TARGET COLUMN
# The encoded frame is rebuilt from `final_tfidf` in a single expression instead of assigning
# a column into the frame returned by dropna(). This avoids chained-assignment warnings and
# makes the cell idempotent: on a re-run the encoder is always fitted on the original
# DISLIKE / LIKE strings (encoded as 0 / 1), never on integers that were already encoded.
label_encoder = LabelEncoder()
df_full = final_tfidf.dropna().assign(
    RATING=lambda rated: label_encoder.fit_transform(rated['RATING'])
)
df_full

Unnamed: 0_level_0,DRUMS,GUITAR,BEAT,CLASSICAL,SYMPHONY,ORCHESTRA,RATING
SONG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.7615,0.635124,0.7615,0.0,0.0,0.0,0
2,0.7615,0.635124,0.0,0.0,0.0,0.7615,0
3,0.0,0.635124,0.7615,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.635124,0.960906,0.7615,1
5,0.0,0.635124,0.0,0.635124,0.0,0.7615,1
6,0.0,0.0,0.0,0.635124,0.960906,0.0,1


In [10]:
### SEPARATING THE FEATURES AND TARGET COLUMN
# df_full has seven columns: the six TF-IDF features first, the encoded RATING last.
X = df_full.iloc[:, :-1]
y = df_full.iloc[:, -1]


In [11]:
### SONG_ID 7 AND 8 ARE THE TEST OBJECTS IN THIS DATA WHOSE RATINGS WE NEED TO PREDICT BASED ON THE DISTANCE CALCULATION
### COMPUTING THE EUCLIDEAN DISTANCE BETWEEN TEST OBJECT 7 AND EVERY TRAINING OBJECT (1-6)
# The vectors are the TF-IDF rows as displayed above (rounded to the shown precision).
test_7 = np.array([[0.0000, 0.000000, 0.0000, 0.635124, 0.000000, 0.0000]])

tr_1 = np.array([[0.7615, 0.635124, 0.7615, 0.000000, 0.000000, 0.0000]])
tr_2 = np.array([[0.7615, 0.635124, 0.0000, 0.000000, 0.000000, 0.7615]])
tr_3 = np.array([[0.0000, 0.635124, 0.7615, 0.000000, 0.000000, 0.0000]])
tr_4 = np.array([[0.0000, 0.000000, 0.0000, 0.635124, 0.960906, 0.7615]])
tr_5 = np.array([[0.0000, 0.635124, 0.0000, 0.635124, 0.000000, 0.7615]])
tr_6 = np.array([[0.0000, 0.000000, 0.0000, 0.635124, 0.960906, 0.0000]])

# SONG_ID -> TF-IDF vector and SONG_ID -> known rating of the six rated songs.
train_objects_7 = {1: tr_1, 2: tr_2, 3: tr_3, 4: tr_4, 5: tr_5, 6: tr_6}
train_ratings_7 = {1: 'DISLIKE', 2: 'DISLIKE', 3: 'DISLIKE', 4: 'LIKE', 5: 'LIKE', 6: 'LIKE'}

# One loop replaces six copy-pasted stanzas: distance = sqrt(sum((test - train)^2)).
distances_7 = {}
for train_id, train_vector in train_objects_7.items():
    distances_7[train_id] = np.sqrt(np.sum((test_7 - train_vector) ** 2))
    print(f"DISTANCE BETWEEN TEST OBJECT 7 AND TRAINING OBJECT {train_id}:", distances_7[train_id])

# Individual names kept for any later cell that refers to them.
distance_7_1, distance_7_2, distance_7_3, distance_7_4, distance_7_5, distance_7_6 = [
    distances_7[train_id] for train_id in (1, 2, 3, 4, 5, 6)
]

# The conclusion is derived from the distances rather than typed in by hand:
# the test song inherits the rating of its single nearest training song.
nearest_7 = min(distances_7, key=distances_7.get)
rating_7 = train_ratings_7[nearest_7]

print(f"\nTHE SMALLEST DISTANCE HAPPENS TO BE BETWEEN TEST OBJECT 7 AND TRAIN OBJECT {nearest_7}")
print(f"\nTHE ESTIMATED RATING FOR TEST OBJECT (SONG_ID) 7 GIVEN BY THE USER IS {rating_7}")
if rating_7 == 'LIKE':
    print("\nTHE SYSTEM RECOMMENDS THE SONG WITH ID 7 TO THE TARGET USER")
else:
    print("\nTHE SYSTEM DOES NOT RECOMMEND THE SONG WITH ID 7 TO THE TARGET USER")

DISTANCE BETWEEN TEST OBJECT 7 AND TRAINING OBJECT 1: 1.4023300220532968
DISTANCE BETWEEN TEST OBJECT 7 AND TRAINING OBJECT 2: 1.4023300220532968
DISTANCE BETWEEN TEST OBJECT 7 AND TRAINING OBJECT 3: 1.1775598671626
DISTANCE BETWEEN TEST OBJECT 7 AND TRAINING OBJECT 4: 1.226059782733289
DISTANCE BETWEEN TEST OBJECT 7 AND TRAINING OBJECT 5: 0.991597068055367
DISTANCE BETWEEN TEST OBJECT 7 AND TRAINING OBJECT 6: 0.960906

THE SMALLEST DISTANCE HAPPENS TO BE BETWEEN TEST OBJECT 7 AND TRAIN OBJECT 6

THE ESTIMATED RATING FOR TEST OBJECT (SONG_ID) 7 GIVEN BY THE USER IS LIKE

THE SYSTEM RECOMMENDS THE SONG WITH ID 7 TO THE TARGET USER


In [12]:
### COMPUTING THE EUCLIDEAN DISTANCE BETWEEN TEST OBJECT 8 AND EVERY TRAINING OBJECT (1-6)
# The vectors are the TF-IDF rows as displayed above (rounded to the shown precision).
test_8 = np.array([[0.7615, 0.000000, 0.7615, 0.000000, 0.000000, 0.0000]])

tr_1 = np.array([[0.7615, 0.635124, 0.7615, 0.000000, 0.000000, 0.0000]])
tr_2 = np.array([[0.7615, 0.635124, 0.0000, 0.000000, 0.000000, 0.7615]])
tr_3 = np.array([[0.0000, 0.635124, 0.7615, 0.000000, 0.000000, 0.0000]])
tr_4 = np.array([[0.0000, 0.000000, 0.0000, 0.635124, 0.960906, 0.7615]])
tr_5 = np.array([[0.0000, 0.635124, 0.0000, 0.635124, 0.000000, 0.7615]])
tr_6 = np.array([[0.0000, 0.000000, 0.0000, 0.635124, 0.960906, 0.0000]])

# SONG_ID -> TF-IDF vector and SONG_ID -> known rating of the six rated songs.
train_objects_8 = {1: tr_1, 2: tr_2, 3: tr_3, 4: tr_4, 5: tr_5, 6: tr_6}
train_ratings_8 = {1: 'DISLIKE', 2: 'DISLIKE', 3: 'DISLIKE', 4: 'LIKE', 5: 'LIKE', 6: 'LIKE'}

# One loop replaces six copy-pasted stanzas: distance = sqrt(sum((test - train)^2)).
distances_8 = {}
for train_id, train_vector in train_objects_8.items():
    distances_8[train_id] = np.sqrt(np.sum((test_8 - train_vector) ** 2))
    print(f"DISTANCE BETWEEN TEST OBJECT 8 AND TRAINING OBJECT {train_id}:", distances_8[train_id])

# Individual names kept for any later cell that refers to them.
distance_8_1, distance_8_2, distance_8_3, distance_8_4, distance_8_5, distance_8_6 = [
    distances_8[train_id] for train_id in (1, 2, 3, 4, 5, 6)
]

# The conclusion is derived from the distances rather than typed in by hand:
# the test song inherits the rating of its single nearest training song.
nearest_8 = min(distances_8, key=distances_8.get)
rating_8 = train_ratings_8[nearest_8]

print(f"\nTHE SMALLEST DISTANCE HAPPENS TO BE BETWEEN TEST OBJECT 8 AND TRAIN OBJECT {nearest_8} ")
print(f"\nTHE ESTIMATED RATING FOR TEST OBJECT (SONG_ID) 8 GIVEN BY THE USER IS {rating_8}")
if rating_8 == 'LIKE':
    print("\nTHE SYSTEM RECOMMENDS THE SONG WITH ID 8 TO THE TARGET USER")
else:
    print("\nTHE SYSTEM  DOES NOT RECOMMEND THE SONG WITH ID 8 TO THE TARGET USER")


DISTANCE BETWEEN TEST OBJECT 8 AND TRAINING OBJECT 1: 0.635124
DISTANCE BETWEEN TEST OBJECT 8 AND TRAINING OBJECT 2: 1.2502587713653521
DISTANCE BETWEEN TEST OBJECT 8 AND TRAINING OBJECT 3: 0.991597068055367
DISTANCE BETWEEN TEST OBJECT 8 AND TRAINING OBJECT 4: 1.75110524704028
DISTANCE BETWEEN TEST OBJECT 8 AND TRAINING OBJECT 5: 1.5957480191909998
DISTANCE BETWEEN TEST OBJECT 8 AND TRAINING OBJECT 6: 1.5768599608754101

THE SMALLEST DISTANCE HAPPENS TO BE BETWEEN TEST OBJECT 8 AND TRAIN OBJECT 1 

THE ESTIMATED RATING FOR TEST OBJECT (SONG_ID) 8 GIVEN BY THE USER IS DISLIKE

THE SYSTEM  DOES NOT RECOMMEND THE SONG WITH ID 8 TO THE TARGET USER


#####  BUILDING KNN MODEL TO COMPARE WITH THE ABOVE COMPUTATION RESULT

In [13]:
#### BUILDING AND FITTING THE KNN MODEL
# n_neighbors=1 mirrors the manual computation above, which assigns the rating of the single
# nearest training song. The library default (5 neighbours) would instead take a majority
# vote over five of the six training songs, which is a different procedure.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)

#### IN-SAMPLE PREDICTION (ON THE TRAINING SONGS)
tr_pred = knn.predict(X)
acc_knn = accuracy_score(y, tr_pred)
print("TRAINED ACCURACY IS:", acc_knn)

#### OUT-OF-SAMPLE PREDICTION (THE UNRATED SONGS 7 AND 8)
# The rows are taken straight from the TF-IDF matrix, so they carry the same feature names
# the model was fitted with and need no hand-copied, rounded literals.
test_7_pred = knn.predict(tf_idf.loc[[7]])
test_8_pred = knn.predict(tf_idf.loc[[8]])

# The recommendation text follows the prediction: encoded rating 1 is LIKE, 0 is DISLIKE.
for song_id, prediction in ((7, test_7_pred), (8, test_8_pred)):
    print(f"\nTHE ESTIMATED RATING FOR SONG_ID {song_id} GIVEN BY THE USER IS:", prediction[0])
    if prediction[0] == 1:
        print("ACCORDING KNN MODEL, THE SYSTEM RECOMMENDS AS THE RATING IS LIKE")
    else:
        print("ACCORDING KNN MODEL, THE SYSTEM DOES NOT RECOMMEND AS THE RATING IS DISLIKE")


TRAINED ACCURACY IS: 1.0

THE ESTIMATED RATING FOR SONG_ID 7 GIVEN BY THE USER IS: 1
ACCORDING KNN MODEL, THE SYSTEM RECOMMENDS AS THE RATING IS LIKE

THE ESTIMATED RATING FOR SONG_ID 8 GIVEN BY THE USER IS: 0
ACCORDING KNN MODEL, THE SYSTEM DOES NOT RECOMMEND AS THE RATING IS DISLIKE


##### THEREFORE, WE CAN STATE THAT THE KNN RESULT IS EXACTLY THE SAME AS THE RESULT OF THE MANUAL DISTANCE COMPUTATION.


##### BUILDING NAIVE BAYES CLASSIFIER MODEL TO COMPARE WITH THE ABOVE DISTANCE COMPUTATION RESULT

In [14]:
#### BUILDING AND FITTING THE NAIVE BAYES CLASSIFIER MODEL
naive = GaussianNB()
naive.fit(X, y)

#### IN-SAMPLE PREDICTION (ON THE TRAINING SONGS)
train_pred = naive.predict(X)
acc_naive = accuracy_score(y, train_pred)
print("TRAINED ACCURACY IS:", acc_naive)

#### OUT-OF-SAMPLE PREDICTION (THE UNRATED SONGS 7 AND 8)
# The rows are taken straight from the TF-IDF matrix, so they carry the same feature names
# the model was fitted with and need no hand-copied, rounded literals.
test_7_pred_naive = naive.predict(tf_idf.loc[[7]])
test_8_pred_naive = naive.predict(tf_idf.loc[[8]])

# The messages name the Naive Bayes model (they previously said "KNN MODEL") and follow the
# prediction: encoded rating 1 is LIKE, 0 is DISLIKE.
for song_id, prediction in ((7, test_7_pred_naive), (8, test_8_pred_naive)):
    print(f"\nTHE ESTIMATED RATING FOR SONG_ID {song_id} GIVEN BY THE USER IS:", prediction[0])
    if prediction[0] == 1:
        print("ACCORDING NAIVE BAYES MODEL, THE SYSTEM RECOMMENDS AS THE RATING IS LIKE")
    else:
        print("ACCORDING NAIVE BAYES MODEL, THE SYSTEM DOES NOT RECOMMEND AS THE RATING IS DISLIKE")


TRAINED ACCURACY IS: 1.0

THE ESTIMATED RATING FOR SONG_ID 7 GIVEN BY THE USER IS: 1
ACCORDING KNN MODEL, THE SYSTEM RECOMMENDS AS THE RATING IS LIKE

THE ESTIMATED RATING FOR SONG_ID 8 GIVEN BY THE USER IS: 0
ACCORDING KNN MODEL, THE SYSTEM DOES NOT RECOMMEND AS THE RATING IS DISLIKE


##### THEREFORE, WE CAN STATE THAT THE NAIVE BAYES CLASSIFIER RESULT IS EXACTLY THE SAME AS THE RESULT OF THE MANUAL DISTANCE COMPUTATION.
