In [2]:
## Mount Google Drive into the Colab VM at /content/drive; the cells below read
## and write their files under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [113]:
import pandas as pd
import numpy as np
import random
import scipy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

## never truncate long cell contents when displaying dataframes
## (None is the supported "no limit" value; the old -1 sentinel is deprecated)
pd.set_option('display.max_colwidth', None)

  # Remove the CWD from sys.path while we load stuff.


## Dataset Generation

In [116]:
path = "/content/drive/MyDrive/HACKATON/RECOMENDATION_SYSTEM/"

classes = ["puzzles", "tennis", "running", "reading", "football", "yoga", "fishing",
            "chess", "cooking", "swimming", "tv", "travel", "baseball", "hockey"]

## seed the generator so that re-running this cell rebuilds exactly the same
## dataset; without it `df` silently drifts away from a vectorizer / tf matrix
## that were fitted on an earlier draw
random.seed(42)

## one synthetic user per row: a sequential id plus 3 to 9 distinct interests
## (named `user_ids` so the builtin id() is not shadowed)
user_ids = list(range(100))
keywords = [random.sample(classes, random.randint(3, 9)) for _ in user_ids]

data = {"id": user_ids,
        "keywords": keywords}

## create a dataframe
df = pd.DataFrame(data)

## preprocessing column: join each keyword list into one space-separated string
df["keywords"] = df["keywords"].apply(" ".join)

print(df.shape)
df.to_csv(path + "keywords_ids.csv", index=False)
df.head()

(100, 2)


Unnamed: 0,id,keywords
0,0,puzzles swimming hockey tennis reading yoga baseball chess running
1,1,travel hockey running tennis
2,2,puzzles tennis reading hockey yoga travel
3,3,hockey yoga chess reading puzzles swimming
4,4,swimming hockey running yoga reading tennis baseball


## Train Term-Frequency Vectorizer

In [43]:
## learn the keyword vocabulary, then encode every row of df["keywords"]
## as a vector of term counts
count_vectorizer = CountVectorizer(stop_words="english").fit(df["keywords"])
tf_matrix = count_vectorizer.transform(df["keywords"])
tf_matrix.shape

(100, 14)

In [44]:
## show vocabulary: the fitted mapping from each keyword to its column index
## in the term-frequency matrix, followed by the number of distinct keywords
print(count_vectorizer.vocabulary_)
len(count_vectorizer.vocabulary_)

{'yoga': 13, 'travel': 11, 'cooking': 2, 'baseball': 0, 'hockey': 5, 'tennis': 10, 'chess': 1, 'running': 8, 'swimming': 9, 'football': 4, 'reading': 7, 'tv': 12, 'fishing': 3, 'puzzles': 6}


14

In [45]:
## show the first rows of the term-frequency matrix, one column per vocabulary term
## get_feature_names() no longer exists in recent scikit-learn releases; use its
## replacement get_feature_names_out() and fall back only on old installs
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

pd.DataFrame(tf_matrix.toarray(), columns=feature_names).head()

Unnamed: 0,baseball,chess,cooking,fishing,football,hockey,puzzles,reading,running,swimming,tennis,travel,tv,yoga
0,1,1,1,0,0,1,0,0,1,0,1,1,0,1
1,0,1,0,0,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,1,0,0,0,0,1,0,1,0,0
3,0,0,1,1,0,1,0,1,0,1,1,1,1,0
4,0,1,1,0,1,1,0,0,0,0,0,0,0,0


## Get Cosine Similarities

In [80]:
def cosine_similarities(x, count_vectorizer, tf_matrix, dataset, top_n=20):
    """Rank the rows of ``tf_matrix`` by cosine similarity to the text ``x``.

    Parameters
    ----------
    x : str
        Query text (e.g. space-separated keywords).
    count_vectorizer : object
        Already-fitted vectorizer; only its ``transform`` method is used.
    tf_matrix : matrix
        One vector per row, in the feature space produced by
        ``count_vectorizer``.
    dataset : pandas.DataFrame
        Must have an ``"id"`` column. Rows are looked up by position, so row
        ``k`` of ``dataset`` has to describe row ``k`` of ``tf_matrix``.
    top_n : int or None, default 20
        Number of best matches to return; ``None`` returns every row.

    Returns
    -------
    id_names : pandas.Series
        ``dataset["id"]`` values of the best matches, most similar first.
    cosines : list
        Cosine similarity of each returned id, in the same order.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be a non-negative integer or None")

    ## get the vector of x
    query_vector = count_vectorizer.transform([x])

    ## compute the cosine similarities (one score per row of tf_matrix)
    scores = np.asarray(cosine_similarity(query_vector, tf_matrix))[0]

    ## row positions sorted by descending score; a stable sort on the negated
    ## scores keeps tied rows in their original order
    ranked_positions = np.argsort(-scores, kind="stable")

    ## keep the top_n most similar rows
    top_positions = ranked_positions[:top_n]

    ## get the user ID names
    id_names = dataset["id"].iloc[top_positions]

    ## get cosines
    cosines = list(scores[top_positions])

    return id_names, cosines

In [117]:
## save and load Count Vectorizer
## (`path` is the Drive folder defined in the dataset-generation cell; the
## context managers make sure both file handles are closed)
with open(path + "count_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(count_vectorizer, vectorizer_file)

## NOTE: pickle.load can execute arbitrary code -- only load files you created yourself
with open(path + "count_vectorizer.pkl", "rb") as vectorizer_file:
    count_vectorizer = pickle.load(vectorizer_file)

## save and load tf matrix
scipy.sparse.save_npz(path + "tf_matrix.npz", tf_matrix)
tf_matrix = scipy.sparse.load_npz(path + "tf_matrix.npz")

## read dataset
df = pd.read_csv(path + "keywords_ids.csv")
df.head()

Unnamed: 0,id,keywords
0,0,puzzles swimming hockey tennis reading yoga baseball chess running
1,1,travel hockey running tennis
2,2,puzzles tennis reading hockey yoga travel
3,3,hockey yoga chess reading puzzles swimming
4,4,swimming hockey running yoga reading tennis baseball


In [118]:
## rank the stored users against a new keyword query
new_input = "reading chess football yoga"
ids, cosines = cosine_similarities(
    x=new_input,
    count_vectorizer=count_vectorizer,
    tf_matrix=tf_matrix,
    dataset=df,
)
ids, cosines

(33    33
 54    54
 39    39
 65    65
 90    90
 13    13
 22    22
 60    60
 74    74
 14    14
 30    30
 43    43
 94    94
 25    25
 50    50
 71    71
 87    87
 4     4 
 6     6 
 23    23
 Name: id, dtype: int64,
 [0.7559289460184544,
  0.7071067811865475,
  0.6666666666666666,
  0.6666666666666666,
  0.6123724356957946,
  0.5773502691896258,
  0.5773502691896258,
  0.5773502691896258,
  0.5773502691896258,
  0.5669467095138407,
  0.5669467095138407,
  0.5669467095138407,
  0.5669467095138407,
  0.5303300858899106,
  0.5303300858899106,
  0.5303300858899106,
  0.5303300858899106,
  0.5,
  0.5,
  0.5])

In [119]:
## show the matched users for `new_input` as a dataframe, most similar first
row_positions = pd.Index(df["id"]).get_indexer(ids.values)

## get_indexer marks unknown ids with -1, which iloc would silently turn into
## the last row -- fail loudly instead
assert (row_positions >= 0).all(), "some recommended ids are missing from df"

df.iloc[row_positions]

Unnamed: 0,id,keywords
33,33,swimming puzzles fishing
54,54,hockey puzzles reading swimming
39,39,tv running reading chess tennis
65,65,travel puzzles baseball running tennis tv yoga fishing
90,90,chess tv hockey tennis fishing football
13,13,fishing swimming yoga travel
22,22,football fishing tennis travel running
60,60,running hockey baseball football reading
74,74,chess football fishing tennis cooking tv
14,14,puzzles football chess tv reading travel fishing hockey tennis
