# Dev Notebook
A place to develop and test code.

# Content Recommender

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances

In [None]:
# TODO: every model should have a fit and a predict method.

In [None]:
# Toy feature matrix for exercising the recommender: 20 rows (items) with two
# uniform-random features each, drawn in the order f1 then f2.
df = pd.DataFrame({"f1": np.random.rand(20), "f2": np.random.rand(20)})
df.head()

In [None]:
class ContentRecommender(object):
    '''Recommend the items most similar to a seed item in a feature matrix.

    The class keeps no state: every call receives the feature matrix
    explicitly. The feature matrix is a DataFrame with one row per item,
    indexed by item id, and one column per feature.
    '''

    # Maps each public metric name to the metric string handed to
    # sklearn's pairwise_distances.
    # NOTE(review): "jaccard" is computed with the "hamming" distance, exactly
    # as the previous if/elif chain did. Confirm whether a true Jaccard
    # distance is intended before changing it, since results would differ.
    _METRIC_BY_NAME = {
        "cosine": "cosine",
        "euclidean": "euclidean",
        "manhattan": "manhattan",
        "jaccard": "hamming",
    }

    _UNSUPPORTED_METRIC_MESSAGE = (
        "similarity_metric must be cosine, euclidean, manhattan, or jaccard"
    )

    def get_similar_recommendations(self, seed_item, feature_matrix, similarity_metric, n):
        '''Return the top n items most similar to a seed item.

        Args:
            seed_item: index label of the seed row in feature_matrix.
            feature_matrix: DataFrame of item features, indexed by item id.
            similarity_metric: one of "cosine", "euclidean", "manhattan"
                or "jaccard".
            n: number of items to return.

        Returns:
            dict with "similar_items" (list of index labels, most similar
            first) and "score" (similarity scores rounded to 5 decimals).
            The seed item is scored like every other row, so it is not
            excluded from the result.

        Raises:
            ValueError: if similarity_metric is not a supported name.
        '''
        if similarity_metric not in self._METRIC_BY_NAME:
            # Raise (not return) so an invalid metric cannot be mistaken for
            # a result by the caller.
            raise ValueError(self._UNSUPPORTED_METRIC_MESSAGE)

        # pairwise_distances expects 2-D input, so shape the seed as (1, n_features).
        item_vector = np.array(feature_matrix.loc[seed_item]).reshape(1, -1)

        similarities = self._choose_similarity(item_vector, feature_matrix, similarity_metric)

        similar_items, scores = self._get_top_items(similarities, n)

        return {"similar_items": similar_items, "score": np.round(scores, 5)}

    @staticmethod
    def _choose_similarity(item_vector, feature_matrix, similarity_metric):
        '''Score every row of feature_matrix against item_vector.

        Similarity is defined as 1 - distance, so for unbounded distances
        (euclidean, manhattan) the score can be negative.

        Returns:
            DataFrame indexed like feature_matrix with a single
            'similarity_score' column, sorted from most to least similar.

        Raises:
            ValueError: if similarity_metric is not a supported name.
        '''
        if similarity_metric not in ContentRecommender._METRIC_BY_NAME:
            raise ValueError(ContentRecommender._UNSUPPORTED_METRIC_MESSAGE)

        distances = pairwise_distances(
            X=feature_matrix,
            Y=item_vector,
            metric=ContentRecommender._METRIC_BY_NAME[similarity_metric],
        )

        similarities = pd.DataFrame(
            1 - distances,
            index=feature_matrix.index.tolist(),
            columns=['similarity_score'],
        )
        return similarities.sort_values('similarity_score', ascending=False)

    @staticmethod
    def _get_top_items(similarities, n):
        '''Return (item ids, scores) for the first n rows of similarities.

        Assumes similarities is already sorted with the best match first.
        '''
        top = similarities.head(n)
        similar_items = top.index.values.tolist()
        scores = top.similarity_score.values.tolist()
        return similar_items, scores

In [None]:
# Smoke test: ask for the 10 rows of the toy feature matrix closest to row 0
# under the manhattan-based similarity.
CF = ContentRecommender()
CF.get_similar_recommendations(
    seed_item=0,
    feature_matrix=df,
    similarity_metric="manhattan",
    n=10,
)

# Feature Engineering

In [1]:
from src.feature_engineering import Embeddings, DataCleaning
import pandas as pd

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:852)>


In [2]:
# Ten short example sentences used as a toy corpus for the embedding helpers
# below; each row of the "document" column holds one raw sentence.
df = pd.DataFrame(
    {
        "document": [
            "this is a example about cats",
            "cats are super cute",
            "Cats are on sale today! Get one",
            "Kittens are the best cats",
            "I like cats better than dogs",
            "smart cats can type emails",
            "cats normally don't like going for walks.",
            "What makes cats smart?",
            "are all cats cute?",
            "dogs are kinda smart though",
        ]
    }
)
df.head()

Unnamed: 0,document
0,this is a example about cats
1,cats are super cute
2,Cats are on sale today! Get one
3,Kittens are the best cats
4,I like cats better than dogs


In [3]:
# Compute the TF-IDF representation of the documents. The helper returns two
# values; the second is discarded here. The displayed result has one row per
# document and one column per vocabulary term.
tfidf_matrix, _ = Embeddings.tfidf_vectorizer(df["document"])
tfidf_matrix

Unnamed: 0,best,better,cats,cute,dogs,don,emails,example,going,kinda,kittens,like,makes,normally,sale,smart,super,today,type,walks
0,0.0,0.0,0.375349,0.0,0.0,0.0,0.0,0.926883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.294825,0.6189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.728039,0.0,0.0,0.0
2,0.0,0.0,0.275285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.679786,0.0,0.0,0.679786,0.0,0.0
3,0.679786,0.0,0.275285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.679786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.619067,0.250696,0.0,0.526264,0.0,0.0,0.0,0.0,0.0,0.0,0.526264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.245672,0.0,0.0,0.0,0.60666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.451191,0.0,0.0,0.60666,0.0
6,0.0,0.0,0.183191,0.0,0.0,0.452371,0.0,0.0,0.452371,0.0,0.0,0.384556,0.0,0.452371,0.0,0.0,0.0,0.0,0.0,0.452371
7,0.0,0.0,0.309036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.763131,0.0,0.0,0.567563,0.0,0.0,0.0,0.0
8,0.0,0.0,0.430066,0.902797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.563507,0.0,0.0,0.0,0.0,0.662879,0.0,0.0,0.0,0.0,0.0,0.493002,0.0,0.0,0.0,0.0


In [4]:
# Run the SVD helper on the TF-IDF matrix. The second return value is
# discarded. The displayed result has two columns (svd_0, svd_1), so the `2`
# is presumably the number of components -- confirm against Embeddings.svd.
svd_matrix, _ = Embeddings.svd(tfidf_matrix, 2)
svd_matrix

Unnamed: 0,svd_0,svd_1
0,0.379566,-0.058497
1,0.656197,-0.556709
2,0.294209,-0.047836
3,0.294209,-0.047836
4,0.409505,0.333906
5,0.447213,0.418655
6,0.278444,0.094584
7,0.504399,0.437487
8,0.711992,-0.523264
9,0.338667,0.641496


In [5]:
# Run the PCA helper on the TF-IDF matrix. The second return value is
# discarded. The displayed result has two columns (pca_0, pca_1), so the `2`
# is presumably the number of components -- confirm against Embeddings.pca.
pca_matrix, _ = Embeddings.pca(tfidf_matrix, 2)
pca_matrix

Unnamed: 0,pca_0,pca_1
0,0.096902,0.14564
1,0.681662,-0.220629
2,0.05784,0.19054
3,0.05784,0.19054
4,-0.272894,0.418785
5,-0.315354,-0.452977
6,-0.088226,0.61069
7,-0.318218,-0.45101
8,0.659966,-0.201267
9,-0.559518,-0.230313


In [6]:
# Run the LSA helper directly on the raw documents with n_components=2. The
# helper returns three values; only the first is kept. In this run the
# displayed values match the SVD-on-TF-IDF output shown earlier.
lsa_matrix, _, _ = Embeddings.lsa(df["document"], n_components=2)
lsa_matrix

Unnamed: 0,lsa_0,lsa_1
0,0.379566,-0.058497
1,0.656197,-0.556709
2,0.294209,-0.047836
3,0.294209,-0.047836
4,0.409505,0.333906
5,0.447213,0.418655
6,0.278444,0.094584
7,0.504399,0.437487
8,0.711992,-0.523264
9,0.338667,0.641496


In [7]:
# Embed each raw document with the doc_to_vec helper. The second return value
# is discarded. The displayed result has five columns (docvec_0..docvec_4), so
# the `5` is presumably the vector size -- confirm against Embeddings.doc_to_vec.
document = df.document
docvec_matrix, _ = Embeddings.doc_to_vec(document, 5)
docvec_matrix

Unnamed: 0,docvec_0,docvec_1,docvec_2,docvec_3,docvec_4
0,-0.084794,-0.013677,0.066841,0.065322,0.009701
1,-0.051309,-0.04396,0.072702,-0.000876,-0.05771
2,-0.080234,-0.090913,0.055669,0.030625,0.021495
3,0.033709,-0.009511,0.026093,-0.037506,-0.025555
4,0.049412,0.081443,0.072305,-0.090595,0.068181
5,-0.012371,0.076814,-0.027114,0.070367,-0.097808
6,0.040157,-0.067458,0.099057,0.078774,-0.086777
7,-0.055752,0.016654,-0.042962,-0.022965,0.029825
8,0.078932,0.043092,0.058434,-0.085492,0.000562
9,0.025721,-0.005131,0.087664,0.095174,0.038484


In [8]:
# Run the LDA helper on the raw documents with ntopics=5; the displayed result
# has five columns (lda_0..lda_4), one per topic. The second return value is
# discarded.
# NOTE(review): the meaning of n=2 is not visible from this notebook --
# confirm against Embeddings.lda.
lda_matrix, _ = Embeddings.lda(df.document, ntopics=5, n=2)
lda_matrix

Unnamed: 0,lda_0,lda_1,lda_2,lda_3,lda_4
0,0.9332,0.0167,0.0167,0.0167,0.0167
1,0.474,0.0251,0.4506,0.0252,0.0251
2,0.0143,0.0143,0.0143,0.9428,0.0143
3,0.0202,0.0201,0.0202,0.9194,0.0201
4,0.0167,0.9331,0.0167,0.0167,0.0167
5,0.9198,0.02,0.0201,0.02,0.02
6,0.0146,0.9425,0.0143,0.0143,0.0143
7,0.0252,0.0251,0.0251,0.0251,0.8995
8,0.0252,0.0251,0.8995,0.0251,0.0251
9,0.0201,0.0201,0.9198,0.02,0.02
