# Content Recommender

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances

In [None]:
# Toy feature matrix: 20 rows, two columns of uniform random values in [0, 1).
# A single (2, 20) draw is transposed so that "f1" receives the first 20
# samples and "f2" the next 20.
random_features = np.random.rand(2, 20).T
df = pd.DataFrame(random_features, columns=["f1", "f2"])
df.head()

In [None]:
class ContentRecommender(object):
    '''Rank the rows of a feature matrix by similarity to a chosen seed row.'''

    # Public metric name -> metric name handed to sklearn's pairwise_distances.
    _DISTANCE_METRICS = {
        "cosine": "cosine",
        "euclidean": "euclidean",
        "manhattan": "manhattan",
        # scikit-learn evaluates jaccard on boolean data, so any non-zero
        # feature value is treated as "present".
        "jaccard": "jaccard",
    }

    def get_similar_recommendations(self, seed_item, feature_matrix, similarity_metric, n):
        ''' Return top n similar items to a seed item.

        seed_item: index label of the row in feature_matrix to compare against.
        feature_matrix: pandas data frame with one row per item.
        similarity_metric: str. "cosine", "euclidean", "manhattan" or "jaccard".
        n: number of items to return.

        Returns a dict with "similar_items" (index labels, most similar first)
        and "score" (the matching similarity scores rounded to 5 decimals).
        The seed item itself is not filtered out of the results.

        Raises ValueError if similarity_metric is not supported.
        '''
        # Fail fast, before any lookup or distance computation is attempted.
        self._resolve_metric(similarity_metric)

        # pairwise_distances expects 2-D input, so the seed row becomes (1, n_features).
        item_vector = np.array(feature_matrix.loc[seed_item]).reshape(1, -1)

        similarities = self._choose_similarity(item_vector, feature_matrix, similarity_metric)

        similar_items, scores = self._get_top_items(similarities, n)

        return {"similar_items": similar_items, "score": np.round(scores, 5)}

    @staticmethod
    def _resolve_metric(similarity_metric):
        '''Translate a public metric name into the sklearn metric name, or raise ValueError.'''
        if similarity_metric not in ContentRecommender._DISTANCE_METRICS:
            raise ValueError("similarity_metric must be cosine, euclidean, manhattan, or jaccard")
        return ContentRecommender._DISTANCE_METRICS[similarity_metric]

    @staticmethod
    def _choose_similarity(item_vector, feature_matrix, similarity_metric):
        '''Calculate similarity scores based on specified metric.

        Similarity is defined as 1 - distance for every metric. Returns a data
        frame indexed like feature_matrix with a single 'similarity_score'
        column, sorted from most to least similar.

        Raises ValueError if similarity_metric is not supported.
        '''
        distance_metric = ContentRecommender._resolve_metric(similarity_metric)

        distances = pairwise_distances(X=feature_matrix, Y=item_vector, metric=distance_metric)

        similarities = pd.DataFrame(1 - distances, index=feature_matrix.index.tolist())
        similarities.columns = ['similarity_score']
        similarities.sort_values('similarity_score', ascending=False, inplace=True)

        return similarities

    @staticmethod
    def _get_top_items(similarities, n):
        '''Return the index labels and scores of the first n rows of a sorted similarity frame.'''
        top = similarities.head(n)
        similar_items = top.index.values.tolist()
        scores = top.similarity_score.values.tolist()
        return similar_items, scores

In [None]:
# Demo: the ten rows of the toy matrix closest to row 0 under manhattan distance.
CF = ContentRecommender()
CF.get_similar_recommendations(
    seed_item=0,
    feature_matrix=df,
    similarity_metric="manhattan",
    n=10,
)

# Feature Engineering

In [None]:
# Embeddings

#doc2vec
#LSA
#TFIDF
#Count Vectorizer

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pickle


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/clongo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
class FeatureEngineering(object):
    '''Text cleaning helpers and feature-matrix builders (tf-idf, SVD, one-hot encoding).

    lsa, count_vectorizor, pca, doc2vec and lda are placeholders that
    currently return their input unchanged.
    '''

    def lsa(self, df):
        ''' Get LSA embeddings from text data. Not implemented yet: returns df unchanged.'''
        return df


    def tfidf(self, df, path_to_models, max_df=1.0, min_df=1, max_features=None, ngram_range=(1, 1)):
        '''Get tfidf matrix from text data. Trains and saves tfidf model.

        df: pandas data frame with the text in a 'document' column.
        path_to_models: str. existing directory; the fitted vectorizer is
            pickled to <path_to_models>/tfidf_model.pkl.
        max_df, min_df, max_features, ngram_range: passed to TfidfVectorizer.

        Returns a data frame with one row per document and one column per
        vocabulary term. The row index is a fresh 0..n-1 range, not df's index.
        '''
        documents = df['document']

        #initialize and fit model
        tf = TfidfVectorizer(analyzer='word',
                             max_df=max_df,
                             min_df=min_df,
                             max_features=max_features,
                             ngram_range=ngram_range,
                             stop_words='english')
        tf.fit(documents)

        #save trained model for future use; the with-block closes the file handle
        with open(path_to_models+"/tfidf_model.pkl", "wb") as model_file:
            pickle.dump(tf, model_file)

        #transform and return input data
        tfidf_matrix = pd.DataFrame(tf.transform(documents).toarray())

        #get_feature_names was removed from newer scikit-learn releases in
        #favour of get_feature_names_out; support both
        if hasattr(tf, "get_feature_names_out"):
            tfidf_matrix.columns = tf.get_feature_names_out()
        else:
            tfidf_matrix.columns = tf.get_feature_names()

        return tfidf_matrix


    def count_vectorizor(self, df):
        '''Get word count matrix from text data. Not implemented yet: returns df unchanged.'''
        return df


    def pca(self, df):
        '''Perform pca on feature matrix. Not implemented yet: returns df unchanged.'''
        return df


    def svd(self, df, n_components, path_to_models):
        '''Perform svd on feature matrix. Can be used for dimensionality reduction, smoothing, or creating plot axes. Trains and saves svd model.

        df: feature matrix accepted by TruncatedSVD.
        n_components: number of latent dimensions to keep.
        path_to_models: str. existing directory; the fitted model is pickled
            to <path_to_models>/svd_model.pkl.

        Returns a data frame with columns svd_0 ... svd_<n_components - 1>.
        '''
        #initialize and train svd model
        tsvd = TruncatedSVD(n_components=n_components)
        tsvd = tsvd.fit(df)

        #save trained model for future use; the with-block closes the file handle
        with open(path_to_models+"/svd_model.pkl", "wb") as model_file:
            pickle.dump(tsvd, model_file)

        #transform and return input data
        latent_matrix = pd.DataFrame(tsvd.transform(df))
        latent_matrix.columns = ["svd_"+str(i) for i in range(n_components)]

        return latent_matrix


    def doc2vec(self, df):
        '''Use doc2vec to create document embeddings. Not implemented yet: returns df unchanged.'''
        return df


    def lda(self, df):
        '''Use LDA to create document embeddings. Not implemented yet: returns df unchanged.'''
        return df


    def ohe_features(self, df, feature, frequency_threshold):
        '''
        One-hot-encode a categorical feature into binary columns.
        df: pandas data frame with feature to be encoded
        feature: str. feature column name
        frequency_threshold: number of occurrences to threshold feature at.
            Only values occurring strictly more often than this are kept.
        '''
        vc = df[feature].value_counts()
        keep_values = vc[vc > frequency_threshold].index.tolist()
        ohe_feature = pd.get_dummies(df[feature])

        feature_names = ohe_feature.columns
        keep_features = feature_names[feature_names.isin(keep_values)]

        return ohe_feature[keep_features]


    def stem_words(self, text):
        '''Stem every whitespace-separated word with the English Snowball stemmer.'''
        stemmer = SnowballStemmer('english')
        return " ".join(stemmer.stem(word) for word in text.split())


    def make_lower_case(self, text):
        '''Return text converted to lower case.'''
        return text.lower()


    def remove_stop_words(self, text):
        '''Drop whitespace-separated words found in the nltk English stop word list.

        The comparison is case sensitive, so lower-case the text first if
        capitalised stop words should be removed as well.
        '''
        stops = set(stopwords.words("english"))
        return " ".join(w for w in text.split() if w not in stops)


    def remove_punctuation(self, text):
        '''Keep only runs of word characters, joined by single spaces.'''
        tokenizer = RegexpTokenizer(r'\w+')
        return " ".join(tokenizer.tokenize(text))


    def remove_emails(self, text):
        '''Delete every whitespace-delimited token containing '@', plus one trailing whitespace character.'''
        return re.sub(r"\S*@\S*\s?", "", text)


    def remove_numbers(self, text):
        '''Delete every run of digits from text.'''
        return re.sub(r"\d+", "", text)

In [3]:
# Shared FeatureEngineering instance used by the demo cells below.
FE = FeatureEngineering()

In [4]:
# Ten short example sentences used to exercise the tf-idf and SVD helpers.
example_documents = [
    "this is a example about cats",
    "cats are super cute",
    "Cats are on sale today! Get one",
    "Kittens are the best cats",
    "I like cats better than dogs",
    "smart cats can type",
    "cats normally don't like going for walks.",
    "What makes cats smart?",
    "are all cats cute?",
    "dogs are kinda smart though",
]
df = pd.DataFrame({"document": example_documents})
df.head()

Unnamed: 0,document
0,this is a example about cats
1,cats are super cute
2,Cats are on sale today! Get one
3,Kittens are the best cats
4,I like cats better than dogs


In [5]:
# Fit tf-idf on the example documents, keeping terms found in at least two of
# them; the fitted model is pickled into the "test" directory.
tfidf_matrix = FE.tfidf(df, path_to_models="test", min_df=2)
tfidf_matrix

Unnamed: 0,cats,cute,dogs,like,smart
0,1.0,0.0,0.0,0.0,0.0
1,0.430066,0.902797,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,0.319221,0.0,0.670111,0.670111,0.0
5,0.478204,0.0,0.0,0.0,0.878249
6,0.430066,0.0,0.0,0.902797,0.0
7,0.478204,0.0,0.0,0.0,0.878249
8,0.430066,0.902797,0.0,0.0,0.0
9,0.0,0.0,0.752621,0.0,0.658454


In [6]:
# Project the tf-idf matrix onto two SVD components; the fitted model is
# pickled into the "test" directory.
FE.svd(tfidf_matrix, n_components=2, path_to_models="test")

Unnamed: 0,svd_0,svd_1
0,0.90457,-0.206629
1,0.59389,-0.485283
2,0.90457,-0.206629
3,0.90457,-0.206629
4,0.479452,0.231855
5,0.693109,0.590184
6,0.544282,-0.031396
7,0.693109,0.590184
8,0.59389,-0.485283
9,0.280079,0.80314
