In [3]:
#https://www.themarketingtechnologist.co/a-recommendation-system-for-blogs-content-based-similarity-part-2/

In [13]:
from math import*
import re
import logging
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
import seaborn as sns
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [15]:
# Configure the root logger once for the whole notebook: DEBUG and above are
# emitted, each record rendered as "<LEVEL>:<message>".
logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)s:%(message)s',
)

In [17]:
class AnalyzeTMTArticles:
    """
    Container for the tunable settings and the working state of the TMT
    article recommender.

    The constructor performs no I/O; it only sets defaults and creates empty
    placeholders that are filled in by the processing steps.
    """

    def __init__(self):
        """
        Set the tunable defaults and initialise every piece of working state to
        an empty value.
        """
        # The values below can be changed to tweak the recommender algorithm.
        # NOTE(review): meanings are inferred from the names only -- presumably
        # the number of recommendations per article and the number of features
        # kept per text field / in total; confirm against the code using them.
        self.n_most_similar = 1
        self.n_features_title = 25
        self.n_features_content = 50
        self.n_features_tags = 25
        self.n_features_total = 30

        # Do not change the values below: they are placeholders for
        # intermediate results.
        self.df = None
        self.df_article_vectors = None
        self.similarity_score_dict = {}
        self.X = None
        # Spelled with a capital ``X`` so the placeholder is the same attribute
        # that vectorize_articles reads (``self.X_title``), consistent with
        # ``X_content`` and ``X_tags`` below.
        self.X_title = None
        self.X_content = None
        self.X_tags = None

In [19]:
def run(self):
    """
    Execute the complete recommender pipeline, one stage after the other:
    load the articles, assign tags, vectorize, reduce dimensionality,
    visualize, find similar articles and save the output to CSV.

    :return: None
    """
    pipeline_stages = (
        "load_articles",
        "assign_tags",
        "vectorize_articles",
        "reduce_dimensionality_articles",
        "visualize_data",
        "find_similar_articles",
        "save_output_to_csv",
    )
    for stage_name in pipeline_stages:
        # Look each stage up only when it is about to run, so earlier stages
        # still execute even if a later one is missing.
        stage = getattr(self, stage_name)
        stage()

In [20]:
#Load data
def load_articles(self):
    """
    Load the TMT articles from ``articles.csv`` (read relative to the current
    working directory) into ``self.df`` and keep only the columns used later.

    More info on the data set can be found in part 1 of the TMT recommender
    article series:
    www.themarketingtechnologist.co/building-a-recommendation-engine-for-geek-setting-up-the-prerequisites-13/

    :return: None. The resulting DataFrame is stored on ``self.df``.
    :raises KeyError: if one of the expected columns is missing from the CSV
    """
    # Load articles in a DataFrame. The keyword is ``encoding`` (singular).
    self.df = pd.read_csv('articles.csv', encoding='utf-8')
    # Slice to remove redundant columns.
    # NOTE(review): 'context_text' may be a typo for 'content_text' -- verify
    # against the actual CSV header before changing it.
    self.df = self.df[['title', 'context_text', 'tags', 'author']]
    logging.debug("Number of articles: {0}\n".format(len(self.df)))

In [24]:
def assign_tags(self):
    """
    Clean the ``tags`` column of ``self.df`` and derive a single tag per article.

    The square brackets are stripped from every ``tags`` value, and a new
    ``tags_first`` column receives the lower-cased text before the first comma.
    Articles without any tag get the literal string "None".

    Missing tag cells (None / NaN, as pandas produces for empty CSV fields)
    are treated as "no tags" instead of raising an AttributeError.

    :return: None. ``self.df['tags']`` is overwritten and ``self.df['tags_first']``
        is added in place.
    """
    def clean_tags(raw):
        # A missing cell is None or float NaN (NaN is the only value that is
        # not equal to itself); neither has a ``.replace`` method.
        if raw is None or (isinstance(raw, float) and raw != raw):
            return ""
        return raw.replace("[", "").replace("]", "")

    def assign_single_tag(x):
        first = x.lower().split(",")[0]
        return first if first != "" else "None"

    # Clean up tags formatting
    self.df['tags'] = self.df['tags'].apply(clean_tags)
    # Assign first tag
    self.df['tags_first'] = self.df['tags'].apply(assign_single_tag)

In [26]:
#Vectorize data and reduce dimensionality
def vectorize_articles(self):
    """
    Build the combined feature matrix for all articles.

    Runs the title, content and tags vectorization steps in that order, then
    joins ``self.X_title``, ``self.X_content`` and ``self.X_tags`` column-wise
    (axis 1) into ``self.X`` and logs the resulting number of columns.

    :return: None. The combined matrix is stored on ``self.X``.
    """
    # Presumably each step fills the matching self.X_* attribute read below;
    # verify against the vectorize_* implementations.
    self.vectorize_title()    # title features
    self.vectorize_content()  # content features
    self.vectorize_tags()     # tags features

    # Only title, content and tags are joined here; author is not included.
    feature_blocks = (self.X_title, self.X_content, self.X_tags)
    combined = np.concatenate(feature_blocks, axis=1)
    self.X = combined

    n_columns = self.X.shape[1]
    logging.debug("Number of features in total Dataframe: {0}".format(n_columns))

In [30]:
def get_vectorizer(self, ngram_range=(1,3), min_df=2, max_df=1.0):
    """
    Create a binary (feature-presence) CountVectorizer that uses n-grams,
    ``self.tokenize`` as tokenizer and the built-in English stop-word list.

    :param ngram_range: (min_n, max_n) range of n-gram sizes to extract
    :param min_df: min document frequency of features
    :param max_df: max document frequency of features
    :return: a new, unfitted CountVectorizer configured with these settings
    """
    settings = {
        'ngram_range': ngram_range,
        'tokenizer': self.tokenize,
        'min_df': min_df,
        'max_df': max_df,
        # binary=True records presence/absence rather than raw counts.
        'binary': True,
        'stop_words': 'english',
    }
    return CountVectorizer(**settings)