# Create to connect to MySql koombea wp database

In [1]:
from io import StringIO
from paramiko import RSAKey, Ed25519Key, ECDSAKey, DSSKey, PKey
from cryptography.hazmat.primitives import serialization as crypto_serialization
from cryptography.hazmat.primitives.asymmetric import ed25519, dsa, rsa, ec

def from_private_key( file_obj, password = None ) -> PKey:
    private_key = None
    file_bytes = bytes( file_obj.read(), "utf-8" )
    try:
        key = crypto_serialization.load_ssh_private_key(
            file_bytes,
            password = password,
        )
        file_obj.seek( 0 )
    except ValueError:
        key = crypto_serialization.load_pem_private_key(
            file_bytes,
            password = password,
        )
        if password:
            encryption_algorithm = crypto_serialization.BestAvailableEncryption(
                password
            )
        else:
            encryption_algorithm = crypto_serialization.NoEncryption()
        file_obj = StringIO(
            key.private_bytes(
                crypto_serialization.Encoding.PEM,
                crypto_serialization.PrivateFormat.OpenSSH,
                encryption_algorithm,
            ).decode( "utf-8" )
        )
    if isinstance( key, rsa.RSAPrivateKey ):
        private_key = RSAKey.from_private_key( file_obj, password )
    elif isinstance( key, ed25519.Ed25519PrivateKey ):
        private_key = Ed25519Key.from_private_key( file_obj, password )
    elif isinstance( key, ec.EllipticCurvePrivateKey ):
        private_key = ECDSAKey.from_private_key( file_obj, password )
    elif isinstance( key, dsa.DSAPrivateKey ):
        private_key = DSSKey.from_private_key( file_obj, password )
    else:
        raise TypeError
    return private_key

import warnings
warnings.filterwarnings("ignore")

import numpy as np
from sqlalchemy import (create_engine)
import pandas as pd
from tqdm import tqdm
import json
import os

# Dir
model_dir = '/'.join(os.getcwd().split("/")[:-1]+["model"])
data_dir = '/'.join(os.getcwd().split("/")[:-1]+["data"])
CONFIG_DIRECTORY = '/'.join(os.getcwd().split("/")[:-1]+["config"])


# Select one staging
# STAGE = "dev"
STAGE = "prod"
TABLE_NAME = "wp_posts"

print("stage: "+STAGE)

DATABASE_CONFIG = {}
"""Configure database params given the stage"""

if STAGE == 'dev':
    DATABASE_CONFIG["PEM_FILE"] = CONFIG_DIRECTORY + "/" + "dataBaseKey.pem"
    DATABASE_CONFIG["HOSTNAME"] = "koombea20stg.ssh.wpengine.net"
    DATABASE_CONFIG["USERNAME"] = "koombea20stg"
    DATABASE_CONFIG['PASSWORD'] = 'opypHiPy2GiuCyApXQpZ'
    DATABASE_CONFIG["SSH_PORT"] = 22

    DATABASE_CONFIG['MYSQL_HOSTNAME'] = '127.0.0.1'
    DATABASE_CONFIG['MYSQL_PORT'] = 3306
    DATABASE_CONFIG['MYSQL_DBNAME'] = 'wp_koombea20stg'

    DATABASE_CONFIG["DEV"] = True

    DATABASE_CONFIG["PROD_HOSTNAME"] = "koombea20.ssh.wpengine.net"
    DATABASE_CONFIG["PROD_MYSQL_DBNAME"] = "wp_koombea20"
    DATABASE_CONFIG["PROD_USERNAME"] = "koombea20"
    DATABASE_CONFIG["PROD_PASSWORD"] = "-WFgRvi2dcg9HDx28JpA"

elif STAGE == "prod":
    DATABASE_CONFIG["PEM_FILE"] = CONFIG_DIRECTORY + "/" + "dataBaseKey.pem"
    DATABASE_CONFIG["HOSTNAME"] = "koombea20.ssh.wpengine.net"
    DATABASE_CONFIG["USERNAME"] = "koombea20"
    DATABASE_CONFIG["PASSWORD"] = "-WFgRvi2dcg9HDx28JpA"
    DATABASE_CONFIG["SSH_PORT"] = 22

    DATABASE_CONFIG['MYSQL_HOSTNAME'] = '127.0.0.1'
    DATABASE_CONFIG['MYSQL_PORT'] = 3306
    DATABASE_CONFIG["MYSQL_DBNAME"] = "wp_koombea20"

from sshtunnel import SSHTunnelForwarder

pkeyfilepath = DATABASE_CONFIG[ 'PEM_FILE' ]
pemFile = open(  pkeyfilepath, 'r' )

privateKey = from_private_key( pemFile, password = None )

tunnel = SSHTunnelForwarder(
    ( DATABASE_CONFIG[ 'HOSTNAME' ], DATABASE_CONFIG[ 'SSH_PORT' ] ),
    ssh_username = DATABASE_CONFIG[ 'USERNAME' ],
    ssh_pkey = privateKey,
    remote_bind_address = ( DATABASE_CONFIG[ 'MYSQL_HOSTNAME' ], DATABASE_CONFIG[ 'MYSQL_PORT' ] ), set_keepalive=2.0 )

stage: prod


# Start Connection

In [2]:
tunnel.start()

SQLALCHEMY_DATABASE_URL = "mysql+pymysql://{}:{}@{}:{}/{}".format( DATABASE_CONFIG[ 'USERNAME' ],
                                                            DATABASE_CONFIG[ 'PASSWORD' ],
                                                            DATABASE_CONFIG[ 'MYSQL_HOSTNAME' ],
                                                            tunnel.local_bind_port,
                                                            DATABASE_CONFIG[ 'MYSQL_DBNAME' ])

print("URL: " + SQLALCHEMY_DATABASE_URL)

engine = create_engine(
    SQLALCHEMY_DATABASE_URL,
    pool_pre_ping=True
)

conn = engine.connect()
# Read sql
tables_koombea = pd.read_sql("show tables;", conn)
tables_koombea.head()

URL: mysql+pymysql://koombea20:-WFgRvi2dcg9HDx28JpA@127.0.0.1:35869/wp_koombea20


Unnamed: 0,Tables_in_wp_koombea20
0,wp_actionscheduler_actions
1,wp_actionscheduler_claims
2,wp_actionscheduler_groups
3,wp_actionscheduler_logs
4,wp_aioseo_cache


# Select tablets

In [3]:
tables = {}
try:
    for table_name in tqdm(tables_koombea["Tables_in_wp_koombea20stg"].values, desc="Reading from dev"):
        tables[table_name] = pd.read_sql_table(table_name, conn)
except:
    for table_name in tqdm(tables_koombea["Tables_in_wp_koombea20"].values, desc="Reading from prod"):
        tables[table_name] = pd.read_sql_table(table_name, conn)

Reading from prod: 100%|██████████| 52/52 [00:31<00:00,  1.66it/s]


## Selecting not empty tables

In [4]:
new_tables = {}
for table_name, table in tables.items():
    if table.shape[0] == 0:
        print(table_name)
    else:
        new_tables[table_name] = table
tables = new_tables

wp_aiowps_debug_log
wp_aiowps_events
wp_aiowps_global_meta
wp_aiowps_permanent_block
wp_commentmeta
wp_comments
wp_import_detail_log
wp_import_log_detail
wp_import_postID
wp_links
wp_pmxe_exports
wp_pmxe_posts
wp_pmxe_templates
wp_redirection_404
wp_redirection_logs
wp_rtg_attachments
wp_smackcsv_file_events
wp_smackuci_events
wp_tm_taskmeta
wp_tm_tasks
wp_ultimate_csv_importer_acf_fields
wp_ultimate_csv_importer_mappingtemplate
wp_ultimate_csv_importer_media
wp_ultimate_csv_importer_shortcode_manager


# Posts Table

In [5]:
tables.keys()

dict_keys(['wp_actionscheduler_actions', 'wp_actionscheduler_claims', 'wp_actionscheduler_groups', 'wp_actionscheduler_logs', 'wp_aioseo_cache', 'wp_aioseo_notifications', 'wp_aioseo_posts', 'wp_aiowps_failed_logins', 'wp_aiowps_login_activity', 'wp_aiowps_login_lockdown', 'wp_options', 'wp_pmxe_google_cats', 'wp_pmxi_files', 'wp_pmxi_history', 'wp_pmxi_images', 'wp_pmxi_imports', 'wp_pmxi_posts', 'wp_pmxi_templates', 'wp_postmeta', 'wp_posts', 'wp_redirection_groups', 'wp_redirection_items', 'wp_term_relationships', 'wp_term_taxonomy', 'wp_termmeta', 'wp_terms', 'wp_usermeta', 'wp_users'])

In [6]:
posts = tables["wp_posts"]
# print(posts.post_type.unique())
# print(posts.columns)

In [7]:
services_info_ = dict((post.post_name, " ".join(post.post_name.split("-")) + " " + post.post_excerpt) for i, post in 
     posts[(posts.post_type == "services") & (posts.post_status == "publish")][["post_title","post_name", "post_excerpt"]].iterrows())
services_info_

{'mvp-software-development': 'mvp software development Launch an MVP to validate product market fit, prioritizing time to market and essential features.',
 'product-ideation': 'product ideation Determine the viability of your innovation and receive a detailed work breakdown structure (WBS).',
 'quality-assurance': 'quality assurance Functional, Automation, and Performance testing through Continuous Integration.',
 'product-design': 'product design User Experience (UX), UI design, Wireframing, Information Architecture, and User flows.',
 'ecommerce-development-solutions': 'ecommerce development solutions Shopify and WooCommerce custom designs and integrations that help streamline your business processes.',
 'project-management': 'project management Agile Development and Agile Project Management with certified Scrum Project Managers.',
 'devops': 'devops Technology expertise using Kubernetes, AWS, Heroku, Docker, Terraform & more.',
 'cross-platform-app-development': 'cross platform app 

In [8]:
industries_info_ = dict((post.post_name, " ".join(post.post_name.split("-")) + " " + post.post_excerpt) for i, post in 
     posts[(posts.post_type == "industries") & (posts.post_status == "publish")][["post_name", "post_title", "post_excerpt"]].iterrows())
industries_info_

{'fintech-app-development-services': 'fintech app development services ',
 'custom-healthcare-software-development-services': 'custom healthcare software development services ',
 'hitech': 'hitech ',
 'retail-app-development-services': 'retail app development services ',
 'iot-app-development-services': 'iot app development services ',
 'educational-app-development-services': 'educational app development services '}

# Select the blogs to generate the model

We will select blogs post from `wp_posts` table with column value for `post_type=post` and `post_status=publish`, and select just the following columns:

    - post_content: content
    - post_title: title
    - post_name: slug

In [9]:
blogs = posts[(posts.post_type == "post") & (posts.post_status == "publish")].copy()
blogs_columns = ["ID", "post_content", "post_title", "post_name", 
                 "post_author", "post_date"]
blogs = blogs[blogs_columns].copy()
blogs.head()

Unnamed: 0,ID,post_content,post_title,post_name,post_author,post_date
84,2869,"<span style=""font-weight: 400;"">Are you curiou...",Scalable Applications: Curious Why Scalability...,why-scalability-matters-for-your-app,8,2021-03-22 09:00:00
87,2873,<strong>A Brief History of Scrum Agile Develop...,Is a Scrum Agile Development Process Right for...,is-a-scrum-agile-development-process-right-for...,8,2014-09-15 00:00:00
90,2876,Guessing is a part of life—and it’s necessary....,How to Make Your App Irresistible Through User...,how-to-make-your-app-irresistible-through-user...,8,2014-09-08 00:00:00
91,2877,Product management is a tough job. You must tr...,5 Product Manager MUSTS for Creating a Success...,5-product-manager-musts-for-creating-a-success...,8,2014-08-26 00:00:00
92,2878,"<span style=""font-weight: 400;"">From the momen...",7 Key Questions to Ask Your Prospective App De...,7-key-questions-to-ask-prospective-app-develop...,8,2014-08-12 00:00:00


In [10]:
blogs.shape

(989, 6)

In [11]:
NUM_BLOGS = blogs.shape[0]
print("there are {} blogs publish on koombea_{} staging".format(NUM_BLOGS,
                                               STAGE))

there are 989 blogs publish on koombea_prod staging


# Reset Index

In [12]:
blogs.reset_index(drop=True, inplace=True)

In [13]:
blogs.head()

Unnamed: 0,ID,post_content,post_title,post_name,post_author,post_date
0,2869,"<span style=""font-weight: 400;"">Are you curiou...",Scalable Applications: Curious Why Scalability...,why-scalability-matters-for-your-app,8,2021-03-22 09:00:00
1,2873,<strong>A Brief History of Scrum Agile Develop...,Is a Scrum Agile Development Process Right for...,is-a-scrum-agile-development-process-right-for...,8,2014-09-15 00:00:00
2,2876,Guessing is a part of life—and it’s necessary....,How to Make Your App Irresistible Through User...,how-to-make-your-app-irresistible-through-user...,8,2014-09-08 00:00:00
3,2877,Product management is a tough job. You must tr...,5 Product Manager MUSTS for Creating a Success...,5-product-manager-musts-for-creating-a-success...,8,2014-08-26 00:00:00
4,2878,"<span style=""font-weight: 400;"">From the momen...",7 Key Questions to Ask Your Prospective App De...,7-key-questions-to-ask-prospective-app-develop...,8,2014-08-12 00:00:00


# Map Columns

    - post_name : slug
    - post_title: title
    - post_content: content

In [14]:
blogs.columns

Index(['ID', 'post_content', 'post_title', 'post_name', 'post_author',
       'post_date'],
      dtype='object')

In [15]:
blogs.columns = ["id", "content", "title", "slug", "post_author", "post_date"]

In [16]:
blogs.head()

Unnamed: 0,id,content,title,slug,post_author,post_date
0,2869,"<span style=""font-weight: 400;"">Are you curiou...",Scalable Applications: Curious Why Scalability...,why-scalability-matters-for-your-app,8,2021-03-22 09:00:00
1,2873,<strong>A Brief History of Scrum Agile Develop...,Is a Scrum Agile Development Process Right for...,is-a-scrum-agile-development-process-right-for...,8,2014-09-15 00:00:00
2,2876,Guessing is a part of life—and it’s necessary....,How to Make Your App Irresistible Through User...,how-to-make-your-app-irresistible-through-user...,8,2014-09-08 00:00:00
3,2877,Product management is a tough job. You must tr...,5 Product Manager MUSTS for Creating a Success...,5-product-manager-musts-for-creating-a-success...,8,2014-08-26 00:00:00
4,2878,"<span style=""font-weight: 400;"">From the momen...",7 Key Questions to Ask Your Prospective App De...,7-key-questions-to-ask-prospective-app-develop...,8,2014-08-12 00:00:00


# Convert Date

In [17]:
def convert_date(date):
    return date.strftime("%b %d, %Y")

blogs["post_date_str"] = blogs["post_date"].apply(convert_date)
# blogs["post_modified_str"] = blogs["post_modified"].apply(convert_date)

In [18]:
blogs.head(3)

Unnamed: 0,id,content,title,slug,post_author,post_date,post_date_str
0,2869,"<span style=""font-weight: 400;"">Are you curiou...",Scalable Applications: Curious Why Scalability...,why-scalability-matters-for-your-app,8,2021-03-22 09:00:00,"Mar 22, 2021"
1,2873,<strong>A Brief History of Scrum Agile Develop...,Is a Scrum Agile Development Process Right for...,is-a-scrum-agile-development-process-right-for...,8,2014-09-15 00:00:00,"Sep 15, 2014"
2,2876,Guessing is a part of life—and it’s necessary....,How to Make Your App Irresistible Through User...,how-to-make-your-app-irresistible-through-user...,8,2014-09-08 00:00:00,"Sep 08, 2014"


# Get extra information
- industry slug
- dates
- post_author

## Get Terms of Industry

In [19]:
terms_temp = new_tables["wp_terms"]
terms_taxonomy_temp = new_tables["wp_term_taxonomy"]
terms_blogs_relate_temp = new_tables["wp_term_relationships"]

In [20]:
terms_taxonomy_temp["taxonomy"].unique()

array(['nav_menu', 'category', 'post_tag', 'industry', 'buying_stage',
       'persona', 'lune_section_tax', 'wp_theme'], dtype=object)

In [21]:
industries_table_temp = terms_taxonomy_temp.groupby('taxonomy').get_group('industry')
term_ids = industries_table_temp.term_id.tolist()

In [22]:
industries_terms = terms_temp[terms_temp.term_id.isin(term_ids)]
industries_terms

Unnamed: 0,term_id,name,slug,term_group,term_order
6,291,HiTech,hi-tech,0,0
16,301,All Industries,all-industries,0,0
22,307,Other,other,0,0
24,309,App Development,app-development,0,0
37,322,Koombea Culture,koombea-culture,0,0
41,326,FinTech,fintech,0,0
43,328,Retail,retail,0,0
48,333,IoT,iot,0,0
50,335,MedTech,medtech,0,0
113,398,Website Development,website-development,0,0


# Fix primary industry term

In [23]:
tables.keys()

dict_keys(['wp_actionscheduler_actions', 'wp_actionscheduler_claims', 'wp_actionscheduler_groups', 'wp_actionscheduler_logs', 'wp_aioseo_cache', 'wp_aioseo_notifications', 'wp_aioseo_posts', 'wp_aiowps_failed_logins', 'wp_aiowps_login_activity', 'wp_aiowps_login_lockdown', 'wp_options', 'wp_pmxe_google_cats', 'wp_pmxi_files', 'wp_pmxi_history', 'wp_pmxi_images', 'wp_pmxi_imports', 'wp_pmxi_posts', 'wp_pmxi_templates', 'wp_postmeta', 'wp_posts', 'wp_redirection_groups', 'wp_redirection_items', 'wp_term_relationships', 'wp_term_taxonomy', 'wp_termmeta', 'wp_terms', 'wp_usermeta', 'wp_users'])

In [24]:
tables["wp_postmeta"].head(100)

Unnamed: 0,meta_id,post_id,meta_key,meta_value
0,2,3,_wp_page_template,default
1,137,26,_edit_lock,1673230644:24
2,138,27,_edit_lock,1659576419:24
3,139,28,_edit_lock,1659026246:22
4,140,29,_edit_lock,1593103730:9
...,...,...,...,...
95,305,62,_edit_lock,1661356429:2
96,308,62,_edit_last,2
97,309,62,_genesis_title,Retail App Development Services&#x2d; Solution...
98,310,62,_genesis_description,Koombea is the leading mobile and web app deve...


# Get relationship blog-id and term-id

In [25]:
terms_blogs_relate_temp.head()

Unnamed: 0,object_id,term_taxonomy_id,term_order
0,35,5,0
1,36,5,0
2,37,5,0
3,38,5,0
4,39,5,0


In [26]:
industry_names, industry_slugs = [], []
# for index, blog in tqdm(blogs.iterrows(), total=blogs.shape[0]):
for index, blog in blogs.iterrows():
    id_blog = blog.id
    term_id = terms_blogs_relate_temp[terms_blogs_relate_temp.object_id == id_blog].term_taxonomy_id.tolist()
    term_name = industries_terms[industries_terms.term_id.isin(term_id)].name.tolist()
    term_slug = industries_terms[industries_terms.term_id.isin(term_id)].slug.tolist()
    # print(blog.slug, term_slug)
    industry_names.append(term_name[0] if term_name else "other")
    industry_slugs.append(term_slug[0] if term_slug else "Other")

In [27]:
blogs["term_slug"] = industry_slugs
blogs["term_name"] = industry_names

# Get blogs author name

I want to filter out the guest author for the industries recommendation

In [28]:
authors_temp = new_tables["wp_users"]
authors_temp.head()

Unnamed: 0,ID,user_login,user_pass,user_nicename,user_email,user_url,user_registered,user_activation_key,user_status,display_name,user_order
0,1,wpengine,$P$BrV/mS49FoI4VQFdksEqdvORPrjYSI1,wpengine,bitbucket@wpengine.com,http://wpengine.com,2020-02-26 15:15:45,,0,wpengine,0
1,2,koombea2020,$P$B0MzMqMseE7RkNvy8ND0pcQP6Ee5nm1,koombea2020,wp@koombea.com,https://www.koombea.com,2020-02-26 15:31:45,,0,koombea2020,0
2,4,Rhonalf Martinez,$P$BPsuoYWMtOxUaMus3E6p7c.r5D1ZkG0,rhonalf-martinez,rhonalf.martinez@koombea.com,,2017-09-19 00:00:00,,0,Rhonalf Martinez,0
3,6,jaime,$P$BUP6FIC/he30GAEEuAn..yFaKa0BXV/,jaime,jaime.vengoechea@koombea.com,,2018-05-04 00:00:00,,0,Jaime vengoechea,0
4,7,jtarud,$P$BI6CALypl3ojTt7opK3ZCckOJDYt.y0,jtarud,jonathan@koombea.com,,2018-05-15 00:00:00,,0,Jonathan Tarud,0


In [29]:
author_names, author_slug_names = [], []
for index, blog in blogs.iterrows():
    author_name = authors_temp[authors_temp.ID == blog.post_author]["display_name"].tolist()[0]
    author_slug_name = authors_temp[authors_temp.ID == blog.post_author]["user_nicename"].tolist()[0]
    author_names.append(author_name)
    author_slug_names.append(author_slug_name)

In [30]:
blogs["author_name"] = author_names
blogs["author_slug_name"] = author_slug_names

In [31]:
blogs.head(2)

Unnamed: 0,id,content,title,slug,post_author,post_date,post_date_str,term_slug,term_name,author_name,author_slug_name
0,2869,"<span style=""font-weight: 400;"">Are you curiou...",Scalable Applications: Curious Why Scalability...,why-scalability-matters-for-your-app,8,2021-03-22 09:00:00,"Mar 22, 2021",hi-tech,HiTech,Robert Kazmi,robertkazmi
1,2873,<strong>A Brief History of Scrum Agile Develop...,Is a Scrum Agile Development Process Right for...,is-a-scrum-agile-development-process-right-for...,8,2014-09-15 00:00:00,"Sep 15, 2014",hi-tech,HiTech,Robert Kazmi,robertkazmi


In [32]:
map_id_slug_author = dict((author.ID, author.user_nicename) 
                          for i, author in authors_temp.iterrows())
map_id_slug_author

{1: 'wpengine',
 2: 'koombea2020',
 4: 'rhonalf-martinez',
 6: 'jaime',
 7: 'jtarud',
 8: 'robertkazmi',
 9: 'fabian-altahona',
 12: 'david-bohorquez',
 14: 'mario-tatis',
 15: 'jose-gomez',
 16: 'alvaro-insignares',
 19: 'guest-author',
 21: 'jonathan-visbal',
 22: 'fabio-fonseca',
 23: 'ricardo-lapeira',
 24: 'natalia-novikova',
 26: 'nathan-mackowiak',
 27: 'tonymeazellkoombeacom',
 28: 'luz-perdomo',
 29: 'milton-casanovakoombea-com'}

# Clean HTML from content

In [33]:
from bs4 import BeautifulSoup

In [34]:
def clean_html(html_text):
    soup = BeautifulSoup(html_text, "html.parser")
    return soup.get_text()

In [35]:
tqdm.pandas(desc="cleaning html content")
blogs["content"] = blogs["content"].progress_apply(clean_html)

cleaning html content: 100%|██████████| 989/989 [00:02<00:00, 352.06it/s]


# Joining Titles + Contents

In [36]:
blogs["train_data"] = blogs["slug"].apply(lambda x: " ".join(x.split("-"))) + " " +\
                        blogs["content"]

# Getting Language

In [37]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

def detect_language(text):
    return 'es' if detect(text) != 'en' else 'en'

In [38]:
tqdm.pandas(desc="Detecting language")
blogs["lang"] = blogs["train_data"].progress_apply(detect_language)

Detecting language: 100%|██████████| 989/989 [00:21<00:00, 46.14it/s]


# Split the data between Spanish Data and English data

In [39]:
data_en = blogs[blogs.lang == "en"]
data_en.reset_index(drop=True, inplace=True)

In [40]:
data_es = blogs[blogs.lang == "es"]
data_es.reset_index(drop=True, inplace=True)

# Preprocess DataSet

In [41]:
import spacy
import re

from stop_words import get_stop_words
from nltk.corpus import stopwords

stop_words = list(get_stop_words('en')) + list(get_stop_words('es')) + ['app']
nltk_words = list(stopwords.words('english')) + list(stopwords.words('spanish')) + ['app']
stop_words.extend(nltk_words)

NLP_EN = spacy.load("en_core_web_sm")
NLP_ES = spacy.load("es_core_news_sm")

new_stops_words = ["y", "a", "en"]
for word in new_stops_words:
    lexeme = NLP_ES.vocab[word]
    lexeme.is_stop = True
    
def remove_unneccesary_whitespace(text):
    return ' '.join(text.split())

def basic_normalize(text, patterns_dict):
    text = text.lower()
    for pattern_re, replaced_str in patterns_dict:
        text = pattern_re.sub(replaced_str, text)
    return text

def get_regex_expression():
    # Basic normalization
    _patterns_ = [r'\'',
             r'\"',
             r'\.',
             r'<br \/>',
             r',',
             r'\(',
             r'\)',
             r'\!',
             r'\?',
             r'\;',
             r'\:',
             r'\s+']

    _replacements_ = [' \'  ',
                     '',
                     ' . ',
                     ' ',
                     ' , ',
                     ' ( ',
                     ' ) ',
                     ' ! ',
                     ' ? ',
                     ' ',
                     ' ',
                     ' ']

    # Match non alphanumeric characters
    NON_ALPHANUMERIC_REGEX = r'[^a-zA-Z0-9À-ÿ\u00f1\u00d1\s]'
    # Match any link or url from text
    LINKS_REGEX = r'https?:\/\/.*[\r\n]'
    # Match hashtags
    HASHTAGS_REGEX = r'\#[^\s]*'
    # Match twitter accounts
    TWITTER_ACCOUNTS_REGEX = r'\@[^\s]*'
    # Match Author:
    AUTHOR_REGEX = r'author'
    # Match email
    EMAIL_REGEX = r"\S*@\S+"
    # Group regex
    _patterns = [LINKS_REGEX,
                 HASHTAGS_REGEX,
                 TWITTER_ACCOUNTS_REGEX,
                 AUTHOR_REGEX,
                 EMAIL_REGEX,
                 NON_ALPHANUMERIC_REGEX]
    _replacements = [' ']*(len(_patterns))
    _patterns += _patterns_
    _replacements += _replacements_
    _patterns_dict = list((re.compile(p), r) for p, r in zip(_patterns, _replacements))
    return _patterns_dict

def preprocess_data(text, lang, patterns_dict, 
                    removing_stops=False, lemmatize=False):
    # Clean text
    text = remove_unneccesary_whitespace(basic_normalize(text, patterns_dict))
    # Choose the right tokenizer
    NLP = NLP_EN if lang == 'en' else NLP_ES
    # Tokenize the text of the blogs
    tokens = NLP(text)
    # Remove all punctuation marks
    tokens = [token for token in tokens if not token.is_punct]
    # Remove numbers or amount representation
    tokens = [token for token in tokens if not token.like_num]
    if removing_stops:
        # Remove stopswords
        tokens = [token for token in tokens if not token.is_stop]
        tokens = [token for token in tokens if token.text not in stop_words]
    if lemmatize:
        # Lemmatize words
        tokens = [token.lemma_.strip() for token in tokens]
    else:
        # Convert to str and lowerize
        tokens = [token.text.strip() for token in tokens]
        
    return tokens

In [42]:
patterns_dict = get_regex_expression()
print(patterns_dict)

[(re.compile('https?:\\/\\/.*[\\r\\n]'), ' '), (re.compile('\\#[^\\s]*'), ' '), (re.compile('\\@[^\\s]*'), ' '), (re.compile('author'), ' '), (re.compile('\\S*@\\S+'), ' '), (re.compile('[^a-zA-Z0-9À-ÿ\\u00f1\\u00d1\\s]'), ' '), (re.compile("\\'"), " '  "), (re.compile('\\"'), ''), (re.compile('\\.'), ' . '), (re.compile('<br \\/>'), ' '), (re.compile(','), ' , '), (re.compile('\\('), ' ( '), (re.compile('\\)'), ' ) '), (re.compile('\\!'), ' ! '), (re.compile('\\?'), ' ? '), (re.compile('\\;'), ' '), (re.compile('\\:'), ' '), (re.compile('\\s+'), ' ')]


In [43]:
tqdm.pandas(desc="preprocessing english data")
data_en["clean_data"] = data_en["train_data"].progress_apply(
    lambda x: preprocess_data(x, "en", patterns_dict,
                             removing_stops=True,
                             lemmatize=True)
) 

preprocessing english data: 100%|██████████| 979/979 [02:38<00:00,  6.18it/s]


In [44]:
services_info_preprocess = list(map(lambda x: preprocess_data(x, "en", patterns_dict, 
                                                        removing_stops=True,
                                                        lemmatize=True),
                              list(services_info_.values())))
services_info = dict((service_name, service_inf) for service_name, service_inf in zip(services_info_.keys(), 
                                                                                      services_info_preprocess))

industries_info_preprocess = list(map(lambda x: preprocess_data(x, "en", patterns_dict,
                                                               removing_stops=True,
                                                               lemmatize=True),
                                     list(industries_info_.values())))
industries_info = dict((industry_name, industry_inf) for industry_name, industry_inf in zip(industries_info_.keys(),
                                                                                           industries_info_preprocess))

json.dump(services_info, open(os.path.join(model_dir, "services_info.json"), "w"))
json.dump(industries_info, open(os.path.join(model_dir, "industries_info.json"), "w"))

In [45]:
tqdm.pandas(desc="preprocessing spanish data")
data_es["clean_data"] = data_es["train_data"].progress_apply(
    lambda x: preprocess_data(x, "es", patterns_dict,
                             removing_stops=True,
                             lemmatize=True)
) 

preprocessing spanish data: 100%|██████████| 10/10 [00:01<00:00,  7.53it/s]


# Generate n_grams model

In [46]:
from gensim.models.phrases import Phrases, Phraser

In [47]:
def train_ngram(data, ngram=1, **kwargs):
    ngram_model = None
    for i in range(1, ngram):
        ngram_model = Phrases(data, **kwargs)
        ngram_model = Phraser(ngram_model)
        data = list(ngram_model[data])
    if ngram_model is None:
        return data, None
    else:
        return data, ngram_model

In [48]:
ngram=2

bigrams_clean_data_es, phrases_model_es = train_ngram(data_es["clean_data"].tolist(),
                                                      ngram=ngram, threshold=4)
bigrams_clean_data_en, phrases_model_en = train_ngram(data_en["clean_data"].tolist(),
                                                      ngram=ngram, threshold=4)

# Save ngrams model

In [49]:
# Data Dir

if not(phrases_model_es is None and phrases_model_en is None):
    phrases_model_es.save(os.path.join(model_dir, 'phrases_model_es.pickle'))
    phrases_model_en.save(os.path.join(model_dir, 'phrases_model_en.pickle'))

# Save Data For Training

In [50]:
import json
# Save clean_data_en
json.dump(bigrams_clean_data_en, open(os.path.join(data_dir, 'train_data_en.json'), 'w'))
# Save clean_data_es
json.dump(bigrams_clean_data_es, open(os.path.join(data_dir, 'train_data_es.json'), 'w'))

In [51]:
# index_slug_en = [_data[2] for index, _data in enumerate(data_en)]
# index_slug_es = [_data[2] for index, _data in enumerate(data_es)]

In [52]:
# Save data_en
data_en[data_en.columns[:-3]].to_json(os.path.join(data_dir, "data_en.json"))
data_en[data_en.columns[:-3]].to_json(os.path.join(model_dir, "data_en.json"))

# Save data_es
data_es[data_es.columns[:-3]].to_json(os.path.join(data_dir, "data_es.json"))
data_es[data_es.columns[:-3]].to_json(os.path.join(model_dir, "data_es.json"))

In [53]:
# Save index_slug_es
json.dump(data_es["slug"].tolist(), open(os.path.join(data_dir, 'index_slug_es.json'), 'w'))
json.dump(data_es["slug"].tolist(), open(os.path.join(model_dir, 'index_slug_es.json'), 'w'))

# # Save index_slug_en
json.dump(data_en["slug"].tolist(), open(os.path.join(data_dir, 'index_slug_en.json'), 'w'))
json.dump(data_en["slug"].tolist(), open(os.path.join(model_dir, 'index_slug_en.json'), 'w'))

# Save Trending results

In [54]:
trending = pd.read_csv(os.path.join(data_dir, 'trending.csv'))
trending.head()

Unnamed: 0,slug,sum_unique_page_views,sum_avg_time,post_date,pageRank
0,web-development-goals,61,6.043388,2023-01-05 10:32:32,20.444283
1,software-architect-vs-software-engineer,250,5.165143,2022-08-03 11:58:46,19.471339
2,i-have-an-idea-for-an-app-now-what,144,5.297786,2022-09-22 17:00:00,19.280756
3,most-profitable-apps,56,5.588185,2023-01-13 11:49:00,19.13115
4,how-to-start-a-fintech-company,58,5.533839,2022-07-06 10:09:00,19.018902


In [55]:
trending_slug = trending.slug.tolist()[:20]
json.dump(trending_slug, open(os.path.join(model_dir, 'trending_result.json'), 'w'))

# Save popular results

In [56]:
popular = pd.read_csv(os.path.join(data_dir, 'popularity.csv'))
popular.head()

Unnamed: 0,slug,sum_unique_page_views,sum_avg_time,pageRank
0,how-to-know-if-your-website-isnt-ready-for-pri...,1,6.853299,55.51954
1,what-are-the-costs-when-it-comes-to-building-a...,2,6.556778,53.552839
2,how-to-vet-for-the-right-app-development-partner,1,6.565265,53.215267
3,penetration-testing-vs-vulnerability-scanning,1,6.466145,52.422305
4,data-privacy-and-security-concerns-hinder-digi...,5,6.320169,52.353108


In [57]:
popular_slug = popular.slug.tolist()[:20]

In [58]:
json.dump(popular_slug, open(os.path.join(model_dir, 'popular_result.json'), 'w'))

# Close tunnel

In [59]:
tunnel.close()