In [1]:
import pandas as pd
import numpy as np
import re #regex
import time
from unicodedata import normalize
import requests as rq
import bs4 as bs4 #beautifulsoup4
import json
import tqdm
import glob
import joblib as jb
import matplotlib.pyplot as plt

# NLP tools
from more_itertools import unique_everseen
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import CountVectorizer # Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF
from scipy.sparse import hstack

stop_pt = stopwords.words('portuguese')

# Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, confusion_matrix
from sklearn.metrics import plot_confusion_matrix

# Model optimization
from skopt import forest_minimize

In [501]:
pd.set_option("max.columns", 100)
pd.set_option('display.min_rows', 80)

This project is based on the YouTube Video Recommender project designed and taught by Mário Filho on his Hotmart course  
I decided to do something similar but trying to predict events that I would like to attend in São Paulo. For that, I collected information from events from two ticket-selling aggregator websites: Sympla and Eventbrite, manually labeled them and fit several models to evaluate which one had better performance.

## DATA COLLECTION

#### Collecting search pages - Sympla

In [1]:
# Request headers
headers_sympla = {'authority': 'www.sympla.com.br',
           'method': 'GET',
           'path': '/eventos/sao-paulo-sp?ordem=data&pagina=2&value=sao-paulo-sp',
           'scheme': 'https',
           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 
           'accept-encoding': 'gzip, deflate, br',
           'accept-language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
            'sec-fetch-dest': 'document',
           'sec-fetch-mode': 'navigate',
           'sec-fetch-site': 'none',
           'upgrade-insecure-requests': '1',
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
          }

In [2]:
url_sympla = 'https://www.sympla.com.br/eventos/sao-paulo-sp?ordem=data&pagina={page}&value=sao-paulo-sp'

In [None]:
# When I ran this code, there were 49 search pages available
for page in range(1,50):
    url = url_sympla.format(page=page)
    print(url)
    response = rq.get(url, headers=headers_sympla)
        
    with open("./Raw_Data/Sympla_{}.html".format(page), 'w+', encoding='utf-8') as output:
        output.write(response.text)
    time.sleep(5)

In [210]:
for page in range(1,50):
    with open("./Raw_Data/Sympla_{}.html".format(page), \
              'r+', encoding='utf-8') as inp:
        html_page = inp.read()
        parsed_html = bs4.BeautifulSoup(html_page)
        tags = parsed_html.find_all('a', attrs={'class': 'sympla-card card-normal w-inline-block'})
        
        for t in tags:
            # Link
            link = t['href']
            
            # Name
            name_aux = t.find('div', attrs={'class': 'event-name event-card'}).text
            name = re.sub(' +', ' ', name_aux)
            
            # Location
            location_aux = t.find('div', attrs={'class': 'event-location event-card'}).text.strip()
            location = re.sub(' +', ' ', location_aux)
            
            # Date
            if t.find('div', attrs={'class': 'event-date-day'}) != None:
                event_day = t.find('div', attrs={'class': 'event-date-day'}).text
                event_month = t.find('div', attrs={'class': 'event-card-date-month'}).text
                date = " ".join([event_day, event_month])
            else:
                date = np.nan
            
            with open("./parsed_events_sympla.json", \
                          'a+', encoding='utf-8') as outp:
                event_data = {"link": link, "name": name, "location": location, 'date': date}
                outp.write("{}\n".format(json.dumps(event_data, ensure_ascii=False)))

#### Collecting search pages - Eventbrite (EB)

In [104]:
url_eventbrite = 'https://www.eventbrite.com.br/d/brazil--s%C3%A3o-paulo/all-events/?page={page}'

In [None]:
# When I ran this code, there were 43 search pages available
for page in range(1,44):
    url = url_eventbrite.format(page=page)
    print(url)
    response = rq.get(url)
        
    with open("./Raw_Data/Eventbrite_{}.html".format(page), 'w+', encoding='utf-8') as output:
        output.write(response.text)
    time.sleep(5)

In [214]:
for page in range(1,44):
    with open("./Raw_Data/Eventbrite_{}.html".format(page), \
              'r+', encoding='utf-8') as inp:
        html_page = inp.read()
        parsed_html = bs4.BeautifulSoup(html_page)
        tags = parsed_html.find_all('div', attrs={'class': 'eds-event-card-content__content'})
        
        for t in tags:
            # Link
            link = t.find('a', attrs={'class': 'eds-event-card-content__action-link'})['href']
            
            # Name
            name_aux = t.find('div', attrs={'class': 'eds-is-hidden-accessible'}).text
            name = re.sub(' +', ' ', name_aux)
            
            # Location
            location_aux = t.find('div', attrs={'class': 'card-text--truncated__one'}).text
            location = re.sub(' +', ' ', location_aux)
            
            # Date
            date_aux = t.find('div', attrs={'class': \
                                                    'eds-text-color--primary-brand eds-l-pad-bot-1 eds-text-weight--heavy eds-text-bs'}).text
            date = re.sub(' +', ' ', date_aux)
        
            with open("./parsed_events_eb.json", \
                          'a+', encoding='utf-8') as outp:
                event_data = {"link": link, "name": name, "location": location, "date": date}
                outp.write("{}\n".format(json.dumps(event_data, ensure_ascii=False)))

### Checking search pages data

In [233]:
events_sympla = pd.read_json("./parsed_events_sympla.json", lines=True)
events_sympla = events_sympla.drop_duplicates().reset_index(drop=True)
events_sympla.shape

(1018, 4)

In [163]:
events_eb = pd.read_json("./parsed_events_eb.json", lines=True)
events_eb = events_eb.drop_duplicates().reset_index(drop=True)
events_eb.shape

(729, 4)

In [164]:
events_eb.head()

Unnamed: 0,link,name,location,date
0,https://www.eventbrite.com.br/e/conexao-teen-m...,CONEXÃO TEEN - MINISTROS,"Renascer Hall • Mooca, SP","sáb, out 24, 14:30"
1,https://www.eventbrite.com.br/e/mini-bazar-clo...,Mini Bazar Clozét (HORA MARCADA E ENTRADA CONT...,"Rua Antônio Salvia, 252 • Parque Maria Helena, SP","sáb, out 24, 09:00 + 7 eventos mais"
2,https://www.eventbrite.com.br/e/hub-surrender-...,HUB SURRENDER - 24/10 - 19h,"Harvest Field Church • Barra Funda, SP","sáb, out 24, 19:00"
3,https://www.eventbrite.com/e/dna-diferente-32-...,DNA DIFERENTE (32 ),Holiday Inn Sao Paulo Parque Anhembi • Parque ...,"sáb, nov 21, 08:00"
4,https://www.eventbrite.com/e/culto-iasd-pinhei...,Culto IASD Pinheiros,"R. Cláudio Soares, 167 • Pinheiros, SP","sáb, out 24, 09:00 + 42 eventos mais"


### Collecting event pages

In [395]:
def save_events_pages(links_series, platform, headers=None):
    for url in links_series:
        print(url)
        response = rq.get(url, headers=headers)
        
        if platform == 'eventbrite':
            link_name = re.search("e/(.*)\?", url).group(1)
            files_location = "./Raw_Data/Events_EB/{}.html"

        elif platform == 'sympla':
            if re.search("bileto", url) != None:
                continue
            else:
                link_name = re.search("br/(.*)", url).group(1)
            
            files_location = "./Raw_Data/Events_Sympla/{}.html"
            
        with open(files_location.format(link_name), 'w+', encoding='utf-8') as outp:
            outp.write(response.text)
            time.sleep(1)

In [237]:
links_series_eb = events_eb['link']
links_series_sympla = events_sympla['link']
save_events_pages(links_series_eb, platform='eventbrite')
save_events_pages(links_series_sympla, platform='sympla', headers=headers_sympla)

#### Parsing event pages - EB

In [417]:
with open("./parsed_events_info_eb.json", 'w+', encoding='utf-8') as output:
    for event_html in tqdm.tqdm(glob.glob("./Raw_Data/Events_EB/*")):
        with open(event_html, 'r+', encoding='utf-8') as inp:
            html_page = inp.read()
            parsed_html = bs4.BeautifulSoup(html_page, 'html.parser')

            # Name, only 1
            name = None
            if parsed_html.find('h1', attrs={"class":re.compile(r"listing")}) != None:
                name = parsed_html.find('h1', attrs={"class":re.compile(r"listing")}).text.strip()
            # Organizer, only 1
            organizer = parsed_html.find('div', attrs={"class":re.compile(r"title")}).text.strip()  
            # Description, several
            content_cols = parsed_html.find_all('div', attrs={"class":re.compile(r"content")})
            # Dates, several
            time_cols = parsed_html.find_all('p', attrs={'class', re.compile(r"time")})
            # Location, several
            location_cols = parsed_html.find_all("div", attrs={'class': 'event-details__data'})
            # Labels, several
            labels_aux = parsed_html.find_all('a', attrs={'class': 'js-d-track-link listing-tag badge badge--tag l-mar-top-2'})
            # Price, only 1
            price = None
            if parsed_html.find("div", attrs={'class': 'js-display-price'}) != None:
                price = parsed_html.find("div", attrs={'class': 'js-display-price'}).text.strip()
            # Other info, only 1
            info = None
            if parsed_html.find("div", attrs={'class': 'listing-panel-info__sm-text-center'}) != None:
                info = parsed_html.find("div", attrs={'class': 'listing-panel-info__sm-text-center'}).text.strip()

            event_data = dict()
            event_data['name'] = name
            event_data['organizer'] = organizer
            event_data['price'] = price
            event_data['info'] = info
            labels = [l.text.strip() for l in labels_aux]
            event_data['labels'] = " ".join(labels)

            # Creating several columns
            for c in content_cols:
                col_name = "_".join(c['class'])
                event_data[col_name] = c.get_text(" ").strip()

            for c in time_cols:
                col_name = "_".join(c['class'])
                event_data[col_name] = c.get_text(" ").strip()

            for c in location_cols:
                col_name = "_".join(c['class'])
                event_data[col_name] = c.get_text(" ").strip()

            output.write("{}\n".format(json.dumps(event_data, ensure_ascii=False)))

100%|██████████| 729/729 [01:02<00:00, 11.65it/s]


#### Parsing event pages - Sympla

In [631]:
with open("./parsed_events_info_sympla.json", 'w+', encoding='utf-8') as output:
    for event_html in tqdm.tqdm(glob.glob("./Raw_Data/Events_Sympla/*")):
        with open(event_html, 'r+', encoding='utf-8') as inp:
            html_page = inp.read()
            parsed_html = bs4.BeautifulSoup(html_page, 'html.parser')
            
            # Name
            name = parsed_html.find('h1').text.strip()
            # Dates
            date = None
            if parsed_html.find('div', attrs={"class":'event-info-calendar'}) != None:
                date = parsed_html.find('div', attrs={"class":'event-info-calendar'}).text.strip()
            # Location
            location = None
            if parsed_html.find('div', attrs={"class":'event-info-city'}) != None:
                location = parsed_html.find('div', attrs={"class":'event-info-city'}).text.strip()
            # Organizer
            organizer = None
            if parsed_html.find('div', attrs={"id":'produtor'}) != None:
                org_aux = parsed_html.find('div', attrs={"id":'produtor'})
                if org_aux.find('h4', attrs={"class":re.compile(r"kill")}) != None:
                    organizer = org_aux.find('h4', attrs={"class":re.compile(r"kill")}).text.strip()
            # Organizer description
            org_desc = None
            if org_aux.find('p', attrs={"id":'org-description'}) != None:
                org_desc = org_aux.find('p', attrs={"id":'org-description'}).text.strip()
                org_desc = re.sub(' +', ' ', org_desc)
            # Description
            description = None
            if parsed_html.find('div', attrs={"id":'event-description'}) != None:
                description = parsed_html.find('div', attrs={"id":'event-description'}).get_text(" ").strip()
                description = re.sub(' +', ' ', description)
            # Price, first one
            price = None
            if parsed_html.find('form', attrs={"id":'ticket-form'}) != None:
                price_aux = parsed_html.find('form', attrs={"id":'ticket-form'})
                if len(price_aux.find_all('span')) >= 3:
                    price = price_aux.find_all('span')[2].text.strip()
            # Other info
            info = None
            if parsed_html.find('td', attrs={'class':'opt-panel'}) != None:
                info = parsed_html.find('td', attrs={'class':'opt-panel'}).text.strip()
            # Labels
            # NA
    
            event_data = dict()
            event_data['name'] = name
            event_data['date'] = date
            event_data['location'] = location
            event_data['price'] = price
            event_data['info'] = info
            event_data['description'] = description
            event_data['organizer'] = organizer
            event_data['org_desc'] = org_desc

            output.write("{}\n".format(json.dumps(event_data, ensure_ascii=False)))

100%|██████████| 889/889 [01:32<00:00,  9.64it/s]


### Loading collected information for processing

In [712]:
week_map = {"dom": 0, "sun": 0,
               "seg": 1, "mon": 1,
               "ter": 2, "tue": 2, 
               "qua": 3, "wed": 3, 
               "qui": 4, "thu": 4, "quinta": 4,
               "sex": 5, "fri": 5, 
               "sab": 6, "sat": 6, "sabado": 6}

In [780]:
month_map = {"janeiro": "Jan",
              "fevereiro": "Feb",
              "marco": "Mar", 
              "abril": "Apr", 
              "maio": "May", 
              "junho": "Jun",
              "julho": "Jul",
              "agosto": "Aug", 
              "setembro": "Sep", 
              "outubro": "Oct", 
              "novembro": "Nov",
              "dezembro": "Dec"}

In [829]:
events_info_sympla_raw = pd.read_json("./parsed_events_info_sympla.json", lines=True)
events_info_sympla_raw.dropna(axis=0, subset=['name'], inplace=True)
events_info_sympla_raw = events_info_sympla_raw.reset_index(drop=True)
events_info_sympla_raw.head()

Unnamed: 0,name,date,location,price,info,description,organizer,org_desc
0,01/11 - Culto de Domingo - Manhã,,,Inscrições até 01/11/2020,0,,Igreja Monte Carmelo,Conheça-nos melhor\n\nhttp://monte-carmelo.onl...
1,01/11 - Culto de Domingo - Noite,,,Inscrições até 01/11/2020,0,,Igreja Monte Carmelo,Conheça-nos melhor\n\nhttp://monte-carmelo.onl...
2,05/11- Culto de Quinta,,,Inscrições até 05/11/2020,0,,Igreja Monte Carmelo,
3,10º ESLADJA E 22º ANIVERSARIO DA ADJA-TRABALHA...,"14 de novembro de 2020, 14h30>21h","R. Uhland, 204 - São Paulo, SP","R$ 25,00",0,Em breve acontecerá a nossa 10ª ESLADJA - ESCO...,[email protected],Igreja Evangélica Assembleia de Deus-Ministéri...
4,15º Piquenique Azul,,,Inscrições até 22/11/2020,0,,Nathália Noschese - Natype1,Nathália Noschese é uma jovem que tem Diabetes...


In [903]:
events_info_sympla = pd.DataFrame()

In [891]:
events_info_eb_raw = pd.read_json("./parsed_events_info_eb.json", lines=True)
events_info_eb_raw.dropna(axis=0, subset=['name'], inplace=True)
events_info_eb_raw = events_info_eb_raw.reset_index(drop=True)
events_info_eb = pd.DataFrame()

#### Defining functions to process columns

In [894]:
def remove_whitespace(series):
    return series.str.replace("\n|\t","").str.replace(" +"," ").str.strip()

In [973]:
# Remove accented characters, punctuation and uppercase
def normalize_series(series):
    processed = []
    processed = [normalize('NFKD', value).encode('ASCII','ignore').decode('ASCII') \
           if (value != None) & (value != np.nan) & (type(value) != float) else np.nan for index, value in series.items()]
    return remove_whitespace(pd.Series(processed).str.lower().str.replace(r"[^a-zA-Z0-9]"," ").str.strip())

In [897]:
def remove_numbers(series):
    return series.str.replace(r"[0-9]","")

In [898]:
# Use mean price when there is a range of ticket prices
def determine_price(row):
    if row[1] < row[0]:
        return row[0]
    else:
        return (row[0]+row[1])/2

In [899]:
# Remove specific non-definitional words from all events
def clean_description(series):
    return series.str.replace('(sobre este evento)|(about this event)|(a propos de cet evenement)|(acerca deste evento)|(acerca de este evento)', '')

In [904]:
def join_na(row):
    if type(row[0]) != float:
        return " ".join(row)
    else:
        return np.nan

### Processing columns

I decided to process each column as follows:
- Name: make it lowercase, remove punctuation, accented chars and numbers
- Date: use day of week (DOW) as possible feature
- Location: make it lowercase, remove punctuation, accented chars and numbers
- Price: use mean or lowest price
- Info: make it lowercase, remove punctuation, accented chars and numbers
- Description: make it lowercase, remove punctuation, accented chars
- Organizer: make it lowercase, remove punctuation, accented chars

#### Processing columns - EB

In [900]:
# Name
events_info_eb['name'] = remove_whitespace(remove_numbers(normalize_series(events_info_eb_raw['name'])))

# DOW
events_info_eb_raw.rename(columns={'js-date-time-first-line':'full_date'}, inplace=True)
events_info_eb['dow'] = events_info_eb_raw['full_date'].str.extract('(.*?),', expand=False).str.lower().str.replace('á','a').map(week_map)

# Location
events_info_eb_raw.rename(columns={'event-details__data':'location_raw'}, inplace=True)
location_aux = pd.Series(dtype='object')
location_aux = remove_whitespace(events_info_eb_raw['location_raw'].str.replace('Ver mapa',''))
events_info_eb['location'] = remove_whitespace(remove_numbers(normalize_series(location_aux)))

# Price; if null, event is free
events_info_eb_raw['full_price'] = events_info_eb_raw['price']
events_info_eb_raw.loc[(events_info_eb_raw['full_price'] == '') | (events_info_eb_raw['full_price'] == 'Free') | \
                   (events_info_eb_raw['full_price'] == 'Gratuito') | (events_info_eb_raw['full_price'] == 'Gratuit'),'full_price'] = 'R$0'
# Fetching range of prices
price_aux = events_info_eb_raw['full_price'].str.replace('\.|[A-Z]|\$|\s','').str.replace('\,','.').str.extract(r'(\d+)–*(\d*)')
price_aux[0] = pd.to_numeric(price_aux[0], downcast='float')
price_aux[1] = pd.to_numeric(price_aux[1], downcast='float')
# Lowest price
events_info_eb_raw['min_price'] = price_aux[0]
# Highest price
price_aux.loc[price_aux[1].isna(), 1] = 0
events_info_eb_raw['max_price'] = price_aux[1]
# Feature price
events_info_eb['price'] = pd.Series(price_aux.apply(determine_price, axis=1), dtype='float')

# Info
events_info_eb['info'] = remove_whitespace(remove_numbers(normalize_series(events_info_eb_raw['info'])))

# Description
events_info_eb_raw.rename(columns={'structured-content_g-cell_g-cell-10-12_g-cell-md-1-1':'description_raw'}, inplace=True)
events_info_eb['description'] = remove_whitespace(clean_description(normalize_series(events_info_eb_raw['description_raw'])))

# Organizer
events_info_eb_raw.loc[events_info_eb_raw['organizer'] == '', 'organizer'] = None
org_aux = events_info_eb_raw['organizer'].str.replace('por |by ','')
events_info_eb['organizer'] = remove_whitespace(normalize_series(org_aux))

#### Processing columns - Sympla

In [905]:
# Name
events_info_sympla['name'] = remove_whitespace(remove_numbers(normalize_series(events_info_sympla_raw['name'])))

# DOW
date_aux = normalize_series(events_info_sympla_raw['date'])
date_aux = date_aux.str.extract(r'^(\d+) de ([a-z]+) de (\d+)')
date_aux[1] = date_aux[1].map(month_map)
date_aux = date_aux.apply(join_na, axis=1)
date_aux = pd.to_datetime(date_aux, format="%d %b %Y")
events_info_sympla['dow'] = date_aux.dt.dayofweek

# Location
events_info_sympla['location'] = remove_whitespace(remove_numbers(normalize_series(events_info_sympla_raw['location'])))

# Price; if null, event is free
events_info_sympla_raw.loc[events_info_sympla_raw['price'].isnull(), 'price'] = 'R$ 0'
price_aux = events_info_sympla_raw['price'].str.replace('\.|R\$|\s','').str.replace('\,','.')
price_aux[price_aux.str.contains('[azAZ]')] = 0
events_info_sympla['price'] = pd.Series(price_aux, dtype='float')

# Info
events_info_sympla_raw.loc[events_info_sympla_raw['info'] == '0', 'info'] = float('nan')
events_info_sympla['info'] = remove_whitespace(remove_numbers(normalize_series(events_info_sympla_raw['info'])))

# Description
events_info_sympla['description'] = remove_whitespace(normalize_series(events_info_sympla_raw['description']))

# Organizer
events_info_sympla['organizer'] = remove_whitespace(normalize_series(events_info_sympla_raw['organizer']))

In [906]:
events_info_sympla

Unnamed: 0,name,dow,location,price,info,description,organizer
0,culto de domingo manha,,,0,,,igreja monte carmelo
1,culto de domingo noite,,,0,,,igreja monte carmelo
2,culto de quinta,,,0,,,igreja monte carmelo
3,o esladja e o aniversario da adja trabalhando ...,5.0,r uhland sao paulo sp,25.00,,em breve acontecera a nossa 10a esladja escola...,email protected
4,o piquenique azul,,,0,,,nathalia noschese natype1
...,...,...,...,...,...,...,...
884,xvi jornada paulista de neurofisiologia clinica,,,150.00,,,hospital alemao oswaldo cruz
885,xxviii domingo do tempo comum cor liturgica verde,6.0,paroquia sao sebastiao sao paulo sp,0,encerrado,devido as medidas restritivas as missas presen...,paroquia sao sebastiao
886,ya experience sunrise,5.0,garage coworking sao paulo sp,95.00,,experience sunrise a vida em movimento e o cam...,claudia faria fundadora do yoga adventure
887,yoga outubrorosaflow,1.0,parque ibirapuera estacionamento mam sao paulo sp,0,encerrado,,flow


### Joining data and saving for manual labeling

In [None]:
df = pd.concat([events_info_eb, events_info_sympla]).reset_index(drop=True)
df = df.drop_duplicates()
df.to_csv('./features_before_labeling.csv', encoding='utf-8')

## PREPROCESSING

### Loading data after labeling

In [2]:
# Data was randomly shuffled while labeling in order to mix events from both aggregator websites
df = pd.read_csv("./features_with_labels.csv", index_col=0).reset_index(drop=True)
df = df.drop(['RAND'], axis=1)

### Preparing string columns for vectorization

I applied tokenization, stopword removal and duplicate word removal in order to prepare text for vectorization  
Stemming was not applied as default, but is implemented and can be performed by setting stem=True

In [3]:
def col_text_processing(series, stem=False):
    processed = []
    for index, value in series.items():
        if type(value) != float:
            # Tokenization
            value = word_tokenize(value)

            # Stopword removal
            value = [w for w in value if w not in stop_pt]

            # Duplicate removal
            value = list(unique_everseen(value))

            # Stemming
            if stem:
                stemmer = RSLPStemmer()
                value = [stemmer.stem(w) for w in value]

            value = " ".join(value)
        
        processed.append(value)
    
    return pd.Series(processed, name=series.name)

def df_text_processing(df, cols, stem=False):
    aux_df = pd.DataFrame()
    aux_df = df[df.columns.difference(cols)]
    
    for c in cols:
        aux_df = pd.concat([aux_df, col_text_processing(df[c], stem)], axis=1)
    
    return aux_df

In [4]:
# Info column not selected as feature for having too few non-null values and not so relevant information
text_cols = ['name', 'location', 'description', 'organizer']
# Diff as in different from textual
diff_cols = df.columns.difference(text_cols)
diff_cols = diff_cols.drop(['info', 'y'])

# Remove rows with null values on the text_cols
df.dropna(axis=0, subset=text_cols, how='any', inplace=True)
df.reset_index(drop=True, inplace=True)

# Process text_cols
df = df_text_processing(df, text_cols)

# Saving labels
y = df['y'].copy()
df.drop('y', axis=1, inplace=True)

### Splitting data - train & valid

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size = 0.35, random_state=0)
print("y train: {}, y valid: {}".format(y_train.sum(), y_valid.sum()))

X_train_diff = X_train[diff_cols].copy()
X_valid_diff = X_valid[diff_cols].copy()
X_train_text = X_train[text_cols].copy()
X_valid_text = X_valid[text_cols].copy()

y train: 34, y valid: 18


In [133]:
X_train_diff.head()

Unnamed: 0,dow,price
37,3.0,0.0
143,6.0,40.0
436,0.0,487.5
158,5.0,15.0
613,,0.0


In [134]:
X_train_text.head()

Unnamed: 0,name,location,description,organizer
37,exposicao luzes memoria,iac instituto arte contemporanea sao paulo sp,exposicao luzes memoria curadoria marilucia bo...,instituto arte contemporanea
143,pool candy,local definir sao paulo sp,1 edicao pool candy line up breve hostess yasm...,alex
436,c i m congresso internacional academico microp...,organizador nao aceita reembolsos art lei codi...,ciami congresso internacional academico microp...,renata constante barcelli
158,comedia solta,acustico business sao paulo sp,espetaculo stand up comedia solta iniciativa f...,acustico business
613,curso energia solar hortolandia,hotel horto plaza hortolandia sp rua zacarias ...,curso energia solar hortolandia corte ate 100 ...,evento solar fotovoltaico


### Feature engineering - numerical

#### Scaling price values with StandardScaler

In [6]:
# Fit_transform on training data
price_scaled_df = pd.DataFrame(X_train_diff['price'])
price_scaled_df.rename(columns={'price':'price_scaled'}, inplace=True)
price_array = X_train_diff['price'].to_numpy()

scaler = StandardScaler()
price_scaled_array = scaler.fit_transform(price_array.reshape(-1,1))
price_scaled_df = pd.DataFrame(price_scaled_array, columns=price_scaled_df.columns, index=price_scaled_df.index)

X_train_diff.drop(['price'], axis=1, inplace=True)
X_train_diff = pd.concat([X_train_diff, price_scaled_df], axis=1)

In [7]:
# Transform on validation data
price_scaled_df = pd.DataFrame(X_valid_diff['price'])
price_scaled_df.rename(columns={'price':'price_scaled'}, inplace=True)
price_array = X_valid_diff['price'].to_numpy()

price_scaled_array = scaler.transform(price_array.reshape(-1,1))
price_scaled_df = pd.DataFrame(price_scaled_array, columns=price_scaled_df.columns, index=price_scaled_df.index)

X_valid_diff.drop(['price'], axis=1, inplace=True)
X_valid_diff = pd.concat([X_valid_diff, price_scaled_df], axis=1)
X_valid_diff

Unnamed: 0,dow,price_scaled
915,5.0,0.927592
590,6.0,-0.266268
108,,0.042945
833,0.0,1.977593
1067,4.0,2.333106
...,...,...
870,0.0,-0.477921
352,3.0,-0.087090
95,5.0,-0.477921
113,6.0,-0.477921


### Feature engineering - categorical & NLP

#### Imputing mode for null DOW values

In [8]:
# Obtaining mode from training data only
mode_dow = X_train_diff['dow'].mode()
X_train_diff['dow'].fillna(mode_dow[0], inplace=True)
X_valid_diff['dow'].fillna(mode_dow[0], inplace=True)

#### Transforming DOW column to dummy variables with OneHotEncoder

In [9]:
# Training data
dow_array = X_train_diff['dow'].to_numpy()

dow_encoder = OneHotEncoder()
dow_1hot = dow_encoder.fit_transform(dow_array.reshape(-1,1))

X_train_diff = pd.concat([X_train_diff, pd.DataFrame(dow_1hot.toarray(), index=X_train_diff.index)], axis=1)
X_train_diff.drop(['dow'], axis=1, inplace=True)

In [10]:
# Validation data 
# As this is a OneHotEncoder, it wouldn't be necessary to do this this way, but it doesn't hurt to be consistent with the appropriate
# treatment of training and validation data
dow_array = X_valid_diff['dow'].to_numpy()

dow_1hot = dow_encoder.transform(dow_array.reshape(-1,1))

X_valid_diff = pd.concat([X_valid_diff, pd.DataFrame(dow_1hot.toarray(), index=X_valid_diff.index)], axis=1)
X_valid_diff.drop(['dow'], axis=1, inplace=True)

In [224]:
X_valid_diff

Unnamed: 0,price_scaled,0,1,2,3,4,5,6
915,0.927592,0.0,0.0,0.0,0.0,0.0,1.0,0.0
590,-0.266268,0.0,0.0,0.0,0.0,0.0,0.0,1.0
108,0.042945,0.0,0.0,0.0,0.0,0.0,1.0,0.0
833,1.977593,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1067,2.333106,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
870,-0.477921,1.0,0.0,0.0,0.0,0.0,0.0,0.0
352,-0.087090,0.0,0.0,0.0,1.0,0.0,0.0,0.0
95,-0.477921,0.0,0.0,0.0,0.0,0.0,1.0,0.0
113,-0.477921,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### Defining function for fitting and transforming textual data (and also saving vectorizers for production purposes)

In [11]:
def fit_transform_text(col_train, col_valid, min_df=1, ngram_range=(1,1), save=False):
    
    vect = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    text_features_train = vect.fit_transform(col_train)
    #print("# features for {}: {}".format(col_train.name, text_features_train.shape[1]))
    text_features_valid = vect.transform(col_valid)
    
    if save:
        jb.dump(vect, "./Deploy/vect_{}.pkl.z".format(col_train.name))
        
    return text_features_train, text_features_valid

## MODELING

In [12]:
# Defining default hyperparameters for some models

# Random Forest (RF) default
rf_default = dict()
rf_default['n_estimators'], rf_default['max_depth'], rf_default['min_samples_split'], rf_default['min_samples_leaf'] = 100, None, 2, 1
rf_default['max_features'], rf_default['bootstrap'] = 'auto', True

# Light GBM (LGBM) default
lgbm_default = dict()
lgbm_default['n_estimators'], lgbm_default['max_depth'], lgbm_default['min_child_samples'], lgbm_default['subsample'] = 100, -1, 20, 1
lgbm_default['colsample_bytree'], lgbm_default['learning_rate'] = 1, 0.1

In [13]:
# Defining optimal hyperparameters for some models (obtained after running optimization functions below)

# RF parameters
rf_params = dict()
rf_params['n_estimators'], rf_params['max_depth'], rf_params['min_samples_split'], rf_params['min_samples_leaf'] = 763, 5, 20, 10
rf_params['max_features'], rf_params['bootstrap'] = 'sqrt', False
class_weight_rf = 'balanced'
min_df_rf = 2
ngram_range_rf = (1,1)
diff_feats_rf = False

# LGBM parameters
lgbm_params = dict()
lgbm_params['n_estimators'], lgbm_params['max_depth'], lgbm_params['min_child_samples'], lgbm_params['subsample'] = 650, 6, 11, 0.11295588601409831
lgbm_params['colsample_bytree'], lgbm_params['learning_rate'] = 0.09968988985022398, 0.0018192116907497335
class_weight_lgbm = 'balanced'
min_df_lgbm = 4
ngram_range_lgbm = (1,5)
diff_feats_lgbm = False

### Defining functions for training and evaluating models

In [15]:
def define_model(m_type, name, save=False, rf_params=rf_default, lgbm_params=lgbm_default, \
                 diff_feats=True, class_weight='balanced', min_df=1, ngram_range=(1,1)):
    m = dict()
    m['m_type'], m['name'], m['diff_feats'], m['class_weight'] = m_type, name, diff_feats, class_weight
    m['min_df'], m['ngram_range'], m['rf_params'], m['lgbm_params'], m['save'] = min_df, ngram_range, rf_params, lgbm_params, save
    return m

In [21]:
def train_evaluate_model(m_type, name, save, rf_params=rf_default, lgbm_params=lgbm_default, \
                         diff_feats=True, class_weight='balanced', min_df=1, ngram_range=(1,1)):
    
    models = {
        'lr': LogisticRegression(C=0.8, penalty='l2', n_jobs=6, random_state=0),
        'tree': DecisionTreeClassifier(class_weight=class_weight, random_state=0),
        'rf': RandomForestClassifier(n_estimators=rf_params['n_estimators'], max_depth=rf_params['max_depth'], \
                                     min_samples_split=rf_params['min_samples_split'], min_samples_leaf=rf_params['min_samples_leaf'], \
                                     max_features=rf_params['max_features'], bootstrap=rf_params['bootstrap'], class_weight=class_weight, \
                                     n_jobs=6, random_state=0),
        'lgbm': LGBMClassifier(n_estimators=lgbm_params['n_estimators'], max_depth=lgbm_params['max_depth'], \
                                     min_child_samples=lgbm_params['min_child_samples'], subsample=lgbm_params['subsample'], \
                                     colsample_bytree=lgbm_params['colsample_bytree'], learning_rate=lgbm_params['learning_rate'], \
                                     class_weight=class_weight, n_jobs=6, random_state=0)
    }  
    
    X_train = pd.DataFrame()
    X_valid = pd.DataFrame()
    
    for c in text_cols:
        text_features_train, text_features_valid = fit_transform_text(X_train_text[c], X_valid_text[c], min_df=min_df, ngram_range=ngram_range, \
                                                                      save=save)
        X_train = hstack([X_train, text_features_train])
        X_valid = hstack([X_valid, text_features_valid])
    
    if diff_feats:
        X_train = hstack([X_train_diff, X_train])
        X_valid = hstack([X_valid_diff, X_valid])
    
    model = models.get(m_type, '')
    
    model.fit(X_train, y_train)
    # Train data
    prob_train = model.predict_proba(X_train)[:,1]
    y_pred_train = model.predict(X_train)
    
    # Valid data
    prob_valid = model.predict_proba(X_valid)[:,1]
    y_pred_valid = model.predict(X_valid)
    
    evaluation_dict = dict()
    evaluation_dict['model_type'] = m_type
    evaluation_dict['name'] = name
    evaluation_dict['train_miscl_error'] = 1-accuracy_score(y_train, y_pred_train)
    evaluation_dict['valid_miscl_error'] = 1-accuracy_score(y_valid, y_pred_valid)
    evaluation_dict['average_precision'] = average_precision_score(y_valid, prob_valid)
    evaluation_dict['auc'] = roc_auc_score(y_valid, prob_valid)
    evaluation_dict['accuracy'] = accuracy_score(y_valid, y_pred_valid)
    evaluation_dict['precision'] = precision_score(y_valid, y_pred_valid)
    evaluation_dict['recall'] = recall_score(y_valid, y_pred_valid)
    evaluation_dict['f1_score'] = f1_score(y_valid, y_pred_valid)
    
    
    return model, evaluation_dict

In [23]:
def compare_models(eval_dict_list):
    
    df = pd.DataFrame(eval_dict_list)
    
    # idxmax: returns index of row with max column value
    m_max_ap = df.iloc[df['average_precision'].idxmax()]['name']
    m_max_auc = df.iloc[df['auc'].idxmax()]['name']
    m_max_accuracy = df.iloc[df['accuracy'].idxmax()]['name']
    m_max_precision = df.iloc[df['precision'].idxmax()]['name']
    m_max_recall = df.iloc[df['recall'].idxmax()]['name']
    m_max_f1 = df.iloc[df['f1_score'].idxmax()]['name']
    
    print('VALIDATION DATA RESULTS')
    print('Highest AP: {}\nHighest AUC: {}\nHighest ACCURACY: {}\nHighest PRECISION: {}\nHighest RECALL: {}\nHighest F1: {}\n'.format(m_max_ap, m_max_auc, m_max_accuracy, m_max_precision, m_max_recall, m_max_f1))
    
    return df

In [24]:
# Define models
lr_dict = define_model(m_type='lr', name='lr', min_df=1, ngram_range=(1,1))

tree_dict = define_model(m_type='tree', name='tree', class_weight=None, min_df=1, ngram_range=(1,1))
tree_balanced_dict = define_model(m_type='tree', name='tree_balanced', class_weight='balanced', min_df=1, ngram_range=(1,1))
tree_min_df_dict = define_model(m_type='tree', name='tree_min_df', class_weight=None, min_df=2, ngram_range=(1,1))
tree__min_df_balanced_dict = define_model(m_type='tree', name='tree_min_df_balanced', class_weight='balanced', min_df=2, ngram_range=(1,1))

rf_default_dict = define_model(m_type='rf', name='rf_default', rf_params=rf_default, class_weight='balanced', min_df=1, ngram_range=(1,1))
rf_params_dict = define_model(m_type='rf', name='rf_params', save=True, rf_params=rf_params, diff_feats=diff_feats_rf, class_weight=class_weight_rf, \
                              min_df=min_df_rf, ngram_range=ngram_range_rf)

lgbm_default_dict = define_model(m_type='lgbm', name='lgbm_default', lgbm_params=lgbm_default, class_weight='balanced', min_df=1, ngram_range=(1,1))
lgbm_params_dict = define_model(m_type='lgbm', name='lgbm_params', lgbm_params=lgbm_params, diff_feats=diff_feats_lgbm, class_weight=class_weight_lgbm, \
                                min_df=min_df_lgbm, ngram_range=ngram_range_lgbm)

In [None]:
model_spec_list = [lr_dict, tree_dict, tree_balanced_dict, tree_min_df_dict, tree__min_df_balanced_dict, \
                   rf_default_dict, rf_params_dict, lgbm_default_dict, lgbm_params_dict]

model_list = []
eval_dict_list = []

for m in model_spec_list:
    model, evaluation_dict = train_evaluate_model(m['m_type'], m['name'], save=m['save'], rf_params=m['rf_params'], lgbm_params=m['lgbm_params'], \
                            diff_feats=m['diff_feats'], class_weight=m['class_weight'], min_df=m['min_df'], ngram_range=m['ngram_range'])
    model_list.append(model)
    eval_dict_list.append(evaluation_dict)

In [26]:
df_eval = compare_models(eval_dict_list)
df_eval

VALIDATION DATA RESULTS
Highest AP: lr
Highest AUC: lr
Highest ACCURACY: rf_params
Highest PRECISION: lr
Highest RECALL: tree_min_df
Highest F1: rf_params



Unnamed: 0,model_type,name,train_miscl_error,valid_miscl_error,average_precision,auc,accuracy,precision,recall,f1_score
0,lr,lr,0.03183,0.034398,0.600742,0.874607,0.965602,1.0,0.222222,0.363636
1,tree,tree,0.0,0.054054,0.251138,0.759783,0.945946,0.416667,0.555556,0.47619
2,tree,tree_balanced,0.0,0.07371,0.151554,0.696515,0.92629,0.285714,0.444444,0.347826
3,tree,tree_min_df,0.0,0.044226,0.322755,0.791417,0.955774,0.5,0.611111,0.55
4,tree,tree_min_df_balanced,0.0,0.076167,0.12785,0.668738,0.923833,0.259259,0.388889,0.311111
5,rf,rf_default,0.0,0.029484,0.566121,0.852614,0.970516,0.875,0.388889,0.538462
6,rf,rf_params,0.009284,0.022113,0.584657,0.842616,0.977887,0.909091,0.555556,0.689655
7,lgbm,lgbm_default,0.0,0.029484,0.514616,0.78599,0.970516,0.875,0.388889,0.538462
8,lgbm,lgbm_params,0.003979,0.02457,0.488398,0.802985,0.97543,0.9,0.5,0.642857


### Chosen model: rf_params (optimized random forest)

- Not overfitting training data: > 0 train misclassification error (ME)   
- Highest accuracy with validation data (lowest validation ME)  
- Highest F1-score with validation data
- Second highest recall with validation data

In [234]:
model = model_list[6]

### Save chosen model

TfIdf Vectorizers for chosen model were saved by running define_model with save=True

In [None]:
jb.dump(model, "./Deploy/model.pkl.z")

### Optimization functions: optimizing for highest F1-Score

In [200]:
# RF Optimization
def optimize_rf(params):
    
    print(params)
    n_estimators = params[0]
    max_depth = params[1]
    min_samples_split = params[2]
    min_samples_leaf = params[3]
    max_features = params[4] 
    bootstrap = params[5]
    class_weight = params[6]
    min_df = params[7] 
    ngram_range = (1, params[8])
    diff_feats = params[9]
    
    X_train = pd.DataFrame()
    X_valid = pd.DataFrame()
    
    for c in text_cols:
        text_features_train, text_features_valid = fit_transform_text(X_train_text[c], X_valid_text[c], min_df=min_df, ngram_range=ngram_range)
        X_train = hstack([X_train, text_features_train])
        X_valid = hstack([X_valid, text_features_valid])
    
    if diff_feats:
        X_train = hstack([X_train_diff, X_train])
        X_valid = hstack([X_valid_diff, X_valid])
    
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, \
                                   min_samples_leaf=min_samples_leaf, max_features=max_features, bootstrap=bootstrap, class_weight=class_weight, \
                                   n_jobs=6, random_state=0, )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    
    print("Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}".format(round(accuracy_score(y_valid, y_pred), 4), \
                round(precision_score(y_valid, y_pred), 4), round(recall_score(y_valid, y_pred), 4), round(f1_score(y_valid, y_pred), 4)))
    
    return -f1_score(y_valid, y_pred) 

In [None]:
space_rf = [(100,1000), # n_estimators
          (1, 10), # max_depth
          (2, 20), # min_samples_split
          (1, 10), # min_samples_leaf
          ('sqrt', 'log2', None), # max_features
          (True, False), # bootstrap
          ('balanced', None), # class_weight
          (1,5), # min_df
          (1,5), # ngram_range
          (True, False)] # diff_feats

# Sequential optimization using decision trees
opt_rf = forest_minimize(optimize_rf, space_rf, n_random_starts=20, n_calls=60, verbose=1, random_state=0)

In [202]:
opt_rf.x, opt_rf.fun

([763, 5, 20, 10, 'sqrt', False, 'balanced', 2, 1, False], -0.6896551724137931)

In [203]:
# LGBM Optimization
def optimize_lgbm(params):
    
    print(params)
    n_estimators = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4] 
    learning_rate = params[5]
    min_df = params[6] 
    ngram_range = (1, params[7])
    diff_feats = params[8]
    
    X_train = pd.DataFrame()
    X_valid = pd.DataFrame()
    
    for c in text_cols:
        text_features_train, text_features_valid = fit_transform_text(X_train_text[c], X_valid_text[c], min_df=min_df, ngram_range=ngram_range)
        X_train = hstack([X_train, text_features_train])
        X_valid = hstack([X_valid, text_features_valid])
    
    if diff_feats:
        X_train = hstack([X_train_diff, X_train])
        X_valid = hstack([X_valid_diff, X_valid])
    
    model = LGBMClassifier(n_estimators=n_estimators, max_depth=max_depth, min_child_samples=min_child_samples, \
                                     subsample=subsample, colsample_bytree=colsample_bytree, learning_rate=learning_rate, \
                                     class_weight='balanced', num_leaves=2**max_depth, n_jobs=6, random_state=0)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    
    print("Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}".format(round(accuracy_score(y_valid, y_pred), 4), \
                round(precision_score(y_valid, y_pred), 4), round(recall_score(y_valid, y_pred), 4), round(f1_score(y_valid, y_pred), 4)))
    
    return -f1_score(y_valid, y_pred)   

In [None]:
space_lgbm = [(100,1000), # n_estimators,
         (1, 10), # max_depth
         (1, 20), # min_child_samples
         (0.05, 1.), # subsample
         (0.05, 1.), # colsample_bytree
         (1e-3, 1e-1, 'log-uniform'), # learning_rate
         (1,5), # min_df
         (1,5), # ngram_range
         (True,False)] # diff_feats

# Sequential optimization using decision trees
opt_lgbm = forest_minimize(optimize_lgbm, space_lgbm, n_random_starts=20, n_calls=60, verbose=1, random_state=0)

In [205]:
opt_lgbm.x, opt_lgbm.fun

([650,
  6,
  11,
  0.11295588601409831,
  0.09968988985022398,
  0.0018192116907497335,
  4,
  5,
  False],
 -0.6428571428571429)