# Classify Industries with HTML information

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# sklearn classification
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

# sklearn general
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import (confusion_matrix, 
                             classification_report, 
                             f1_score, 
                             precision_score,
                             recall_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


from stop_words import get_stop_words
import ujson as json


import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from app.utils import (clean_boilerplate, 
                       clean_string,
                       clean_website, 
                       detect_XML, 
                       extract_tagtexts,
                       extract_tree,
                       extract_meta_informations,
                       reduce_whitespace,
                       remove_special_characters,
                       remove_tags)

Loading of Sequence Tagger Model failed!


In [2]:
%%time
train = pd.read_csv("../data/ctrain.csv", lineterminator="\n", nrows=100).fillna("")
#train = train.fillna("")
train.head(1)

CPU times: user 33.2 ms, sys: 7.69 ms, total: 40.9 ms
Wall time: 44.4 ms


Unnamed: 0,url,group_representative,group_representative_label,text,chtml,<meta>_title,<meta>_keywords,<meta>_description,<title>,<h1>,...,<h5>,<h6>,<b>,<strong>,<em>,<i>,<p>,<a>,<li>,country
0,http://12-18.com,30,"Leisure, Travel & Tourism",\n\nInvestment Management.\n\nEin glückliches ...,<html>\n<head>\n\t<title>12.18. Investment Man...,12.18. Investment Management - ANDERS. AUS PRI...,"12 18, 1218 Investment, 12 18 Invest, lim, lin...",Ein glückliches Investment ist das Resultat ha...,12.18. Investment Management - ANDERS. AUS PRI...,,...,Fleesensee Resort | Hotel Stadt HamburgAlte...,,,,,Wir nehmen den Datenschutz nach EU-DSGVO erns...,12.18. Investment Management GmbHKönigsallee ...,"Aktuelles Fleesensee Resort Kontakt Studie ""F...",Aktuelles Investment Management Hotel Collecti...,DE


In [3]:
train.shape

(100, 23)

In [12]:
def cleans(string):
    string = string.replace("\n", "")
    string = string.replace("\r", "")
    string = string.replace("\t", "")
    return string

In [20]:
chtml = train.iloc[21].chtml

In [17]:
for idx, row in train.iterrows():
    allt = extract_tagtexts(row.chtml, "p", no_inner=False)
    innert = extract_tagtexts(row.chtml, "p", no_inner=True)
    if len(cleans(allt)) - len(cleans(innert)) < 0:
        print(idx, len(cleans(allt)) - len(cleans(innert)))

0 -5
2 -14
7 -274
9 -2
11 -193
21 -1909
31 -766
37 -3273
38 -32
39 -146
43 -247
45 -84
48 -53
51 -4
61 -99
62 -328
63 -366
65 -2306
75 -905
82 -2
97 -72


In [5]:
from lxml.html import soupparser
c = "<xml><div>hallo<div>wie gehts</div></div></xml>"
extract_tagtexts(c, "div", no_inner=True)

'hallo wie gehts'

In [49]:
from lxml import html, etree

etree.tostring(extract_tree(chtml, "xml"), method="text")

UnicodeEncodeError: 'ascii' codec can't encode character '\u0308' in position 228: ordinal not in range(128)

In [62]:
divs = extract_tagtexts(c, "div")
cleans(divs)

'wie gehts hallowie gehts'

In [29]:
len(cleans(divs))

226955

In [41]:
cleans(train.iloc[0].text)

'Investment Management.Ein glückliches Investment ist das Resultat harter Arbeit und eherner Prinzipien. Wir von 12.18. Investment Management bieten unseren Kunden beides – ganz gleich, ob Sie Investor, Verkäufer oder Käufer einer Immobilie sind.Investment Management.Hospitality Management.Die 12.18. Hospitality Management GmbH dient der Sicherstellung des strategischen Erfolges der unternehmenseigenen Hotels durch Positionierung, Kommunikation, Verkauf und Nachhaltigkeit.Hospitality Management.Private Investments.Unser Sale-and-Lease-Back-Konzept mit 12.18.-Standard kombiniert die Vorteile einer attraktiven Kapitalanlage zur Vermögenssicherung für Generationen mit den Vorzügen der eigenen Ferienimmobilie.Private Investments.Hotel Collection.Wir sind stolz auf das, was wir bisher geleistet haben. Und es erfüllt uns mit Neugier, welche spannenden Herausforderungen die Zukunft für uns bereithält. Willkommen in unserer Hotelkollektion.Hotel Collection.Wer wir sind und was wir tun.INVESTME

In [39]:
import lxml.html
document = lxml.html.document_fromstring(chtml)
# internally does: etree.XPath("string()")(document)
cleans(document.text_content())

'12.18. Investment Management - ANDERS. AUS PRINZIP.        Investment Management.Ein glückliches Investment ist das Resultat harter Arbeit und eherner Prinzipien. Wir von 12.18. Investment Management bieten unseren Kunden beides – ganz gleich, ob Sie Investor, Verkäufer oder Käufer einer Immobilie sind.Investment Management.Hospitality Management.Die 12.18. Hospitality Management GmbH dient der Sicherstellung des strategischen Erfolges der unternehmenseigenen Hotels durch Positionierung, Kommunikation, Verkauf und Nachhaltigkeit.Hospitality Management.Private Investments.Unser Sale-and-Lease-Back-Konzept mit 12.18.-Standard kombiniert die Vorteile einer attraktiven Kapitalanlage zur Vermögenssicherung für Generationen mit den Vorzügen der eigenen Ferienimmobilie.Private Investments.Hotel Collection.Wir sind stolz auf das, was wir bisher geleistet haben. Und es erfüllt uns mit Neugier, welche spannenden Herausforderungen die Zukunft für uns bereithält. Willkommen in unserer Hotelkollek

In [30]:
d = train[train["<di>"] != ""]
d.head(1)

Unnamed: 0,url,group_representative,group_representative_label,text,chtml,<meta>_title,<meta>_keywords,<meta>_description,<title>,<h1>,...,<h5>,<h6>,<b>,<strong>,<em>,<i>,<p>,<a>,<li>,country
0,http://12-18.com,30,"Leisure, Travel & Tourism",\n\nInvestment Management.\n\nEin glückliches ...,<html>\n<head>\n\t<title>12.18. Investment Man...,12.18. Investment Management - ANDERS. AUS PRI...,"12 18, 1218 Investment, 12 18 Invest, lim, lin...",Ein glückliches Investment ist das Resultat ha...,12.18. Investment Management - ANDERS. AUS PRI...,,...,Biggesee Investment Management | Herr John ...,,,,,Wir nehmen den Datenschutz nach EU-DSGVO erns...,12.18. Investment Management GmbHKönigsallee ...,Projekte Biggesee karriere Investment Managem...,Aktuelles Projekte Home Hospitality Management...,DE


In [31]:
len(d["<a>"].value_counts())

29225

In [26]:
train.columns

Index(['url', 'group_representative', 'group_representative_label', 'text',
       'chtml', '<meta>_title', '<meta>_keywords', '<meta>_description',
       '<title>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<b>',
       '<strong>', '<em>', '<i>', '<p>', '<a>', '<li>', 'country'],
      dtype='object')

In [31]:
train = pd.read_csv("../data/ctrain.csv", lineterminator='\n').fillna("")
train.head(1)

Unnamed: 0,url,group_representative,group_representative_label,text,chtml,<meta>_title,<meta>_keywords,<meta>_description,<title>,<h1>,...,<h5>,<h6>,<b>,<strong>,<em>,<i>,<p>,<a>,<li>,country
0,http://12-18.com,30,"Leisure, Travel & Tourism",\n\nInvestment Management.\n\nEin glückliches ...,<html>\n<head>\n\t<title>12.18. Investment Man...,12.18. Investment Management - ANDERS. AUS PRI...,"12 18, 1218 Investment, 12 18 Invest, lim, lin...",Ein glückliches Investment ist das Resultat ha...,12.18. Investment Management - ANDERS. AUS PRI...,,...,Fleesensee Resort | Hotel Stadt HamburgAlte...,,,,,Wir nehmen den Datenschutz nach EU-DSGVO erns...,12.18. Investment Management GmbHKönigsallee ...,"Aktuelles Fleesensee Resort Kontakt Studie ""F...",Aktuelles Investment Management Hotel Collecti...,DE


In [6]:
train.columns

Index(['url', 'group_representative', 'group_representative_label', 'text',
       'chtml', '<meta>_title', '<meta>_keywords', '<meta>_description',
       '<title>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<b>',
       '<strong>', '<em>', '<i>', '<p>', '<a>', '<li>', 'country'],
      dtype='object')

In [33]:
train2 = train.head()

In [46]:
!pip install flair --upgrade



In [28]:
from itertools import combinations
import itertools

d1 = {
        "plain_text": 1,
        "a_text": 1,
        "b_text": 1,
        "em_text": 1,
        "h1_text": 1,
        "h2_text": 1,
        "h3_text": 1,
        "h4_text": 1,
        "h5_text": 1,
        "h6_text": 1,
        "i_text": 1,
        "li_text": 1,
        "meta_description_text": 1,
        "meta_keywords_text": 1,
        "meta_title_text": 1,
        "p_text": 1,
        "strong_text": 1,
        "title_text": 1,
    }

all_combinations = []
for r in range(2, len(d1.keys()) + 1):

    combinations_object = itertools.combinations(d1.keys(), r)
    combinations_list = list(combinations_object)
    all_combinations += combinations_list

len(all_combinations)

262125

In [41]:
%%time

TEXT_COL = "text"
CLASS_COL = "group_representative"

train_text = train[TEXT_COL] + train["meta"]
train_labels = train[CLASS_COL].values

vectorizer = CountVectorizer(max_df=MAX_DOCUMENT_FREQUENCY,
                             lowercase=LOWERCASE,
                             max_features=MAX_FEATURES,
                             ngram_range=NGRAM_RANGE,
                             stop_words=STOP_WORDS,
                            tokenizer=tokenizing_html)
transformer = TfidfTransformer()

vector = vectorizer.fit_transform(train_text)
train_vector = transformer.fit_transform(vector)


test = pd.read_csv(TEST_PATH_CSV)
    
test_vector = vectorizer.transform(test[TEXT_COL].values)
test_vector = transformer.transform(test_vector)
test_labels = test[CLASS_COL].values


print("LSVM CLF", "\n-------------------------")
# training
clf = LinearSVC()
clf.fit(train_vector, train_labels)

# prediction
train_preds = clf.predict(test_vector)

# evaluation
precision = precision_score(test_labels, train_preds, average="macro", zero_division=0)
recall = recall_score(test_labels, train_preds, average="macro", zero_division=0)
f1 = f1_score(test_labels, train_preds, average="macro", zero_division=0)
clf2_f1 = np.round(f1, decimals=4)
clf2_precision = np.round(precision, decimals=4)

print(np.round(precision, decimals=4), "\tPrecision")
print(np.round(recall, decimals=4), "\tRecall")
print(np.round(f1, decimals=4), "\tF1")
print()

clf2_report = classification_report(test_labels, 
                                   train_preds, 
                                   target_names = np.unique(test[CLASS_NAMES]),
                                   zero_division = 0)

LSVM CLF 
-------------------------
0.6048 	Precision
0.358 	Recall
0.4111 	F1

CPU times: user 2.72 s, sys: 28.9 ms, total: 2.75 s
Wall time: 2.75 s


In [40]:
train.meta.iloc[1]

'Für Unternehmen, Agenturen und Entwickler » Online Marketing Beratung & Optimierung: SEO, Social-Media, Online-Werbung, Webentwicklung ➥ Alle Infos hier!'

In [39]:
train_text[1]

'STENLE ⇗ Online Marketing für Unternehmen und Agenturen\n\nHome\nWeb Analytics\n\nBran\xadchen\xadum\xadfeld – Analyse\nSEO Ran\xadking\xadana\xadlyse\nOffpage-SEO Analyse\nWett\xadbe\xadwerber Analyse\n\n\nOnline Marketing\nWeb-Entwicklung\nRund um SEO\nSTENLE GmbH\nKontakt\n\n\nOnline-Mar\xadke\xadting\nOptimierung\n\nOnline Mar\xadke\xadting Beratung\n\nOnline\xadmarkt-For\xadschung\n\nWeb-Ent\xadwick\xadlungen\n\nHilfe bei\n\nKun\xadden\xadzu\xadfie\xadden\xadheit\n\nWarum mit STENLE?\n\nHilfe bei\n\nKun\xadden\xadzu\xadfie\xadden\xadheit\n\nWarum mit STENLE?\n\nKun\xadden\xadser\xadvice\n\n★★★★★ 4,82 von 5\n\nSorgfalt\n\n★★★★★ 5,00 von 5\n\nZusatz\xadleis\xadtungen\n\n★★★★★ 5,00 von 5\n\nFle\xadxi\xadbi\xadlität\n\n★★★★★ 5,00 von 5\n\nErrei\xadchen Sie Ihre Ziele mit  STENLE\n\nSebas\xadtian Dietz\n\nImmer wieder sprechen Inter\xades\xadsenten in unseren Erst\xadge\xadsprä\xadchen von einem Gefühl der Ori\xaden\xadtie\xadrungs\xadlo\xadsig\xadkeit bei der Auswahl des rich\xadtige

- plain: 0.3994 F1
- html: 0.2784 	F1
- plain + meta: 0.4111 	F1
- plain + 2 * meta: 0.4063 	F1
- html + meta: 0.2784 	F1

In [None]:
try:
        if markup_type in ["html", "xml"]:
            tree = extract_tree(string, markup_type)
            select = CSSSelector(tags, translator=markup_type)
        else:
            try:
                print("klappt")
                tree = extract_tree(string, "html")
                select = CSSSelector(tags, translator=markup_type)
            except:
                tree = extract_tree(string, "xml")
                select = CSSSelector(tags, translator=markup_type)
                
            
            
        results = [element.get('content') for element in select(tree)]
        results = [x for x in results if x is not None]
        return " ".join(list(set(results)))
    except:
        return [""]

In [2]:
import re
from typing import Dict, List, Optional, Tuple, Union
from unicodedata import normalize

import lxml
from lxml.cssselect import CSSSelector
from lxml.html.clean import Cleaner
from lxml import html, etree
import numpy as np
import pandas as pd

In [70]:
def extract_tree(string: str, markup_type: str) -> lxml.etree._Element:
    """ Extracts tree from string.

    Parameters
    ----------
    string : str
        String which contains the HTML, XHTML or XML.
    markup_type : str, default=None
        Indicate the markup type ('xml' or another).

    Returns
    -------
    tree : lxml.etree._Element
        Extracted lxml.etree Element.
    """
    # XML
    if markup_type == "xml":
        parser = etree.XMLParser(
            encoding="utf-8", ns_clean=True, recover=True, remove_comments=True
        )
        tree = etree.fromstring(string.encode("utf-8"), parser=parser)
    # HTML and XHTML
    else:
        parser = html.HTMLParser(encoding="utf-8")
        tree = html.fromstring(string.encode("utf-8"), parser=parser)
    return tree


def extract_meta_informations(string: str, meta_type: str) -> list:
    """ Extracts meta information from 'title'-, 'keyword'- and description'- 
        meta elements (by choice) and returns the content in a list.
    """
    # title already in text
    if meta_type == "title":
        tags = ['meta[property="og:title"]', 'meta[name="title"]']
    elif meta_type == "keywords":
        tags = ['meta[property="og:keyword"]', 'meta[name="keyword"]']
    elif meta_type == "description":
        tags = ['meta[property="og:description"]', 'meta[name="description"]']
    
    else:
        tags = ['meta[property="og:description"]',
                'meta[name="description"]',
                'meta[property="og:keyword"]',
                'meta[name="keyword"]', 
                'meta[property="og:title"]', 
                'meta[name="title"]']

    tags = ", ".join(tags)

    
    try:      
        tree = extract_tree(string, "html")
        select = CSSSelector(tags, translator="html")
    except:
        tree = extract_tree(string, "xml")
        select = CSSSelector(tags, translator="xml")
        
    results = [element.get('content') for element in select(tree)]
    results = [x for x in results if x is not None]
    return " ".join(list(set(results)))

In [71]:
def detect_XML(string: str) -> str:
    """ Detect XML by XML declaration and returns a markup type string."""
    if string.startswith("<?xml"):
        return "xml"
    else:
        return "html"

In [73]:
import requests

htmlf = requests.get("https://www.uni-wuerzburg.de/startseite/").text
extract_meta_informations(htmlf, "keywords")

hier


''

In [27]:
print(html[:6000])

<!DOCTYPE html>
<html lang="de" class="fontawesome-i2svg-pending">
<head>
  <!-- 0007: production -->
      <meta charset="utf-8">
    <title>Leads & Kontakte automatisch erfassen | snapADDY GmbH</title>
    <base href="https://www.snapaddy.com/">

          <meta name="robots" content="index,follow">
      <meta name="description" content="snapADDY ist eine vertriebsunterstützende Software zur Recherche und Direktübernahme von B2B-Kontakten in CRM- und ERP-Systeme. Jetzt 14 Tage testen!">
      <meta name="generator" content="Contao Open Source CMS">
    
    <!-- Google Verification -->
    <meta name="google-site-verification" content="f1DkKYnmHR6JNaLX3-MexQNTDfXgrQ6p6kUKgLlbaX4">
    <meta name="google-site-verification" content="7o3g9lt0dest7muyrG5zEda_iX8D0DsjKEJQwhzylU8">

    <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">
                <link rel="alternate" hreflang="de" href="https://www.snapaddy.com/de/">
<link rel="alternate" hrefl

## Tests

- Evaluation metric: **F1 Scores**
- TF-IDF Vectorizer
    - kein lowercase
    - stop words werden entfernt
    - keine max features
- Top $n$ classes = most frequent classes
- CLEAN HTML auch für Test Set (ansonsten unglaublich schlechte Accuracy und etwas sinnlos)


#### Label: `group_representatives`

| Experiment | SGD F1 (Precision) | LSVM F1 (Precision) |
| ---------- |:-----:| ----:|
| HTML (10000 features) | **0.5292** (0.5962) | **0.5493** (0.6371) |
| HTML (kept stop words) (10000 features) | **0.5268** (0.5845) | **0.5473** (0.6439) |
| HTML (10000 features) ((1, 3) ngrams) | **0.4035** (0.463) | **0.4188** (0.5345) |
| HTML (10000 features) ((2, 2) ngrams) | **0.2442** (0.2787) | **0.252** (0.3146) |
| ---------- |-----| ----|
| *ALL LANGS* HTML (kept stop words) (10000 features) | **0.5781** (0.6464) | **0.6406** (0.7024) |
| ---------- |-----| ----|
| Plain Text (kept stop words) (10000 features) (10000 rows) | **0.5841** (0.6301) | **0.5778** (0.6257) |
| Plain Text + Meta (kept stop words) (10000 features) (10000 rows) | **0.5832** (0.6197) | **0.5826** (0.6279) |

## Paths

In [2]:
DATA_DIR_PATH = "../data/"
LANG = ""
ROWS = "_10000"

INDUSTRIES_PATH_CSV = DATA_DIR_PATH + "industries.csv"
TRAIN_PATH_CSV = DATA_DIR_PATH + "train" + LANG + ROWS + ".csv"
TEST_PATH_CSV = DATA_DIR_PATH + "test" + LANG + ROWS +".csv"

## Load train csv

In [3]:
%%time
train = pd.read_csv(TRAIN_PATH_CSV)
train = train.fillna("")

CPU times: user 1.55 s, sys: 421 ms, total: 1.97 s
Wall time: 1.97 s


In [4]:
train.head(1)

Unnamed: 0,url,industry,industry_label,group,group_representative,html,text,source,country,group_representative_label,meta
0,http://www.autarctech.de,144,Renewables & Environment,"gov, man, org",144,<html> <head> ...,Home\n\nMenü\n\n\nShop\nHome\nProdukte\nOur St...,xing,DE,Renewables & Environment,Effizenz bei der Stromspeicherung in Batterien...


In [5]:
train.shape

(8000, 11)

## Hyperparameters

In [6]:
# "text" or "html"
TEXT_COL = "text"

# "group_representative", "group_representative_label", "industry", "industry_label" or "group"
CLASS_COL = "group_representative"
CLASS_NAMES = "group_representative_label"

MAX_DOCUMENT_FREQUENCY = 1.
MAX_FEATURES = 10000
NGRAM_RANGE = (1,1)
LOWERCASE = False
#STOP_WORDS = get_stop_words("de")
STOP_WORDS = None

TAG_LIST = ['a', 'b', 'em', 'h1', 'h2', 'h3', 'i', 'li', 'p', 'strong', 'title']

## Add Meta-Tag information to plain text

In [7]:
train["text"] = train["text"] + train["meta"]

## Trim HTML

In [8]:
train2 = train.head(10)

In [10]:
%%time
#train["html"] = train["html"].apply(lambda x: trim_html(x, tag_list = TAG_LIST, tagless_output_string=True))

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 4.53 µs


### Vectorizing text

In [54]:
%%time

train_text = train[TEXT_COL]
train_labels = train[CLASS_COL].values

vectorizer = CountVectorizer(max_df=MAX_DOCUMENT_FREQUENCY,
                             lowercase=LOWERCASE,
                             max_features=MAX_FEATURES,
                             ngram_range=NGRAM_RANGE,
                             stop_words=STOP_WORDS,
                            tokenizer=tokenizing_html)
transformer = TfidfTransformer()

vector = vectorizer.fit_transform(train_text)
train_vector = transformer.fit_transform(vector)

CPU times: user 8.46 s, sys: 63.5 ms, total: 8.52 s
Wall time: 8.51 s


# Test Dataset

In [55]:
%%time
test = pd.read_csv(TEST_PATH_CSV)
    
test_vector = vectorizer.transform(test[TEXT_COL].values)
test_vector = transformer.transform(test_vector)
test_labels = test[CLASS_COL].values

CPU times: user 2.18 s, sys: 28.6 ms, total: 2.21 s
Wall time: 2.21 s


# SGD

In [56]:
%%time
print("SGD CLF", "\n-------------------------")
# training
clf = SGDClassifier()
clf.fit(train_vector, train_labels)

# prediction
train_preds = clf.predict(test_vector)

# evaluation
precision = precision_score(test_labels, train_preds, average="macro", zero_division=0)
recall = recall_score(test_labels, train_preds, average="macro", zero_division=0)
f1 = f1_score(test_labels, train_preds, average="macro", zero_division=0)
clf1_f1 = np.round(f1, decimals=4)
clf1_precision = np.round(precision, decimals=4)

print(np.round(precision, decimals=4), "\tPrecision")
print(np.round(recall, decimals=4), "\tRecall")
print(np.round(f1, decimals=4), "\tF1")
print()

clf1_report = classification_report(test_labels, 
                                   train_preds, 
                                   target_names = np.unique(test[CLASS_NAMES]), 
                                   zero_division = 0)

SGD CLF 
-------------------------
0.6301 	Precision
0.5634 	Recall
0.5841 	F1

CPU times: user 1.09 s, sys: 0 ns, total: 1.09 s
Wall time: 1.08 s


# LSVM

In [57]:
%%time
print("LSVM CLF", "\n-------------------------")
# training
clf = LinearSVC()
clf.fit(train_vector, train_labels)

# prediction
train_preds = clf.predict(test_vector)

# evaluation
precision = precision_score(test_labels, train_preds, average="macro", zero_division=0)
recall = recall_score(test_labels, train_preds, average="macro", zero_division=0)
f1 = f1_score(test_labels, train_preds, average="macro", zero_division=0)
clf2_f1 = np.round(f1, decimals=4)
clf2_precision = np.round(precision, decimals=4)

print(np.round(precision, decimals=4), "\tPrecision")
print(np.round(recall, decimals=4), "\tRecall")
print(np.round(f1, decimals=4), "\tF1")
print()

clf2_report = classification_report(test_labels, 
                                   train_preds, 
                                   target_names = np.unique(test[CLASS_NAMES]),
                                   zero_division = 0)

LSVM CLF 
-------------------------
0.6257 	Precision
0.5523 	Recall
0.5778 	F1

CPU times: user 2.44 s, sys: 8.03 ms, total: 2.45 s
Wall time: 2.44 s


## Summary: Classification Results

In [58]:
result = "| "

if TEXT_COL == "text":
    result += "Plain Text"
else:
    result += "HTML"
    
if STOP_WORDS is None:
    result += " (kept stop words)"
    
if MAX_FEATURES is None:
    result += " (all features)"
else:
    result += f" ({MAX_FEATURES} features)"
    
if NGRAM_RANGE != (1,1):
    result += f" ({NGRAM_RANGE} ngrams)"
    
if ROWS:
    result += f" ({ROWS[1:]} rows)"
    
            
result += f" | **{clf1_f1}** ({clf1_precision}) | **{clf2_f1}** ({clf2_precision}) |"
print(CLASS_COL)
print()
print(result)

group_representative

| Plain Text (kept stop words) (10000 features) (10000 rows) | **0.5841** (0.6301) | **0.5778** (0.6257) |


# Confusion Matrix

TODO: label und text names und so; allg. änderungen von oben hier ergänzen

In [None]:
NORMALIZE_CM = True
INDUSTRY_TRESHOLD = 250
PLT_SCALING_FACTOR = 0.8

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

filtered_train = train.groupby(CLASS_COL).filter(lambda x: len(x)>INDUSTRY_TRESHOLD)
remaining_industries = filtered_train[CLASS_NAMES].drop_duplicates().tolist()


cnf_matrix = confusion_matrix(test_labels, train_preds)

classes = train[CLASS_COL].drop_duplicates().tolist()

cnf_df = pd.DataFrame(cnf_matrix, index=classes, columns=classes)
cnf_df = cnf_df[remaining_industries]
cnf_df = cnf_df.loc[remaining_industries]

In [None]:
plt.figure(figsize=(10*PLT_SCALING_FACTOR, 8*PLT_SCALING_FACTOR))

if NORMALIZE_CM:
    normalized_cnf_df = cnf_df.astype('float') / cnf_df.sum(axis=1)[:, np.newaxis]
    sns.heatmap(normalized_cnf_df, annot=True, cmap=sns.color_palette("Blues"), fmt='.2f')
else:
    sns.heatmap(cnf_df, annot=True, cmap=sns.color_palette("Blues"), fmt='g')
plt.tight_layout()