# Classify Industries with HTML information

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# sklearn classification
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

# sklearn general
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import (confusion_matrix, 
                             classification_report, 
                             f1_score, 
                             precision_score,
                             recall_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


from stop_words import get_stop_words
import ujson as json


import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from app.utils import (clean_boilerplate, 
                       clean_string,
                       clean_website, 
                       detect_XML, 
                       extract_meta_informations,
                       reduce_whitespace,
                       remove_tags, 
                       tokenizing_html, 
                       trim_html)

## Übersicht

[1. Idee](#1)<br>
[2. Ansätze](#1)<br>

## 1. Grundidee <a class="anchor" id="1"/>

- <u>Aktuell</u>: Textklassifizierung von Industries (multi-class)
- HTML Informationen nützlich für Klassifizierung von Industries?
    - primär strukturelles Markup
    - keine visuellen Informationen/Ansätze
- HTML Seite:

    ![html_page](img/html_page.PNG)


- Aktuelle Trainingsgrundlage (plain text + title aus MetaDaten (wenn vorhanden)):
    ```python3
    'Home\n\nMenü\n\n\nShop\nHome\nProdukte\nOur Story\nReferenzen\nTechnologie\nDokumente\nKontakt\nFAQ\nBlog\n\n\nTelefon: +49 (0)9805 933 43 50\n\nMenü\n\n\nShop\nHome\nProdukte\nOur Story\nReferenzen\nTechnologie\nDokumente\nKontakt\nFAQ\nBlog\n\n\nEst.\n\n2010\n\nActive Balancing\n\nGeben Sie sich bei Ihrem Stromspeicher nicht mit weniger zufrieden:\nHoher Wirkungsgrad und längere Lebensdauer\nerreichen Sie mit unsereR Active Balancing Technologie...'
    ```

- Viele (potentiell) nützliche Informationen fallen weg
- Neue Trainingsgrundlage: Plain Text + **HTML Informationen**
- Frage: <u>Welche</u> Informationen sind relevant? Was ist <u>Boilerplate</u>? Was hilft <u>nicht</u> bei der Klassifizierung?

## 2. Ansätze <a class="anchor" id="2"/>

#### 2.1. Informationen aus `<meta>`-Elementen verwenden

- Welche?
    - `keywords`, `description`
    - eigentlich auch `title`, ist jedoch schon im Plain Text
- Warum?
    - reich an Informationen
    - kurz und knapp
- Wie?
    - Plain Text + Plain Text von `<meta>`-Elementen
    - (*Clean HTML + Plain Text von `<meta>`-Elementen*)

#### 2.2. Nur ausgewählte Inhalte von HTML-Elemente beibehalten

- Welche?
    - `<title>`
    - `<h1>`, `<h2>`, `<h3>`
    - `<b>`, `<strong>`, `<em>`, `<i>`
    - `<p>`
    - `<a>`
    - `<li>`
- Warum?
    - informationstragende Elemente
    - nach:
        - ÖZEL 2011: "A web page classification system based on a genetic algorithm using tagged-terms as features"
        - LEE 2015: "Web page classification based on a simplified swarm optimization."
        - **These**: "Beobachtungen zeigen, dass die meisten der wichtigen domänenspezifischen Begriffe unter diesen Tags erscheinen und"
- Wie?
    - Plain Text ohne Textinhalte aus anderen Elementen
    - (*Clean HTML ohne Textinhalte aus anderen Elementen*)
   
   
   
<br><br><br><br>   
   
#### 2.?. Anchor Text und Nachbarn hervorheben

TODO: Qi nochmal lesen und gucken, ob und wie genau das funktioniert

- Welche?
    - Anchor Text (= klickbarer Text in einem Hyperlink) und Nachbarn extrahieren
- Warum?

## 3. Pipeline

`group_representative_label` hinzufügen<br>
&nbsp;&nbsp;  &dArr;<br>
`<meta>` Elemente extrahieren und als eigene Column<br>
&nbsp;&nbsp;  &dArr;<br>
`chtml` (= clean html) Column erstellen (ohne Boilerplate HTML, ohne Meta Informationen)<br>
&nbsp;&nbsp;  &dArr;<br>
TODO<br>
&nbsp;&nbsp;  &dArr;<br>
Train-Test-Split<br>
&nbsp;&nbsp;  &dArr;<br>
Vektorisierung von `train` mit TF-IDF<br>
&nbsp;&nbsp;  &dArr;<br>
Klassifizierung<br>
&nbsp;&nbsp;  &dArr;<br>
Evaluation

In [2]:
s = """      median_time  median_process_time  median_performance_time  default  enable_cpu_mem_arena  enable_mem_pattern             execution_mode                 execution_order graph_optimization_level inter_op_num_threads intra_op_num_threads  use_deterministic_compute
0       0.134089             0.134081                 0.134094    False                  True                True  execution_mode_sequential         execution_order_default              disable_all                    0                    0                       True
1       0.134324             0.134204                 0.134330    False                  True                True  execution_mode_sequential         execution_order_default              disable_all                    0                    0                      False
2       0.134481             0.134471                 0.134487    False                  True                True  execution_mode_sequential         execution_order_default             enable_basic                    0                    0                       True
3       0.134709             0.134714                 0.134714    False                  True                True  execution_mode_sequential         execution_order_default             enable_basic                    0                    0                      False
4       0.132064             0.132069                 0.132070    False                  True                True  execution_mode_sequential         execution_order_default          enable_extended                    0                    0                       True
5       0.131872             0.131878                 0.131879    False                  True                True  execution_mode_sequential         execution_order_default          enable_extended                    0                    0                      False
6       0.133006             0.133012                 0.133013    False                  True                True  execution_mode_sequential         execution_order_default               enable_all                    0                    0                       True
7       0.131654             0.131639                 0.131659     True                  True                True  execution_mode_sequential         execution_order_default               enable_all                    0                    0                      False
8       0.134118             0.134111                 0.134124    False                  True                True  execution_mode_sequential  execution_order_priority_based              disable_all                    0                    0                       True
9       0.134276             0.134263                 0.134282    False                  True                True  execution_mode_sequential  execution_order_priority_based              disable_all                    0                    0                      False
10      0.134151             0.134150                 0.134157    False                  True                True  execution_mode_sequential  execution_order_priority_based             enable_basic                    0                    0                       True
11      0.134330             0.134324                 0.134335    False                  True                True  execution_mode_sequential  execution_order_priority_based             enable_basic                    0                    0                      False
12      0.131724             0.131724                 0.131730    False                  True                True  execution_mode_sequential  execution_order_priority_based          enable_extended                    0                    0                       True
13      0.131622             0.131628                 0.131628    False                  True                True  execution_mode_sequential  execution_order_priority_based          enable_extended                    0                    0                      False
14      0.131840             0.131834                 0.131847    False                  True                True  execution_mode_sequential  execution_order_priority_based               enable_all                    0                    0                       True
15      0.131876             0.131882                 0.131882    False                  True                True  execution_mode_sequential  execution_order_priority_based               enable_all                    0                    0                      False
16      0.134376             0.134365                 0.134381    False                  True                True    execution_mode_parallel         execution_order_default              disable_all                    0                    0                       True
17      0.134442             0.134447                 0.134448    False                  True                True    execution_mode_parallel         execution_order_default              disable_all                    0                    0                      False
18      0.134179             0.134174                 0.134185    False                  True                True    execution_mode_parallel         execution_order_default             enable_basic                    0                    0                       True
19      0.134569             0.134575                 0.134575    False                  True                True    execution_mode_parallel         execution_order_default             enable_basic                    0                    0                      False
20      0.131785             0.131851                 0.131790    False                  True                True    execution_mode_parallel         execution_order_default          enable_extended                    0                    0                       True
21      0.132013             0.132018                 0.132018    False                  True                True    execution_mode_parallel         execution_order_default          enable_extended                    0                    0                      False
22      0.131812             0.131817                 0.131818    False                  True                True    execution_mode_parallel         execution_order_default               enable_all                    0                    0                       True
23      0.132037             0.132039                 0.132043    False                  True                True    execution_mode_parallel         execution_order_default               enable_all                    0                    0                      False
24      0.134193             0.134188                 0.134199    False                  True                True    execution_mode_parallel  execution_order_priority_based              disable_all                    0                    0                       True
25      0.134582             0.134583                 0.134588    False                  True                True    execution_mode_parallel  execution_order_priority_based              disable_all                    0                    0                      False
26      0.134102             0.134101                 0.134108    False                  True                True    execution_mode_parallel  execution_order_priority_based             enable_basic                    0                    0                       True
27      0.134358             0.134382                 0.134365    False                  True                True    execution_mode_parallel  execution_order_priority_based             enable_basic                    0                    0                      False
28      0.131987             0.131987                 0.131993    False                  True                True    execution_mode_parallel  execution_order_priority_based          enable_extended                    0                    0                       True
29      0.132225             0.132225                 0.132231    False                  True                True    execution_mode_parallel  execution_order_priority_based          enable_extended                    0                    0                      False
30      0.132032             0.132036                 0.132037    False                  True                True    execution_mode_parallel  execution_order_priority_based               enable_all                    0                    0                       True
31      0.131955             0.131948                 0.131961    False                  True                True    execution_mode_parallel  execution_order_priority_based               enable_all                    0                    0                      False
32      0.134413             0.134418                 0.134419    False                  True               False  execution_mode_sequential         execution_order_default              disable_all                    0                    0                       True
33      0.134618             0.134624                 0.134625    False                  True               False  execution_mode_sequential         execution_order_default              disable_all                    0                    0                      False
34      0.134634             0.134634                 0.134640    False                  True               False  execution_mode_sequential         execution_order_default             enable_basic                    0                    0                       True
35      0.134583             0.134583                 0.134590    False                  True               False  execution_mode_sequential         execution_order_default             enable_basic                    0                    0                      False
36      0.131830             0.131835                 0.131836    False                  True               False  execution_mode_sequential         execution_order_default          enable_extended                    0                    0                       True
37      0.131685             0.131691                 0.131692    False                  True               False  execution_mode_sequential         execution_order_default          enable_extended                    0                    0                      False
38      0.131912             0.131916                 0.131919    False                  True               False  execution_mode_sequential         execution_order_default               enable_all                    0                    0                       True
39      0.131781             0.131781                 0.131787    False                  True               False  execution_mode_sequential         execution_order_default               enable_all                    0                    0                      False
40      0.134430             0.134436                 0.134437    False                  True               False  execution_mode_sequential  execution_order_priority_based              disable_all                    0                    0                       True
41      0.134663             0.134662                 0.134669    False                  True               False  execution_mode_sequential  execution_order_priority_based              disable_all                    0                    0                      False
42      0.134346             0.134291                 0.134351    False                  True               False  execution_mode_sequential  execution_order_priority_based             enable_basic                    0                    0                       True
43      0.134416             0.134407                 0.134422    False                  True               False  execution_mode_sequential  execution_order_priority_based             enable_basic                    0                    0                      False
44      0.131805             0.131810                 0.131811    False                  True               False  execution_mode_sequential  execution_order_priority_based          enable_extended                    0                    0                       True
45      0.132094             0.132092                 0.132100    False                  True               False  execution_mode_sequential  execution_order_priority_based          enable_extended                    0                    0                      False
46      0.131838             0.131838                 0.131844    False                  True               False  execution_mode_sequential  execution_order_priority_based               enable_all                    0                    0                       True
47      0.131829             0.131824                 0.131834    False                  True               False  execution_mode_sequential  execution_order_priority_based               enable_all                    0                    0                      False
48      0.134480             0.134473                 0.134486    False                  True               False    execution_mode_parallel         execution_order_default              disable_all                    0                    0                       True
49      0.134638             0.134636                 0.134645    False                  True               False    execution_mode_parallel         execution_order_default              disable_all                    0                    0                      False
50      0.134383             0.134388                 0.134389    False                  True               False    execution_mode_parallel         execution_order_default             enable_basic                    0                    0                       True
51      0.134425             0.134439                 0.134432    False                  True               False    execution_mode_parallel         execution_order_default             enable_basic                    0                    0                      False
52      0.131790             0.131782                 0.131796    False                  True               False    execution_mode_parallel         execution_order_default          enable_extended                    0                    0                       True
53      0.132056             0.132061                 0.132062    False                  True               False    execution_mode_parallel         execution_order_default          enable_extended                    0                    0                      False
54      0.131903             0.131906                 0.131909    False                  True               False    execution_mode_parallel         execution_order_default               enable_all                    0                    0                       True
55      0.132975             0.133065                 0.132981    False                  True               False    execution_mode_parallel         execution_order_default               enable_all                    0                    0                      False
56      0.136097             0.136081                 0.136102    False                  True               False    execution_mode_parallel  execution_order_priority_based              disable_all                    0                    0                       True
57      0.136088             0.136094                 0.136095    False                  True               False    execution_mode_parallel  execution_order_priority_based              disable_all                    0                    0                      False
58      0.136143             0.136143                 0.136149    False                  True               False    execution_mode_parallel  execution_order_priority_based             enable_basic                    0                    0                       True
59      0.135039             0.135039                 0.135045    False                  True               False    execution_mode_parallel  execution_order_priority_based             enable_basic                    0                    0                      False
60      0.132520             0.132515                 0.132526    False                  True               False    execution_mode_parallel  execution_order_priority_based          enable_extended                    0                    0                       True
61      0.132526             0.132602                 0.132532    False                  True               False    execution_mode_parallel  execution_order_priority_based          enable_extended                    0                    0                      False
62      0.132534             0.132535                 0.132541    False                  True               False    execution_mode_parallel  execution_order_priority_based               enable_all                    0                    0                       True
63      0.132312             0.132305                 0.132318    False                  True               False    execution_mode_parallel  execution_order_priority_based               enable_all                    0                    0                      False
64      0.136373             0.136379                 0.136379    False                 False                True  execution_mode_sequential         execution_order_default              disable_all                    0                    0                       True
65      0.136020             0.136025                 0.136026    False                 False                True  execution_mode_sequential         execution_order_default              disable_all                    0                    0                      False
66      0.136050             0.136055                 0.136056    False                 False                True  execution_mode_sequential         execution_order_default             enable_basic                    0                    0                       True
67      0.136476             0.136475                 0.136481    False                 False                True  execution_mode_sequential         execution_order_default             enable_basic                    0                    0                      False
68      0.134379             0.134385                 0.134385    False                 False                True  execution_mode_sequential         execution_order_default          enable_extended                    0                    0                       True
69      0.134008             0.134009                 0.134014    False                 False                True  execution_mode_sequential         execution_order_default          enable_extended                    0                    0                      False
70      0.133769             0.133761                 0.133774    False                 False                True  execution_mode_sequential         execution_order_default               enable_all                    0                    0                       True
71      0.134055             0.134117                 0.134060    False                 False                True  execution_mode_sequential         execution_order_default               enable_all                    0                    0                      False
72      0.136274             0.136279                 0.136280    False                 False                True  execution_mode_sequential  execution_order_priority_based              disable_all                    0                    0                       True
73      0.136863             0.136869                 0.136870    False                 False                True  execution_mode_sequential  execution_order_priority_based              disable_all                    0                    0                      False
74      0.137498             0.137499                 0.137505    False                 False                True  execution_mode_sequential  execution_order_priority_based             enable_basic                    0                    0                       True
75      0.138503             0.138504                 0.138509    False                 False                True  execution_mode_sequential  execution_order_priority_based             enable_basic                    0                    0                      False
76      0.135320             0.135320                 0.135326    False                 False                True  execution_mode_sequential  execution_order_priority_based          enable_extended                    0                    0                       True
77      0.134357             0.134363                 0.134363    False                 False                True  execution_mode_sequential  execution_order_priority_based          enable_extended                    0                    0                      False
78      0.134988             0.134989                 0.134994    False                 False                True  execution_mode_sequential  execution_order_priority_based               enable_all                    0                    0                       True
79      0.135722             0.135722                 0.135728    False                 False                True  execution_mode_sequential  execution_order_priority_based               enable_all                    0                    0                      False
80      0.137856             0.137862                 0.137863    False                 False                True    execution_mode_parallel         execution_order_default              disable_all                    0                    0                       True
81      0.136792             0.136863                 0.136799    False                 False                True    execution_mode_parallel         execution_order_default              disable_all                    0                    0                      False
82      0.135922             0.135918                 0.135928    False                 False                True    execution_mode_parallel         execution_order_default             enable_basic                    0                    0                       True
83      0.136364             0.136365                 0.136371    False                 False                True    execution_mode_parallel         execution_order_default             enable_basic                    0                    0                      False
84      0.134013             0.134013                 0.134019    False                 False                True    execution_mode_parallel         execution_order_default          enable_extended                    0                    0                       True
85      0.134005             0.134010                 0.134011    False                 False                True    execution_mode_parallel         execution_order_default          enable_extended                    0                    0                      False
86      0.133684             0.133675                 0.133690    False                 False                True    execution_mode_parallel         execution_order_default               enable_all                    0                    0                       True
87      0.133535             0.133539                 0.133540    False                 False                True    execution_mode_parallel         execution_order_default               enable_all                    0                    0                      False
88      0.136594             0.136593                 0.136599    False                 False                True    execution_mode_parallel  execution_order_priority_based              disable_all                    0                    0                       True
89      0.137093             0.137094                 0.137099    False                 False                True    execution_mode_parallel  execution_order_priority_based              disable_all                    0                    0                      False
90      0.137489             0.137488                 0.137494    False                 False                True    execution_mode_parallel  execution_order_priority_based             enable_basic                    0                    0                       True
91      0.137646             0.137641                 0.137652    False                 False                True    execution_mode_parallel  execution_order_priority_based             enable_basic                    0                    0                      False
92      0.135146             0.135153                 0.135153    False                 False                True    execution_mode_parallel  execution_order_priority_based          enable_extended                    0                    0                       True
93      0.134904             0.134896                 0.134910    False                 False                True    execution_mode_parallel  execution_order_priority_based          enable_extended                    0                    0                      False
94      0.135050             0.135055                 0.135055    False                 False                True    execution_mode_parallel  execution_order_priority_based               enable_all                    0                    0                       True
95      0.135300             0.135305                 0.135306    False                 False                True    execution_mode_parallel  execution_order_priority_based               enable_all                    0                    0                      False
96      0.137544             0.137546                 0.137550    False                 False               False  execution_mode_sequential         execution_order_default              disable_all                    0                    0                       True
97      0.137561             0.137565                 0.137566    False                 False               False  execution_mode_sequential         execution_order_default              disable_all                    0                    0                      False
98      0.137370             0.137358                 0.137376    False                 False               False  execution_mode_sequential         execution_order_default             enable_basic                    0                    0                       True
99      0.137377             0.137382                 0.137383    False                 False               False  execution_mode_sequential         execution_order_default             enable_basic                    0                    0                      False
100     0.134677             0.134676                 0.134683    False                 False               False  execution_mode_sequential         execution_order_default          enable_extended                    0                    0                       True
101     0.135157             0.135156                 0.135162    False                 False               False  execution_mode_sequential         execution_order_default          enable_extended                    0                    0                      False
102     0.135108             0.135087                 0.135115    False                 False               False  execution_mode_sequential         execution_order_default               enable_all                    0                    0                       True
103     0.135172             0.135167                 0.135178    False                 False               False  execution_mode_sequential         execution_order_default               enable_all                    0                    0                      False
104     0.136793             0.136798                 0.136799    False                 False               False  execution_mode_sequential  execution_order_priority_based              disable_all                    0                    0                       True
105     0.136224             0.136220                 0.136229    False                 False               False  execution_mode_sequential  execution_order_priority_based              disable_all                    0                    0                      False
106     0.136427             0.136427                 0.136433    False                 False               False  execution_mode_sequential  execution_order_priority_based             enable_basic                    0                    0                       True
107     0.136721             0.136719                 0.136727    False                 False               False  execution_mode_sequential  execution_order_priority_based             enable_basic                    0                    0                      False
108     0.133576             0.133428                 0.133582    False                 False               False  execution_mode_sequential  execution_order_priority_based          enable_extended                    0                    0                       True
109     0.134254             0.134259                 0.134260    False                 False               False  execution_mode_sequential  execution_order_priority_based          enable_extended                    0                    0                      False
110     0.133296             0.133288                 0.133302    False                 False               False  execution_mode_sequential  execution_order_priority_based               enable_all                    0                    0                       True
111     0.133844             0.133925                 0.133850    False                 False               False  execution_mode_sequential  execution_order_priority_based               enable_all                    0                    0                      False
112     0.136552             0.136557                 0.136558    False                 False               False    execution_mode_parallel         execution_order_default              disable_all                    0                    0                       True
113     0.136553             0.136554                 0.136560    False                 False               False    execution_mode_parallel         execution_order_default              disable_all                    0                    0                      False
114     0.136452             0.136459                 0.136459    False                 False               False    execution_mode_parallel         execution_order_default             enable_basic                    0                    0                       True
115     0.136739             0.136745                 0.136746    False                 False               False    execution_mode_parallel         execution_order_default             enable_basic                    0                    0                      False
116     0.134016             0.134015                 0.134021    False                 False               False    execution_mode_parallel         execution_order_default          enable_extended                    0                    0                       True
117     0.133918             0.133923                 0.133923    False                 False               False    execution_mode_parallel         execution_order_default          enable_extended                    0                    0                      False
118     0.133750             0.133750                 0.133756    False                 False               False    execution_mode_parallel         execution_order_default               enable_all                    0                    0                       True
119     0.134081             0.134087                 0.134088    False                 False               False    execution_mode_parallel         execution_order_default               enable_all                    0                    0                      False
120     0.137129             0.137134                 0.137134    False                 False               False    execution_mode_parallel  execution_order_priority_based              disable_all                    0                    0                       True
121     0.136976             0.136981                 0.136981    False                 False               False    execution_mode_parallel  execution_order_priority_based              disable_all                    0                    0                      False
122     0.137045             0.137120                 0.137051    False                 False               False    execution_mode_parallel  execution_order_priority_based             enable_basic                    0                    0                       True
123     0.137390             0.137386                 0.137396    False                 False               False    execution_mode_parallel  execution_order_priority_based             enable_basic                    0                    0                      False
124     0.134379             0.134369                 0.134385    False                 False               False    execution_mode_parallel  execution_order_priority_based          enable_extended                    0                    0                       True
125     0.133712             0.133717                 0.133718    False                 False               False    execution_mode_parallel  execution_order_priority_based          enable_extended                    0                    0                      False
126     0.134063             0.134073                 0.134070    False                 False               False    execution_mode_parallel  execution_order_priority_based               enable_all                    0                    0                       True
127     0.133988             0.133978                 0.133994    False                 False               False    execution_mode_parallel  execution_order_priority_based               enable_all                    0                    0                      False"""

In [3]:
from io import StringIO
df = pd.read_csv(StringIO(s), sep="\s+")

### Results

- id 13: best `median_process_time`: 0.131628
- id 13: best `median_performance_time`: 0.131628
- id 13: best `median_time`: 0.131622

Compared to DEFAULT (id 7):
- `median_process_time`: 0.131639 (slower than best (~0.008%))
- `median_performance_time`: 0.131659 (slower than best (~0.023%))
- `median_time`: 0.131654 (slower than best (~0.024%))

Compared to SnapADDY DEFAULT (id 23):
- `median_process_time`: 0.132039 (slower than best (~0.31%))
- `median_performance_time`: 0.132043 (slower than best (~0.31%))
- `median_time`: 0.132037 (slower than best (~0.31%))

In [67]:
best = 0.131622
default = 0.132037
best = best*1000
default = default*1000
milliseconds = default - best
percent = 1 - (best / default)
print('%f'%milliseconds, '%f'%(percent*100))

0.415000 0.314306


In [44]:
df[df.default == True]

Unnamed: 0,median_time,median_process_time,median_performance_time,default,enable_cpu_mem_arena,enable_mem_pattern,execution_mode,execution_order,graph_optimization_level,inter_op_num_threads,intra_op_num_threads,use_deterministic_compute
7,0.131654,0.131639,0.131659,True,True,True,execution_mode_sequential,execution_order_default,enable_all,0,0,False


In [59]:
df[(df.execution_mode == "execution_mode_parallel") &
   (df.enable_cpu_mem_arena == True) &
   (df.enable_mem_pattern == True) &
   (df.execution_order == "execution_order_default") &
   (df.graph_optimization_level == "enable_all") &
   (df.use_deterministic_compute == False)].head(5)

Unnamed: 0,median_time,median_process_time,median_performance_time,default,enable_cpu_mem_arena,enable_mem_pattern,execution_mode,execution_order,graph_optimization_level,inter_op_num_threads,intra_op_num_threads,use_deterministic_compute
23,0.132037,0.132039,0.132043,False,True,True,execution_mode_parallel,execution_order_default,enable_all,0,0,False


In [42]:
df.sort_values(by="median_process_time").tail(3)

Unnamed: 0,median_time,median_process_time,median_performance_time,default,enable_cpu_mem_arena,enable_mem_pattern,execution_mode,execution_order,graph_optimization_level,inter_op_num_threads,intra_op_num_threads,use_deterministic_compute
91,0.137646,0.137641,0.137652,False,False,True,execution_mode_parallel,execution_order_priority_based,enable_basic,0,0,False
80,0.137856,0.137862,0.137863,False,False,True,execution_mode_parallel,execution_order_default,disable_all,0,0,True
75,0.138503,0.138504,0.138509,False,False,True,execution_mode_sequential,execution_order_priority_based,enable_basic,0,0,False


In [43]:
df.sort_values(by="median_process_time").head(20)

Unnamed: 0,median_time,median_process_time,median_performance_time,default,enable_cpu_mem_arena,enable_mem_pattern,execution_mode,execution_order,graph_optimization_level,inter_op_num_threads,intra_op_num_threads,use_deterministic_compute
13,0.131622,0.131628,0.131628,False,True,True,execution_mode_sequential,execution_order_priority_based,enable_extended,0,0,False
7,0.131654,0.131639,0.131659,True,True,True,execution_mode_sequential,execution_order_default,enable_all,0,0,False
37,0.131685,0.131691,0.131692,False,True,False,execution_mode_sequential,execution_order_default,enable_extended,0,0,False
12,0.131724,0.131724,0.13173,False,True,True,execution_mode_sequential,execution_order_priority_based,enable_extended,0,0,True
39,0.131781,0.131781,0.131787,False,True,False,execution_mode_sequential,execution_order_default,enable_all,0,0,False
52,0.13179,0.131782,0.131796,False,True,False,execution_mode_parallel,execution_order_default,enable_extended,0,0,True
44,0.131805,0.13181,0.131811,False,True,False,execution_mode_sequential,execution_order_priority_based,enable_extended,0,0,True
22,0.131812,0.131817,0.131818,False,True,True,execution_mode_parallel,execution_order_default,enable_all,0,0,True
47,0.131829,0.131824,0.131834,False,True,False,execution_mode_sequential,execution_order_priority_based,enable_all,0,0,False
14,0.13184,0.131834,0.131847,False,True,True,execution_mode_sequential,execution_order_priority_based,enable_all,0,0,True


In [20]:
df.sort_values(by="median_performance_time").head(3)

Unnamed: 0,median_time,median_process_time,median_performance_time,default,enable_cpu_mem_arena,enable_mem_pattern,execution_mode,execution_order,graph_optimization_level,inter_op_num_threads,intra_op_num_threads,use_deterministic_compute
13,0.131622,0.131628,0.131628,False,True,True,execution_mode_sequential,execution_order_priority_based,enable_extended,0,0,False
7,0.131654,0.131639,0.131659,True,True,True,execution_mode_sequential,execution_order_default,enable_all,0,0,False
37,0.131685,0.131691,0.131692,False,True,False,execution_mode_sequential,execution_order_default,enable_extended,0,0,False


In [12]:
df.sort_values(by="median_time").head(3)

Unnamed: 0,median_time,median_process_time,median_performance_time,default,enable_cpu_mem_arena,enable_mem_pattern,execution_mode,execution_order,graph_optimization_level,inter_op_num_threads,intra_op_num_threads,use_deterministic_compute
13,0.131622,0.131628,0.131628,False,True,True,execution_mode_sequential,execution_order_priority_based,enable_extended,0,0,False
7,0.131654,0.131639,0.131659,True,True,True,execution_mode_sequential,execution_order_default,enable_all,0,0,False
37,0.131685,0.131691,0.131692,False,True,False,execution_mode_sequential,execution_order_default,enable_extended,0,0,False


In [95]:
indices = []
about_html = ["ueber-uns.htm", 
              "about.htm", 
              "team.htm", 
              "us.htm", 
              'title="abo"', 
              'title="tea"', 
              'title="uebe"']
about_html = ["us.htm"]
for idx, row in train.iterrows():
    for element in about_html:
        if element in row["html"].lower():
            indices.append(idx)
            
indices = list(set(indices))

In [96]:
len(indices)

110

In [97]:
train.iloc[86].html.find("us.htm")

65802

In [77]:
train.iloc[86].html[65000:67000]

'.techfetch.com/category/webinars/">Webinars</a>\r\n                             <a class="link" title="Job Fairs" target="_blank" href="/tech-job-fair/default.htm">Job Fairs</a>\r\n                              <a class="link" title="Events" target="_blank" href="/events/event.htm">Events</a>\r\n                            <a class="link" title="Skill Score" target="skillScore" href="/skillscore/">Skill Score</a>\r\n                           \r\n                            \r\n                        </div>\r\n                    </div>\r\n                    <div class="col-lg-6 col-md-6 col-sm-5 col-xs-12 hidden-xs">\r\n                        <div id="miniheader-top-second" class="clearfix">\r\n                            <div class="header-top-dropdown">\r\n                                <a href="/ads/aboutus.htm" class="link" target="_blank" title="About Us">About Us</a>\r\n                                <a class="pop link" title="Contact Us" id="contactpopup" onclick="javascr

In [60]:
for idx in indices:
    about_idx = train.iloc[idx].html.find("ueber-uns.html")
    print(train.iloc[idx].html[about_idx-50:about_idx + 50])
    #print(train.iloc[idx].html)

ub-lvl3" style=""> <a class="" href="wolfsburg-ag/ueber-uns.html" target="" title="Über uns"> <span>


In [87]:
%%time
train = pd.read_csv("../data/ctrain.csv", nrows=10000)
train = train.fillna("")
train.head(1)

CPU times: user 3.91 s, sys: 618 ms, total: 4.53 s
Wall time: 5.41 s


Unnamed: 0,url,industry,industry_label,group,group_representative,html,text,source,country,group_representative_label,meta,chtml
0,http://www.energy-net.de,8,Telecommunications,"gov, tech",8,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""de-DE"">...",Energy Net Apple Reseller\n\nSpringe zum Inhal...,xing,DE,Telecommunications,Apple Produkte\r\nApple Reseller\r\nApple im U...,<html>\n<head>\n\n\n\n\n\n\n<title>Energy Net ...


In [41]:
%%time

TEXT_COL = "text"
CLASS_COL = "group_representative"

train_text = train[TEXT_COL] + train["meta"]
train_labels = train[CLASS_COL].values

vectorizer = CountVectorizer(max_df=MAX_DOCUMENT_FREQUENCY,
                             lowercase=LOWERCASE,
                             max_features=MAX_FEATURES,
                             ngram_range=NGRAM_RANGE,
                             stop_words=STOP_WORDS,
                            tokenizer=tokenizing_html)
transformer = TfidfTransformer()

vector = vectorizer.fit_transform(train_text)
train_vector = transformer.fit_transform(vector)


test = pd.read_csv(TEST_PATH_CSV)
    
test_vector = vectorizer.transform(test[TEXT_COL].values)
test_vector = transformer.transform(test_vector)
test_labels = test[CLASS_COL].values


print("LSVM CLF", "\n-------------------------")
# training
clf = LinearSVC()
clf.fit(train_vector, train_labels)

# prediction
train_preds = clf.predict(test_vector)

# evaluation
precision = precision_score(test_labels, train_preds, average="macro", zero_division=0)
recall = recall_score(test_labels, train_preds, average="macro", zero_division=0)
f1 = f1_score(test_labels, train_preds, average="macro", zero_division=0)
clf2_f1 = np.round(f1, decimals=4)
clf2_precision = np.round(precision, decimals=4)

print(np.round(precision, decimals=4), "\tPrecision")
print(np.round(recall, decimals=4), "\tRecall")
print(np.round(f1, decimals=4), "\tF1")
print()

clf2_report = classification_report(test_labels, 
                                   train_preds, 
                                   target_names = np.unique(test[CLASS_NAMES]),
                                   zero_division = 0)

LSVM CLF 
-------------------------
0.6048 	Precision
0.358 	Recall
0.4111 	F1

CPU times: user 2.72 s, sys: 28.9 ms, total: 2.75 s
Wall time: 2.75 s


In [40]:
train.meta.iloc[1]

'Für Unternehmen, Agenturen und Entwickler » Online Marketing Beratung & Optimierung: SEO, Social-Media, Online-Werbung, Webentwicklung ➥ Alle Infos hier!'

In [39]:
train_text[1]

'STENLE ⇗ Online Marketing für Unternehmen und Agenturen\n\nHome\nWeb Analytics\n\nBran\xadchen\xadum\xadfeld – Analyse\nSEO Ran\xadking\xadana\xadlyse\nOffpage-SEO Analyse\nWett\xadbe\xadwerber Analyse\n\n\nOnline Marketing\nWeb-Entwicklung\nRund um SEO\nSTENLE GmbH\nKontakt\n\n\nOnline-Mar\xadke\xadting\nOptimierung\n\nOnline Mar\xadke\xadting Beratung\n\nOnline\xadmarkt-For\xadschung\n\nWeb-Ent\xadwick\xadlungen\n\nHilfe bei\n\nKun\xadden\xadzu\xadfie\xadden\xadheit\n\nWarum mit STENLE?\n\nHilfe bei\n\nKun\xadden\xadzu\xadfie\xadden\xadheit\n\nWarum mit STENLE?\n\nKun\xadden\xadser\xadvice\n\n★★★★★ 4,82 von 5\n\nSorgfalt\n\n★★★★★ 5,00 von 5\n\nZusatz\xadleis\xadtungen\n\n★★★★★ 5,00 von 5\n\nFle\xadxi\xadbi\xadlität\n\n★★★★★ 5,00 von 5\n\nErrei\xadchen Sie Ihre Ziele mit  STENLE\n\nSebas\xadtian Dietz\n\nImmer wieder sprechen Inter\xades\xadsenten in unseren Erst\xadge\xadsprä\xadchen von einem Gefühl der Ori\xaden\xadtie\xadrungs\xadlo\xadsig\xadkeit bei der Auswahl des rich\xadtige

- plain: 0.3994 F1
- html: 0.2784 	F1
- plain + meta: 0.4111 	F1
- plain + 2 * meta: 0.4063 	F1
- html + meta: 0.2784 	F1

## IDEEN


### HTML Klassifizierung


- Text zusammenfassen und dann klassifizieren? Dafür auch HTML-Tags verwenden?
- ~~HTML Struktur verwenden, um vorher **Boilerplate Content** von Main Content zu entfernen:~~
    - ~~Plain Text ist sehr noisy (viel unnötiges drin)~~
    - ICH: gemacht mit CLEAN HTML, aber ohne explizites Boilerplate Content Removal
- Bestimmten Wörtern/Tags höhere Gewichtungen geben
    - Anchor Text (= klickbarer Text in einem Hyperlink)
        - alleine zu wenig Inhalt (QI, S. 12)
        - umliegende Wörter interessant! (QI, S. 12)
        - auch für Nachbar-Seiten-Ansatz
    - Title, Headers (QI, S. 12)
        - auch für Nachbar-Seiten-Ansatz
    - Keywords für Branchen
        ```python3  
        Class_1_keywords = ['Office', 'School', 'phone', 'Technology', 'Electronics', 'Cell', 'Business', 'Education', 'Classroom']
        
        Class_2_keywords = ['Restaurant', 'Hospitality', 'Tub', 'Drain', 'Pool', 'Filtration', 'Floor', 'Restroom', 'Consumer', 'Care', 'Bags', 'Disposables']
        
        Class_3_keywords = ['Pull', 'Lifts', 'Pneumatic', 'Emergency', 'Finishing', 'Hydraulic', 'Lockout', 'Towers', 'Drywall', 'Tools', 'Packaging', 'Measure', 'Tag ']
        ```
- NER mit **Tags** als weitere Tokens
- Features von "Nachbarseiten" verwenden
    - Hilfreich, da mehr Infos als "nur" Startseite
    - Fragen: 
        - Was sind Nachbarseiten, wie definieren?
            - Webgraph Webseiten?
            - Weitere Seiten des Unternehmens?
        - Wie viele Nachbarseiten?
        - Wieviel von den Nachbarseiten verwenden?
            - Ganze Seite?
            - text, title, heading, Metadaten?
    
- *Weiteres*:
    - Flat classification oder Hierarchical classification?
        - Flat: parallele Klassen
        - Hierarchical: hierarchische Klassen, bauen aufeinander auf
    - Nur nach bestimmten Keywords filtern? (das geht jedoch mehr Richtung PLAIN-Textclassification)
    - "implicit links": Seiten, die beide bei Suche von **Suchmaschine** erschienen sind und auf die beide der User geklickt hat (QI, S. 12) &rarr; nicht wirklich realisierbar




## Tests

- Evaluation metric: **F1 Scores**
- TF-IDF Vectorizer
    - kein lowercase
    - stop words werden entfernt
    - keine max features
- Top $n$ classes = most frequent classes
- CLEAN HTML auch für Test Set (ansonsten unglaublich schlechte Accuracy und etwas sinnlos)


#### Label: `group_representatives`

| Experiment | SGD F1 (Precision) | LSVM F1 (Precision) |
| ---------- |:-----:| ----:|
| HTML (10000 features) | **0.5292** (0.5962) | **0.5493** (0.6371) |
| HTML (kept stop words) (10000 features) | **0.5268** (0.5845) | **0.5473** (0.6439) |
| HTML (10000 features) ((1, 3) ngrams) | **0.4035** (0.463) | **0.4188** (0.5345) |
| HTML (10000 features) ((2, 2) ngrams) | **0.2442** (0.2787) | **0.252** (0.3146) |
| ---------- |-----| ----|
| *ALL LANGS* HTML (kept stop words) (10000 features) | **0.5781** (0.6464) | **0.6406** (0.7024) |
| ---------- |-----| ----|
| Plain Text (kept stop words) (10000 features) (10000 rows) | **0.5841** (0.6301) | **0.5778** (0.6257) |
| Plain Text + Meta (kept stop words) (10000 features) (10000 rows) | **0.5832** (0.6197) | **0.5826** (0.6279) |

## Paths

In [2]:
DATA_DIR_PATH = "../data/"
LANG = ""
ROWS = "_10000"

INDUSTRIES_PATH_CSV = DATA_DIR_PATH + "industries.csv"
TRAIN_PATH_CSV = DATA_DIR_PATH + "train" + LANG + ROWS + ".csv"
TEST_PATH_CSV = DATA_DIR_PATH + "test" + LANG + ROWS +".csv"

## Load train csv

In [3]:
%%time
train = pd.read_csv(TRAIN_PATH_CSV)
train = train.fillna("")

CPU times: user 1.55 s, sys: 421 ms, total: 1.97 s
Wall time: 1.97 s


In [4]:
train.head(1)

Unnamed: 0,url,industry,industry_label,group,group_representative,html,text,source,country,group_representative_label,meta
0,http://www.autarctech.de,144,Renewables & Environment,"gov, man, org",144,<html> <head> ...,Home\n\nMenü\n\n\nShop\nHome\nProdukte\nOur St...,xing,DE,Renewables & Environment,Effizenz bei der Stromspeicherung in Batterien...


In [5]:
train.shape

(8000, 11)

## Hyperparameters

In [6]:
# "text" or "html"
TEXT_COL = "text"

# "group_representative", "group_representative_label", "industry", "industry_label" or "group"
CLASS_COL = "group_representative"
CLASS_NAMES = "group_representative_label"

MAX_DOCUMENT_FREQUENCY = 1.
MAX_FEATURES = 10000
NGRAM_RANGE = (1,1)
LOWERCASE = False
#STOP_WORDS = get_stop_words("de")
STOP_WORDS = None

TAG_LIST = ['a', 'b', 'em', 'h1', 'h2', 'h3', 'i', 'li', 'p', 'strong', 'title']

## Add Meta-Tag information to plain text

In [7]:
train["text"] = train["text"] + train["meta"]

## Trim HTML

In [8]:
train2 = train.head(10)

In [10]:
%%time
#train["html"] = train["html"].apply(lambda x: trim_html(x, tag_list = TAG_LIST, tagless_output_string=True))

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 4.53 µs


### Vectorizing text

In [54]:
%%time

train_text = train[TEXT_COL]
train_labels = train[CLASS_COL].values

vectorizer = CountVectorizer(max_df=MAX_DOCUMENT_FREQUENCY,
                             lowercase=LOWERCASE,
                             max_features=MAX_FEATURES,
                             ngram_range=NGRAM_RANGE,
                             stop_words=STOP_WORDS,
                            tokenizer=tokenizing_html)
transformer = TfidfTransformer()

vector = vectorizer.fit_transform(train_text)
train_vector = transformer.fit_transform(vector)

CPU times: user 8.46 s, sys: 63.5 ms, total: 8.52 s
Wall time: 8.51 s


# Test Dataset

In [55]:
%%time
test = pd.read_csv(TEST_PATH_CSV)
    
test_vector = vectorizer.transform(test[TEXT_COL].values)
test_vector = transformer.transform(test_vector)
test_labels = test[CLASS_COL].values

CPU times: user 2.18 s, sys: 28.6 ms, total: 2.21 s
Wall time: 2.21 s


# SGD

In [56]:
%%time
print("SGD CLF", "\n-------------------------")
# training
clf = SGDClassifier()
clf.fit(train_vector, train_labels)

# prediction
train_preds = clf.predict(test_vector)

# evaluation
precision = precision_score(test_labels, train_preds, average="macro", zero_division=0)
recall = recall_score(test_labels, train_preds, average="macro", zero_division=0)
f1 = f1_score(test_labels, train_preds, average="macro", zero_division=0)
clf1_f1 = np.round(f1, decimals=4)
clf1_precision = np.round(precision, decimals=4)

print(np.round(precision, decimals=4), "\tPrecision")
print(np.round(recall, decimals=4), "\tRecall")
print(np.round(f1, decimals=4), "\tF1")
print()

clf1_report = classification_report(test_labels, 
                                   train_preds, 
                                   target_names = np.unique(test[CLASS_NAMES]), 
                                   zero_division = 0)

SGD CLF 
-------------------------
0.6301 	Precision
0.5634 	Recall
0.5841 	F1

CPU times: user 1.09 s, sys: 0 ns, total: 1.09 s
Wall time: 1.08 s


# LSVM

In [57]:
%%time
print("LSVM CLF", "\n-------------------------")
# training
clf = LinearSVC()
clf.fit(train_vector, train_labels)

# prediction
train_preds = clf.predict(test_vector)

# evaluation
precision = precision_score(test_labels, train_preds, average="macro", zero_division=0)
recall = recall_score(test_labels, train_preds, average="macro", zero_division=0)
f1 = f1_score(test_labels, train_preds, average="macro", zero_division=0)
clf2_f1 = np.round(f1, decimals=4)
clf2_precision = np.round(precision, decimals=4)

print(np.round(precision, decimals=4), "\tPrecision")
print(np.round(recall, decimals=4), "\tRecall")
print(np.round(f1, decimals=4), "\tF1")
print()

clf2_report = classification_report(test_labels, 
                                   train_preds, 
                                   target_names = np.unique(test[CLASS_NAMES]),
                                   zero_division = 0)

LSVM CLF 
-------------------------
0.6257 	Precision
0.5523 	Recall
0.5778 	F1

CPU times: user 2.44 s, sys: 8.03 ms, total: 2.45 s
Wall time: 2.44 s


## Summary: Classification Results

In [58]:
result = "| "

if TEXT_COL == "text":
    result += "Plain Text"
else:
    result += "HTML"
    
if STOP_WORDS is None:
    result += " (kept stop words)"
    
if MAX_FEATURES is None:
    result += " (all features)"
else:
    result += f" ({MAX_FEATURES} features)"
    
if NGRAM_RANGE != (1,1):
    result += f" ({NGRAM_RANGE} ngrams)"
    
if ROWS:
    result += f" ({ROWS[1:]} rows)"
    
            
result += f" | **{clf1_f1}** ({clf1_precision}) | **{clf2_f1}** ({clf2_precision}) |"
print(CLASS_COL)
print()
print(result)

group_representative

| Plain Text (kept stop words) (10000 features) (10000 rows) | **0.5841** (0.6301) | **0.5778** (0.6257) |


# Confusion Matrix

TODO: label und text names und so; allg. änderungen von oben hier ergänzen

In [None]:
NORMALIZE_CM = True
INDUSTRY_TRESHOLD = 250
PLT_SCALING_FACTOR = 0.8

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

filtered_train = train.groupby(CLASS_COL).filter(lambda x: len(x)>INDUSTRY_TRESHOLD)
remaining_industries = filtered_train[CLASS_NAMES].drop_duplicates().tolist()


cnf_matrix = confusion_matrix(test_labels, train_preds)

classes = train[CLASS_COL].drop_duplicates().tolist()

cnf_df = pd.DataFrame(cnf_matrix, index=classes, columns=classes)
cnf_df = cnf_df[remaining_industries]
cnf_df = cnf_df.loc[remaining_industries]

In [None]:
plt.figure(figsize=(10*PLT_SCALING_FACTOR, 8*PLT_SCALING_FACTOR))

if NORMALIZE_CM:
    normalized_cnf_df = cnf_df.astype('float') / cnf_df.sum(axis=1)[:, np.newaxis]
    sns.heatmap(normalized_cnf_df, annot=True, cmap=sns.color_palette("Blues"), fmt='.2f')
else:
    sns.heatmap(cnf_df, annot=True, cmap=sns.color_palette("Blues"), fmt='g')
plt.tight_layout()