In [87]:
import pandas as pd

import spacy
import collections

from bs4 import BeautifulSoup as BS

# German spaCy pipeline (small model); used further down for POS tagging.
nlp = spacy.load('de_core_news_sm')

# Raise pandas' display limits so wide/long frames are rendered in full.
for display_option, display_limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(display_option, display_limit)

In [88]:
# Load every input table from the local parquet dumps, grouped by topic.
# The read order is the same as the order of the names on the left-hand side.

# Tag tables
df_tag_category, df_tag = map(
    pd.read_parquet,
    ('data/tag_category.parquet', 'data/tag.parquet'),
)

# Event tables
df_event, df_event_tags, df_event_material_list = map(
    pd.read_parquet,
    ('data/event.parquet', 'data/event_tags.parquet', 'data/event_material_list.parquet'),
)

# Experiment tables
df_experiment, df_experiment_item = map(
    pd.read_parquet,
    ('data/experiment.parquet', 'data/experiment_item.parquet'),
)

# Request log
df_request_log = pd.read_parquet('data/request_log.parquet')

In [89]:
# First experiment: keep only the items created strictly after 2021-03-10
# and strictly before 2021-03-27 (both bounds are exclusive).
item_created_at = df_experiment_item['created_at']
in_first_experiment = (item_created_at > '2021-03-10') & (item_created_at < '2021-03-27')
df_experiment_item = df_experiment_item[in_first_experiment]

In [90]:
def num_apperances_of_tag(html, tag):
    """Count how many `<tag>` elements occur in an HTML string.

    Args:
        html: HTML markup to parse. ``None`` is treated as an empty document.
        tag: Element name to look for, e.g. ``'p'`` or ``'img'``.

    Returns:
        Number of matching elements found anywhere in the document.
    """
    if html is None:
        # A missing description has no elements; avoid a TypeError inside bs4.
        return 0
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so counts could differ between environments.
    soup = BS(html, 'html.parser')
    return len(soup.find_all(tag))

def get_text(html):
    """Extract the text of an HTML string as one space-joined string.

    Only text nodes whose direct parent element is ``<p>``, ``<li>`` or
    ``<span>`` are kept; all other text is dropped.

    Args:
        html: HTML markup to parse. ``None`` is treated as an empty document.

    Returns:
        The kept text nodes joined with single spaces (``''`` if none match).
    """
    if html is None:
        # A missing description has no text; avoid a TypeError inside bs4.
        return ''
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so results could differ between environments.
    soup = BS(html, 'html.parser')
    whitelist = ('p', 'li', 'span')
    # `string=True` selects all text nodes; it replaces the older `text=True`
    # spelling of the same filter.
    text_elements = [
        node for node in soup.find_all(string=True)
        if node.parent.name in whitelist
    ]
    return ' '.join(text_elements)
  
def count_char(string, char):
    """Return the number of non-overlapping occurrences of `char` in `string`."""
    occurrences = string.count(char)
    return occurrences

In [91]:
# html features
# Work on a copy: plain assignment would only alias `df_event`, so every
# column added below would also appear on the raw frame loaded from parquet.
df_features = df_event.copy()

# One count column per HTML tag found in the description, named '<tag>_tag'.
for html_tag in ('p', 'img', 'b', 'li', 'span', 'a', 'u'):
    df_features[html_tag + '_tag'] = df_features['description'].apply(
        num_apperances_of_tag, args=(html_tag,)
    )

# Rating columns divided by the constant 5.
# NOTE(review): 'isPrepairationNeeded' is divided by 5 like the two ratings;
# its name suggests a flag rather than a 0-5 rating — confirm this is intended.
df_features['costsRating_quote'] = df_features['costsRating'] / 5
df_features['executionTimeRating_quote'] = df_features['executionTimeRating'] / 5
df_features['isPrepairationNeeded_quote'] = df_features['isPrepairationNeeded'] / 5

# header image: True when an image link is present for the event
df_features['header_img'] = pd.notna(df_features['imageLink'])

# text features
# Each raw count/length is divided by a fixed constant (20, 40, 2000).
# NOTE(review): presumably chosen to bring the values to roughly [0, 1] — confirm.
df_features['text'] = df_features['description'].apply(get_text)
df_features['count_question_mark'] = df_features['text'].apply(count_char, args=('?',)) / 20
df_features['title_len'] = df_features['title'].str.len() / 40
df_features['text_len'] = df_features['text'].str.len() / 2000

In [92]:
# NLP

In [93]:
# Lookup from part-of-speech tag name (key) to a human-readable label (value).
# Keys are upper-case tag names such as 'NOUN' or 'VERB'; insertion order is
# kept exactly as listed here.
dicts_syntc = dict(
    ADJ="Adjektiv",
    ADP="Adposition",
    ADV="Adverb",
    AUX="Hilfsverb",
    CONJ="Koordinierende Konjunktionen",
    DET="Artikel",
    INTJ="Ausruf",
    NOUN="Nomen",
    NUM="Numerisch",
    PART="particle",
    PRON="Pronomen",
    PROPN="Eigenname",
    PUNCT="Satzzeichen",
    SCONJ="unterordnende Konjunktion",
    SYM="Symbol",
    VERB="Verb",
    CCONJ="Konjunktion",
)

In [94]:
# NOTE: `df_temp` is an alias of `df_features`, not a copy. Columns written
# through either name land on the same frame; the later merge/drop cells rely
# on that (they drop 'species_pos' and the 'total_*' columns from the merged
# `df_features`), so the alias is kept deliberately.
df_temp = df_features

tokens = []
pos = []

# Run the spaCy pipeline over every event text and collect one list of
# part-of-speech tag names per document, in the same order as the rows.
for doc in nlp.pipe(df_features['text'].astype(str).values, batch_size=50):
    # NOTE(review): spaCy v3 replaces `Doc.is_parsed` with
    # `Doc.has_annotation("DEP")` — confirm against the installed version.
    if doc.is_parsed:
        pos.append([token.pos_ for token in doc])
    else:
        # Keep one entry per row of the DataFrame even when parsing failed.
        # An empty list (rather than None) keeps the downstream `len(...)`
        # and iteration over the tags from raising a TypeError.
        pos.append([])

df_temp['species_pos'] = pos

In [95]:
# Per-event POS statistics derived from the 'species_pos' tag lists.
# The cell that fills 'species_pos' may store None for a document that was not
# parsed; treat any non-list entry as "no tokens" instead of crashing on it.
pos_tag_lists = df_temp['species_pos'].apply(
    lambda tags: tags if isinstance(tags, list) else []
)

# Total number of tokens (every tag counts, punctuation included).
df_temp['total_words'] = pos_tag_lists.apply(len)

# Absolute count per POS tag of interest. `list.count` compares whole tag
# names, which is the intended match (the earlier substring test gave the same
# result for these five tag names).
for total_column, pos_name in (
    ('total_nomen', 'NOUN'),
    ('total_adjektive', 'ADJ'),
    ('total_numerisch', 'NUM'),
    ('total_satzzeichen', 'PUNCT'),
    ('total_konjunktion', 'CCONJ'),
):
    df_temp[total_column] = pos_tag_lists.apply(
        lambda tags, pos_name=pos_name: tags.count(pos_name)
    )

# Share of each POS tag among all tokens of the event.
# Events with zero tokens yield NaN here (0 / 0).
for quote_column, total_column in (
    ('nomen_quote', 'total_nomen'),
    ('adjektive_quote', 'total_adjektive'),
    ('numerisch_quote', 'total_numerisch'),
    ('satzzeichen_quote', 'total_satzzeichen'),
    ('konjunktion_quote', 'total_konjunktion'),
):
    df_features[quote_column] = df_temp[total_column] / df_temp['total_words']

In [96]:
# Tag Features

In [97]:
# Lookup: tag id -> tag name, used to label the pivoted columns.
dict_tag_name = df_tag.set_index('id')['name'].to_dict()

# One row per (event_id, tag_id) pair. Duplicated pairs are removed first
# because `pivot` raises a ValueError when an index/column pair occurs twice.
df_event_tags_short = df_event_tags[['event_id', 'tag_id']].drop_duplicates()

# Wide boolean matrix: rows = events, columns = tag ids, True where the event
# carries the tag. `notna()` marks presence directly, so a tag id of 0 is not
# mistaken for "absent" the way a cast of the id value to bool would.
df_event_tags_pivot = df_event_tags_short.pivot(
    index="event_id", columns="tag_id", values="tag_id"
).notna()

# Replace tag ids by tag names and turn 'event_id' back into a regular column.
df_event_tags_features = df_event_tags_pivot.rename(columns=dict_tag_name).reset_index()

# All columns except the leading 'event_id' are tag features.
list_tag_features = df_event_tags_features.columns.values.tolist()[1:]

tag_id,event_id,Schnitzen,Backen,Unsere Erde,Pfa. Geschichte,Unser Bund,1. Hilfe,Feuer machen,Versprechen,Karte Kompass,...,Pfadfinder,Rover,Lernen,Mit Abstand,Speziell zu Ostern,Speziell im Advent,Speziell zu Karneval,Wald,Ausflug,Gesellschaftliches
0,1,False,True,False,False,False,False,True,False,False,...,True,True,True,True,False,False,False,True,False,False
1,2,False,False,False,True,True,False,False,False,False,...,True,True,True,True,False,False,False,False,False,False
2,3,False,False,False,False,False,True,False,False,False,...,True,True,True,False,False,False,False,True,False,False
3,4,False,False,False,False,False,False,False,False,False,...,True,True,True,True,False,False,False,False,False,False
4,5,True,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,209,False,False,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,False
146,210,False,False,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,False
147,211,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,True,False
148,212,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [98]:
# Attach the tag columns to the event features. The left join keeps every
# event; events without a matching 'event_id' get NaN in the tag columns.
# Raw inputs and the join key are dropped right after the merge.
df_features_joined = (
    df_features
    .merge(df_event_tags_features, how='left', left_on=['id'], right_on=['event_id'])
    .drop(columns=['description', 'imageLink', 'createdBy', 'species_pos', 'event_id'])
)

# Columns left out of the numeric vector: free text, timestamps and the
# unscaled counts/ratings from which the '*_quote' features were derived.
columns_excluded_from_numeric = [
    'title', 'created_at', 'text',
    'total_words', 'total_nomen', 'total_adjektive',
    'total_numerisch', 'total_satzzeichen', 'total_konjunktion',
    'p_tag', 'img_tag', 'b_tag', 'li_tag', 'span_tag', 'a_tag', 'u_tag',
    'costsRating', 'executionTimeRating', 'isPrepairationNeeded',
]
df_features_joined_num = df_features_joined.drop(columns=columns_excluded_from_numeric)

# Cast everything that remains to float.
df_features_num = df_features_joined_num.astype(float)

In [99]:
# Persist both feature tables as parquet; `coerce_timestamps="us"` is passed
# through unchanged for each write.
for output_frame, output_path in (
    (df_features_joined, 'data/feature_vector.parquet'),
    (df_features_num, 'data/feature_vector_num.parquet'),
):
    output_frame.to_parquet(output_path, coerce_timestamps="us")