In [139]:
import pandas as pd

import spacy
import collections

from bs4 import BeautifulSoup as BS

nlp = spacy.load('de_core_news_sm')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [140]:
df_tag_category = pd.read_parquet('data/tag_category.parquet')
df_tag = pd.read_parquet('data/tag.parquet')

df_event = pd.read_parquet('data/event.parquet')
df_event_tags = pd.read_parquet('data/event_tags.parquet')
df_event_material_list = pd.read_parquet('data/event_material_list.parquet')

df_experiment = pd.read_parquet('data/experiment.parquet')
df_experiment_item = pd.read_parquet('data/experiment_item.parquet')

df_request_log = pd.read_parquet('data/request_log.parquet')

In [141]:
# first Experiment
df_experiment_item = df_experiment_item[df_experiment_item['created_at'] > '2021-03-10']
df_experiment_item = df_experiment_item[df_experiment_item['created_at'] < '2021-03-27']

In [142]:
def num_apperances_of_tag(html, tag):
    soup = BS(html)
    return len(soup.find_all(tag))

def get_text(html):
    soup = BS(html)
    whitelist = [
      'p',
      'li',
      'span'
    ]
    text_elements = [t for t in soup.find_all(text=True) if t.parent.name in whitelist]
    return ' '.join(text_elements)
  
def count_char(string, char):
    return string.count(char)

In [143]:
# html features
df_features = df_event

df_features['p_tag'] = df_features['description'].apply(num_apperances_of_tag, args=('p',))
df_features['img_tag'] = df_features['description'].apply(num_apperances_of_tag, args=('img',))
df_features['b_tag'] = df_features['description'].apply(num_apperances_of_tag, args=('b',))
df_features['li_tag'] = df_features['description'].apply(num_apperances_of_tag, args=('li',))
df_features['span_tag'] = df_features['description'].apply(num_apperances_of_tag, args=('span',))
df_features['a_tag'] = df_features['description'].apply(num_apperances_of_tag, args=('a',))
df_features['u_tag'] = df_features['description'].apply(num_apperances_of_tag, args=('u',))

#
df_features['costsRating_quote'] = df_features['costsRating'] / 5
df_features['executionTimeRating_quote'] = df_features['executionTimeRating'] / 5
df_features['isPrepairationNeeded_quote'] = df_features['isPrepairationNeeded'] / 5

# header image
df_features['header_img'] = pd.notna(df_features['imageLink'])

# text features
df_features['text'] = df_features['description'].apply(get_text)
df_features['count_question_mark'] = df_features['text'].apply(count_char, args=('?',)) / 20
df_features['title_len'] = df_features['title'].str.len() / 40
df_features['text_len'] = df_features['text'].str.len() / 2000

In [144]:
# NLP

In [145]:
dicts_syntc = {
    "ADJ": "Adjektiv",
    "ADP": "Adposition",
    "ADV": "Adverb",
    "AUX": "Hilfsverb",
    "CONJ": "Koordinierende Konjunktionen",
    "DET": "Artikel",
    "INTJ": "Ausruf",
    "NOUN": "Nomen",
    "NUM": "Numerisch",
    "PART": "particle",
    "PRON": "Pronomen",
    "PROPN": "Eigenname",
    "PUNCT": "Satzzeichen",
    "SCONJ": "unterordnende Konjunktion",
    "SYM": "Symbol",
    "VERB": "Verb",
    "CCONJ": "Konjunktion"
}

In [146]:
df_temp = df_features

tokens = []
pos = []

for doc in nlp.pipe(df_features['text'].astype('unicode').values, batch_size=50):
    if doc.is_parsed:
        pos.append([n.pos_ for n in doc])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        pos.append(None)

df_temp['species_pos'] = pos

In [147]:
df_temp['total_words'] = df_temp['species_pos'].apply(lambda x: len(x))
df_temp['total_nomen'] = df_temp['species_pos'].apply(lambda L: len([x for x in L if 'NOUN' in x]))
df_temp['total_adjektive'] = df_temp['species_pos'].apply(lambda L: len([x for x in L if 'ADJ' in x]))
df_temp['total_numerisch'] = df_temp['species_pos'].apply(lambda L: len([x for x in L if 'NUM' in x]))
df_temp['total_satzzeichen'] = df_temp['species_pos'].apply(lambda L: len([x for x in L if 'PUNCT' in x]))
df_temp['total_konjunktion'] = df_temp['species_pos'].apply(lambda L: len([x for x in L if 'CCONJ' in x]))

df_features['nomen_quote'] = df_temp['total_nomen'] / df_temp['total_words']
df_features['adjektive_quote'] = df_temp['total_adjektive'] / df_temp['total_words']
df_features['numerisch_quote'] = df_temp['total_numerisch'] / df_temp['total_words']
df_features['satzzeichen_quote'] = df_temp['total_satzzeichen'] / df_temp['total_words']
df_features['konjunktion_quote'] = df_temp['total_konjunktion'] / df_temp['total_words']

In [148]:
# Tag Features

In [149]:
dict_tag_name = df_tag[['id', 'name']].set_index('id').to_dict()['name']

df_event_tags_short = df_event_tags[['event_id', 'tag_id']]
df_event_tags_pivot = df_event_tags_short.pivot(index="event_id", columns="tag_id", values="tag_id")
df_event_tags_pivot = df_event_tags_pivot.fillna(False)
df_event_tags_pivot[df_event_tags_pivot.columns] = df_event_tags_pivot[df_event_tags_pivot.columns].astype(bool)
df_event_tags_features = df_event_tags_pivot.rename(columns=dict_tag_name)
df_event_tags_features = df_event_tags_features.reset_index()

list_tag_features = df_event_tags_features.columns.values.tolist()[1:]

In [150]:
list_tag_features

['Schnitzen',
 'Backen',
 'Unsere Erde',
 'Pfa. Geschichte',
 'Unser Bund',
 '1. Hilfe',
 'Feuer machen',
 'Versprechen',
 'Karte Kompass',
 'Kim-Spiele',
 'Symbolik',
 'Knoten',
 'Küche',
 'Schwarzzelte',
 'Musisches',
 'Haik',
 'Baum',
 'Sternenkunde',
 'Handwerk',
 'Spiele',
 'Nachhaltigkeit',
 'Wasser',
 'Basteln',
 'Geschichten',
 'Unsere Sippe',
 'Pflanzen',
 'Forschen',
 'Bewegung',
 'Kreatives',
 'Sommer',
 'Herbst',
 'Frühling',
 'Winter',
 'Im Haus',
 'Garten',
 'Videokonferenz',
 'Alleine',
 'Wölflinge',
 'Pfadfinder',
 'Rover',
 'Lernen',
 'Mit Abstand',
 'Speziell zu Ostern',
 'Speziell im Advent',
 'Speziell zu Karneval',
 'Wald',
 'Ausflug',
 'Gesellschaftliches']

In [153]:
df_features_joined = pd.merge(df_features, df_event_tags_features, left_on=['id'], right_on= ['event_id'], how = 'left')



df_features_joined = df_features_joined.drop(columns=['description', 'imageLink', 'createdBy', 'species_pos', 'event_id'])

df_features_joined_num = df_features_joined.drop(columns=['title', 'created_at', 'text', 'total_words', 'total_nomen', 'total_adjektive', 'total_numerisch', 'total_satzzeichen', 'total_konjunktion', 'p_tag', 'img_tag', 'b_tag', 'li_tag', 'span_tag', 'a_tag', 'u_tag', 'costsRating', 'executionTimeRating', 'isPrepairationNeeded'])
df_features_num = df_features_joined_num.astype(float)

Unnamed: 0,id,title,description,costsRating,executionTimeRating,isPrepairationNeeded,imageLink,created_at,createdBy,p_tag,img_tag,b_tag,li_tag,span_tag,a_tag,u_tag,costsRating_quote,executionTimeRating_quote,isPrepairationNeeded_quote,header_img,text,count_question_mark,title_len,text_len,species_pos,total_words,total_nomen,total_adjektive,total_numerisch,total_satzzeichen,total_konjunktion,nomen_quote,adjektive_quote,numerisch_quote,satzzeichen_quote,konjunktion_quote
0,1,Brot backen auf dem Lagerfeuer,<p>Ihr bekommt die Aufgabe ein Brot ohne Backo...,1,3,1,https://api.xrdcx.de/media/images/44d3e80c-86a...,2021-03-07 20:00:27.660006,Robert,5,0,0,11,0,0,0,0.2,0.6,0.2,True,Ihr bekommt die Aufgabe ein Brot ohne Backofen...,0.0,0.75,0.395,"[PRON, VERB, DET, NOUN, DET, NOUN, ADP, NOUN, ...",132,31,3,0,12,7,0.234848,0.022727,0.0,0.090909,0.05303
1,2,Bist du Bündisch oder Scoutistisch?,<p>Ihr k&ouml;nnt verschiedene Situationen aus...,0,1,0,https://api.xrdcx.de/media/images/3a63be00-7f8...,2021-03-07 20:00:27.660006,Robert,1,0,0,6,0,0,0,0.0,0.2,0.0,True,Ihr könnt verschiedene Situationen aus dem Sta...,0.3,0.875,0.1975,"[PRON, VERB, ADJ, NOUN, ADP, DET, NOUN, VERB, ...",68,12,3,0,8,3,0.176471,0.044118,0.0,0.117647,0.044118
2,3,Behelfstrage aus Naturmaterialien bauen,<p>Ihr k&ouml;nnt in den Wald gehen und dort a...,0,3,0,,2021-03-07 20:00:27.660006,Robert,4,0,0,5,0,0,0,0.0,0.6,0.0,False,Ihr könnt in den Wald gehen und dort aus z.B: ...,0.05,0.975,0.164,"[PRON, VERB, ADP, DET, NOUN, VERB, CCONJ, ADV,...",57,12,3,0,5,2,0.210526,0.052632,0.0,0.087719,0.035088
3,4,Gepäckteile besprechen und wiegen,<p>Ihr kommt mit einem gepackten Rucksack zum ...,0,3,1,,2021-03-07 20:00:27.660006,Robert,3,0,0,6,0,0,0,0.0,0.6,0.2,False,Ihr kommt mit einem gepackten Rucksack zum Hei...,0.25,0.825,0.1475,"[PRON, VERB, ADP, DET, ADJ, NOUN, ADP, NOUN, P...",57,6,2,0,8,1,0.105263,0.035088,0.0,0.140351,0.017544
4,5,Sippen-Holzbrettchen erstellen,<p>Ihr k&ouml;nnt blanke Holzbrettchen g&uuml;...,2,3,1,,2021-03-07 20:00:27.660006,Robert,6,0,0,0,0,0,0,0.4,0.6,0.2,False,Ihr könnt blanke Holzbrettchen güstig kaufen u...,0.0,0.75,0.161,"[PRON, VERB, ADJ, NOUN, ADV, VERB, CCONJ, ADP,...",48,11,3,0,3,4,0.229167,0.0625,0.0,0.0625,0.083333
5,6,Blattbuch erstellen,<p>Ihr sollt ein Blattbuch erstellen.</p>\n<p>...,1,3,1,https://api.xrdcx.de/media/images/15f62406-86a...,2021-03-07 20:00:27.660006,Robert,6,0,0,0,0,0,0,0.2,0.6,0.2,True,Ihr sollt ein Blattbuch erstellen. Dafür sollt...,0.05,0.475,0.185,"[PRON, VERB, DET, NOUN, VERB, PUNCT, ADV, VERB...",60,12,3,0,8,4,0.2,0.05,0.0,0.133333,0.066667
6,8,Wildpflanzen in der Umgebung suchen,<p>Ihr sollt soviele unterschiedliche Wildpfla...,0,3,0,https://api.xrdcx.de/media/images/c17391ee-7f8...,2021-03-07 20:00:27.660006,Robert,2,0,0,12,0,0,0,0.0,0.6,0.0,True,Ihr sollt soviele unterschiedliche Wildpflanze...,0.0,0.875,0.178,"[PRON, VERB, DET, ADJ, NOUN, PUNCT, NOUN, PUNC...",48,13,4,0,10,2,0.270833,0.083333,0.0,0.208333,0.041667
9,11,Sonnenuhr bauen,<p>Ihr sollt eine Sonnenuhr bauen. Daf&uuml;r ...,1,2,1,,2021-03-07 20:00:27.660006,Robert,2,0,0,5,0,0,0,0.2,0.4,0.2,False,Ihr sollt eine Sonnenuhr bauen. Dafür steckt i...,0.05,0.375,0.1835,"[PRON, VERB, DET, NOUN, VERB, PUNCT, ADV, VERB...",62,15,2,0,7,3,0.241935,0.032258,0.0,0.112903,0.048387
10,12,Gewürze erschmecken,<p>Um ein guter <strong>Sippenkoch</strong> zu...,1,1,1,https://api.xrdcx.de/media/images/56cd3cae-8a8...,2021-03-07 20:00:27.660006,Inspi,7,0,0,26,0,0,0,0.2,0.2,0.2,True,"Um ein guter zu werden, musst du dein Essen ...",0.05,0.475,0.487,"[SCONJ, DET, ADJ, SPACE, PART, AUX, PUNCT, VER...",186,34,8,2,29,3,0.182796,0.043011,0.010753,0.155914,0.016129
12,33,Kritische Pfadfinderfragen (Rollenspiel),<p>Du kannst ein paar kritische allt&auml;glic...,0,1,0,,2021-03-07 20:00:27.660006,Robert,2,0,0,5,0,0,0,0.0,0.2,0.0,False,Du kannst ein paar kritische alltägliche Pfadf...,0.2,1.0,0.194,"[PRON, ADV, DET, DET, ADJ, ADJ, NOUN, VERB, CC...",66,11,2,0,16,1,0.166667,0.030303,0.0,0.242424,0.015152


In [152]:
df_features_joined.to_parquet('data/feature_vector.parquet', coerce_timestamps="us")

df_features_num.to_parquet('data/feature_vector_num.parquet', coerce_timestamps="us")