In [48]:
import pandas as pd
import nltk
import json

# download nltk corpus (first time only)
# nltk.download("all")
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
# Download the NLTK resources needed for German preprocessing (stop words and tokenizer models; first time only)
# nltk.download('stopwords')
# nltk.download('punkt')
# Shared sentiment analyzer instance; get_sentiment() reads this module-level name.
# NOTE(review): VADER's lexicon is, as far as I know, English-only, while this
# notebook feeds it German, stemmed text — confirm the scores are meaningful.
analyzer = SentimentIntensityAnalyzer()

## Helper functions

In [43]:
# The text columns of each page are merged together into a single "text" column here
def merge_columns_in_dataframe(df):
    """Collapse a page's text columns into one space-separated ``text`` column.

    Args:
        df: DataFrame containing ``url``, ``page_title`` and the text columns
            ``sub_title``, ``introduction``, ``summary_box``, ``content`` and
            ``accordion``.

    Returns:
        A new DataFrame with the columns ``url``, ``page_title`` and ``text``,
        indexed like ``df``. ``df`` itself is not modified.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    columns_to_merge = ['page_title', 'sub_title', 'introduction', 'summary_box', 'content', 'accordion']

    def _is_missing(value):
        # Scalar-only test: pd.isna() on a list-like cell would return an array.
        return pd.api.types.is_scalar(value) and pd.isna(value)

    new_df = df.loc[:, ["url", "page_title"]].copy()
    # Missing cells (None/NaN) are skipped so they do not end up in the text as
    # the literal words "None"/"nan". Iterating over row tuples instead of using
    # a row-wise DataFrame.apply also keeps an empty frame working.
    merged_text = [
        ' '.join(str(value) for value in row if not _is_missing(value))
        for row in df[columns_to_merge].itertuples(index=False, name=None)
    ]
    new_df['text'] = pd.Series(merged_text, index=df.index, dtype=object)
    return new_df

In [31]:
def preprocess_text(text):
    """Lower-case, tokenize, stop-word-filter and stem a German text.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The stemmed tokens that are not German stop words, joined by single
        spaces. An input without any remaining tokens yields ``""``.
    """
    # Use the German Punkt model so tokenization matches the German stop-word
    # list and stemmer used below (the default model is the English one).
    tokens = word_tokenize(text.lower(), language="german")
    # Read the stop-word list once per call and keep it in a set; looking it up
    # again for every token repeats the corpus read and a linear list scan.
    german_stopwords = set(stopwords.words("german"))
    filtered_tokens = [token for token in tokens if token not in german_stopwords]
    stemmer = SnowballStemmer("german")  # possibly switch to a different stemmer
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return " ".join(stemmed_tokens)

In [46]:

def get_sentiment(text_string):
    """Score ``text_string`` with the module-level ``analyzer``.

    Args:
        text_string: The text handed to ``analyzer.polarity_scores``.

    Returns:
        A ``pd.Series`` built from the mapping that ``polarity_scores`` returns,
        so each score name becomes an index label.
    """
    return pd.Series(analyzer.polarity_scores(text_string))

def add_sentiment_cols(dataframe, text_col="text"):
    """Append sentiment score columns and a ``judgement`` label to a DataFrame.

    Args:
        dataframe: DataFrame holding the texts to score.
        text_col: Name of the column whose values are passed to
            ``get_sentiment``.

    Returns:
        A new DataFrame: the input columns, followed by the score columns
        returned by ``get_sentiment`` and a ``judgement`` column that is
        ``'positive'`` for ``compound > 0.05``, ``"negative"`` for
        ``compound < -0.05`` and ``'neutral'`` otherwise.

    Raises:
        KeyError: If ``text_col`` is not a column of ``dataframe``.
    """
    texts = dataframe[text_col]
    if dataframe.empty:
        # Series.apply on an empty column produces no score columns, which made
        # the 'compound' lookup below fail; add the four score columns
        # (neg, neu, pos, compound) explicitly instead.
        sentiment_cols = pd.DataFrame(
            columns=["neg", "neu", "pos", "compound"], index=dataframe.index, dtype=float
        )
    else:
        sentiment_cols = texts.apply(get_sentiment)
    dataframe = pd.concat([dataframe, sentiment_cols], axis=1)

    def _judge(compound):
        # Values exactly on a cut-off (+/-0.05) are labelled neutral.
        if compound > 0.05:
            return 'positive'
        if compound < -0.05:
            return "negative"
        return 'neutral'

    dataframe['judgement'] = dataframe['compound'].apply(_judge)
    return dataframe

In [53]:
def get_analysis_data(json_file):
    """Load scraped pages from JSON and return them with sentiment columns.

    The pipeline is: read the JSON, drop pages without any text content, merge
    the text columns, preprocess the merged text, then add the sentiment
    columns.

    Args:
        json_file: Path or file-like object accepted by ``pd.read_json``.

    Returns:
        The DataFrame produced by ``add_sentiment_cols`` for the remaining
        pages.
    """
    pandas_df = pd.read_json(json_file)
    # Every column except 'url' and 'page_title' counts as page content.
    content_columns = pandas_df.loc[:, ~pandas_df.columns.isin(['url', 'page_title'])]
    # Exclude rows from the analysis whose content columns are all empty. A
    # missing value (null in the JSON) is treated like "" here; comparing
    # against "" alone would let such rows through, because NaN != "" is True.
    has_content = (content_columns.notna() & (content_columns != "")).any(axis=1)
    filtered_df = pandas_df.loc[has_content]
    data = merge_columns_in_dataframe(filtered_df)
    data["text"] = data["text"].apply(preprocess_text)
    return add_sentiment_cols(data)

## Execution of Code

In [54]:
# Run the whole pipeline on the JSON export and render the resulting DataFrame.
display(get_analysis_data("ratgeber_pages_part2.json"))


Unnamed: 0,url,page_title,text,neg,neu,pos,compound,judgement
1,https://www.mobiliar.ch/versicherungen-und-vor...,Selbstunfall mit dem Auto – was nun?,selbstunfall auto – ? schuld ! sekund aufgepas...,0.025,0.975,0.0,-0.5627,negative
2,https://www.mobiliar.ch/versicherungen-und-vor...,Invalidität oder Todesfall: So sorgen Sie vor,"invaliditat todesfall : sorg optimal vorsorg ,...",0.0,0.963,0.037,0.782,positive
3,https://www.mobiliar.ch/versicherungen-und-vor...,Schäden an Ihrer Mietwohnung,schad mietwohn schad haft mieterin miet ? freu...,0.039,0.961,0.0,-0.9749,negative
