<a href="https://colab.research.google.com/github/pds2122/capstone-project-kabobe/blob/main/preprocessing_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains the same preprocessing steps as the "preprocessing" notebook. Here, only the methods are defined. There is no data import or execution, because this notebook has to be run before its methods can be imported.

# Install, import & download 

In [None]:
!pip install ndjson --quiet
!pip install beautifulsoup4 --upgrade --quiet
!pip install html2text --quiet
!pip install nltk --quiet
!pip install HanTa --quiet
!pip install langdetect --quiet

[K     |████████████████████████████████| 97 kB 4.4 MB/s 
[K     |████████████████████████████████| 1.5 MB 26.8 MB/s 
[K     |████████████████████████████████| 981 kB 25.1 MB/s 
[?25h  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [None]:
import re
import gzip
import json
import nltk
import ndjson
import requests
import pandas as pd
from pathlib import Path
from langdetect import detect
from bs4 import BeautifulSoup
from textblob import TextBlob
from google.colab import drive
from nltk.corpus import stopwords
from HanTa import HanoverTagger as ht
from urllib.parse import urlsplit, urlunsplit

In [None]:
# Download the NLTK corpora used by this notebook. 'stopwords' is read by
# clean_text() through stopwords.words('german').
# NOTE(review): 'wordnet' is not referenced directly by the cells visible here;
# presumably it is needed by a dependency such as TextBlob. Confirm before removing.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Methods for Preprocessing

In [None]:
def get_page_source_code(url, timeout=30):
    """Download the HTML of the start page of the website that `url` belongs to.

    Only the scheme and host of `url` are kept; path, query string and
    fragment are dropped, so the request always targets ``scheme://host/``.

    Args:
        url: Any URL on the target website.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests` can block forever on an unresponsive host.

    Returns:
        The response body decoded as text (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # route to starting page of website
    split_url = urlsplit(url)
    start_page_url = f"{split_url.scheme}://{split_url.netloc}/"

    # get html content
    response = requests.get(start_page_url, timeout=timeout)
    return response.text


def get_pure_text(soup):
    """Return the text content of `soup` after running it through clean_text()."""
    raw_text = soup.text
    cleaned = clean_text(raw_text)
    return cleaned


def get_lang_code(pure_text):
    """Detect the language of `pure_text` with langdetect.

    Args:
        pure_text: Text whose language should be detected.

    Returns:
        The language code returned by ``detect``; the string ``'NaN'`` when
        detection fails for any reason (e.g. empty text).
    """
    try:
        return detect(pure_text)
    except Exception:
        # Best effort: a page without detectable text must not abort the run.
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        return 'NaN'


def get_img_alt(soup):
    """Collect the `alt` texts of all <img> tags that carry an alt attribute.

    Returns:
        One string in which every alt text is preceded by a single space
        (so a non-empty result starts with a space); '' when no image matches.
    """
    images_with_alt = soup.findAll('img', alt=True)
    return ''.join(' ' + image['alt'] for image in images_with_alt)


def concatenate_columns(df):
  """Add a 'concatenated' column that joins every column of each row.

  For each row, missing values are dropped, the remaining values are cast to
  str and joined with single spaces. `df` is modified in place and returned.
  """
  def _join_row(row):
    present_values = row.dropna()
    return ' '.join(present_values.astype(str))

  all_columns = list(df.columns)
  df['concatenated'] = df[all_columns].apply(_join_row, axis=1)
  return df


def get_sentiment(pure_text):
  """Return the TextBlob sentiment polarity of `pure_text`, rounded to 2 decimals."""
  # NOTE(review): TextBlob is used with its default analyzer while the rest of
  # the pipeline targets German text; confirm the scores are meaningful.
  blob = TextBlob(pure_text)
  polarity = blob.sentiment.polarity
  return round(polarity, 2)

In [None]:
# Custom stop words. clean_text() appends these to NLTK's German stop word
# list before filtering tokens. They cover social-network names, a few German
# filler words and the single letters 'w', 'm', 'd'
# (presumably left over from "(m/w/d)" in job ads; confirm).
EXTENTION_STOPWORDS = [
    'facebook', 'w', 'm', 'd', 'instagram', 'youtube', 'xing', 'linkedin', 
    'twitter', 'snapchat', 'mehr', 'dafür', 'beim', 'davon', 'somit'
]

In [None]:
# Loaded HanTa taggers keyed by model file, so the model is read only once.
_TAGGER_CACHE = {}


def _get_tagger(model='morphmodel_ger.pgz'):
    """Return a cached HanoverTagger for `model`, loading it on first use."""
    if model not in _TAGGER_CACHE:
        _TAGGER_CACHE[model] = ht.HanoverTagger(model)
    return _TAGGER_CACHE[model]


def clean_text(text):
    """Normalize raw page text into a lower-cased string of German lemmas.

    Steps: lower-case, strip HTML remnants, punctuation and digits, drop
    German stop words (NLTK list plus EXTENTION_STOPWORDS), tokenize, and
    replace every token by its HanTa lemma.

    Args:
        text: The raw string to clean.

    Returns:
        The lemmas joined by single spaces, lower-cased.

    Raises:
        AttributeError: If `text` is not a string (it has no ``lower``).
    """
    # convert words to lower case
    content = text.lower()

    content = re.sub(r'&amp;', '', content)
    # Must run before the punctuation pass below: that pass replaces '<', '>'
    # and '/', after which '<br />' can no longer match and would leave 'br'.
    content = re.sub(r'<br />', ' ', content)
    content = re.sub(r'[_"\-;%()–|„”®+&=¤*%.™,“<>©!’€?:#$@\[\]/]', ' ', content)
    content = re.sub(r'\'', ' ', content)
    # Raw string: '\d' / '\s' in a plain literal are invalid escape sequences.
    content = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", content)
    content = re.sub(r'[0-9]', ' ', content)
    content = content.replace('{', '').replace('}', '')

    # remove stopwords (standard German list extended with the custom ones)
    stops = set(stopwords.words('german'))
    stops.update(EXTENTION_STOPWORDS)
    content = ' '.join(w for w in content.split() if w not in stops)

    # tokenize each word
    tokens = nltk.WordPunctTokenizer().tokenize(content)

    # lemmatize each token in German (reduce words to stem)
    tagger = _get_tagger()
    word_list = []
    for token in tokens:
        # Each token is tagged on its own, as a one-word sentence.
        lemmas = [lemma for (_word, lemma, _pos) in tagger.tag_sent(token.split())]
        word_list.append(' '.join(lemmas))
    return ' '.join(word_list).lower()

In [None]:
def get_features_from_url(url, feature_list):
    """Extract the requested features from the start page of a website.

    Every entry of `feature_list` is first looked up as an HTML tag name; the
    texts of all matching tags are joined with single spaces. Four names are
    derived features and are computed afterwards:

    - 'pure_text': cleaned text of the whole page (get_pure_text)
    - 'lang_code': detected language of the cleaned page text
    - 'img_alt': concatenated alt texts of the images
    - 'sentiment_analysis': sentiment polarity of the cleaned page text

    Args:
        url: Any URL on the target website.
        feature_list: Tag names and/or derived feature names to extract.

    Returns:
        dict mapping feature name to value. Tags that do not occur on the
        page produce no key.
    """
    page_source_code = get_page_source_code(url)
    # NOTE(review): no parser is named here, so the parser is whatever bs4
    # selects in the current environment; consider pinning one.
    soup = BeautifulSoup(page_source_code)

    feature_dict = {}
    for feature in feature_list:
        retrieved_features = soup.find_all(feature)
        if retrieved_features:
            feature_dict[feature] = ' '.join(tag.text for tag in retrieved_features)

    # 'lang_code' and 'sentiment_analysis' are computed from the cleaned page
    # text, so compute it whenever one of them is requested, even if
    # 'pure_text' itself is not (previously this raised KeyError).
    text_based = ('pure_text', 'lang_code', 'sentiment_analysis')
    pure_text = None
    if any(name in feature_list for name in text_based):
        pure_text = get_pure_text(soup)

    if 'pure_text' in feature_list:
        feature_dict['pure_text'] = pure_text
    if 'lang_code' in feature_list:
        feature_dict['lang_code'] = get_lang_code(pure_text)
    if 'img_alt' in feature_list:
        feature_dict['img_alt'] = get_img_alt(soup)
    if 'sentiment_analysis' in feature_list:
        feature_dict['sentiment_analysis'] = get_sentiment(pure_text)
    return feature_dict

In [None]:
def execute_preprocessing(url, feature_list):
  """Fetch a website's features and return them as a cleaned one-row DataFrame.

  All extracted columns except 'sentiment_analysis' are passed through
  clean_text(); afterwards a 'concatenated' column is added by
  concatenate_columns().

  Args:
    url: Any URL on the target website.
    feature_list: Feature names to extract (see get_features_from_url).

  Returns:
    DataFrame with a single row (index 0): one column per extracted feature
    plus 'concatenated'.
  """
  no_pre_cols = ['sentiment_analysis']
  df = pd.DataFrame(get_features_from_url(url, feature_list), index=[0])
  # clean features
  for column in df.columns:
    if column in no_pre_cols:
      continue
    for row_label in df.index:
      value = df.at[row_label, column]
      try:
        # Single-step .at assignment: the chained form df[column][i] = ... may
        # write to a temporary copy and leave df unchanged.
        df.at[row_label, column] = clean_text(value)
      except Exception:
        # Best effort: keep the uncleaned value and show what failed.
        print(value)
  return concatenate_columns(df)