<a href="https://colab.research.google.com/github/pds2122/capstone-project-kabobe/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install, import & download

In [1]:
!pip install ndjson --quiet
!pip install beautifulsoup4 --quiet
!pip install html2text --quiet
!pip install nltk --quiet
!pip install HanTa --quiet
!pip install langdetect --quiet

[K     |████████████████████████████████| 1.5 MB 18.3 MB/s 
[K     |████████████████████████████████| 981 kB 25.5 MB/s 
[?25h  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [6]:
import re
import gzip
import json
import nltk
import ndjson
import requests
import pandas as pd
from pathlib import Path
from langdetect import detect
from bs4 import BeautifulSoup
from textblob import TextBlob
from google.colab import drive
from nltk.corpus import stopwords
from collections import OrderedDict, Counter
from HanTa import HanoverTagger as ht
from urllib.parse import urlsplit, urlunsplit

In [None]:
# Download the NLTK stopword corpus; clean_text reads the German list from it.
# NOTE(review): none of the visible cells uses WordNet (lemmatization is done
# with HanTa), so the second download may be unnecessary - confirm before removing.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Load Datasets

In [3]:
# Mount Google Drive at /gdrive so the dataset files below can be read
drive.mount('/gdrive')
# Define file paths: data_path is the folder, the two names are the gzipped
# NDJSON files that the loading cells open relative to it
data_path = Path('/gdrive/MyDrive/industry_data/')
test_file = 'test_small.ndjson.gz'
train_file = 'train_small.ndjson.gz'

Mounted at /gdrive


In [None]:
# Read the gzipped training file; every non-empty line is one JSON document.
# Blank lines are skipped because json.loads would raise on them.
with gzip.open(data_path / train_file, "rt", encoding='UTF-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Each parsed line is a list of records, so `data` is a nested list.
# Flatten it to get one record (dict) per DataFrame row.
flat_list = [item for sublist in data for item in sublist]
df_train = pd.DataFrame(flat_list)

In [4]:
# Read the gzipped test file; every non-empty line is one JSON document.
with gzip.open(data_path / test_file, "rt", encoding='UTF-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# A parsed line is either a single record (dict) or, as in the training file,
# a list of records. Flatten the list case so that df_test always has one
# column per record key instead of a single column holding whole dicts.
test_records = []
for parsed in data:
    if isinstance(parsed, list):
        test_records.extend(parsed)
    else:
        test_records.append(parsed)
df_test = pd.DataFrame(test_records)

In [None]:
# Reload helpers: read a dataset that was written to disk at an intermediate
# stage, so preprocessing can be resumed without starting from the raw files.
def _frame_from_ndjson_lines(lines):
    """Parse an iterable of NDJSON text lines into a DataFrame.

    Every line is handed to ``ndjson.loads``; the objects found in all lines
    are collected into one flat list and become the rows of the result.
    """
    records = []
    for line in lines:
        # ndjson.loads yields a list per call, so extend() flattens as we go
        records.extend(ndjson.loads(line.strip()))
    return pd.DataFrame(records)


def data_reloader_from_zip(file_name):
    """Load a gzip-compressed NDJSON file (UTF-8) into a DataFrame.

    Args:
        file_name: path of the ``.ndjson.gz`` file.

    Returns:
        pandas.DataFrame with one row per record.
    """
    with gzip.open(file_name, 'rt', encoding='UTF-8') as file:
        return _frame_from_ndjson_lines(file)


def data_reloader_from_ndjson(file_name):
    """Load an uncompressed NDJSON file (UTF-8) into a DataFrame.

    Args:
        file_name: path of the ``.ndjson`` file.

    Returns:
        pandas.DataFrame with one row per record.
    """
    with open(file_name, 'rt', encoding='UTF-8') as file:
        return _frame_from_ndjson_lines(file)

## Save Datasets

In [None]:
# save to ndjson (either regular or gzip)

def _write_ndjson_records(df, handle):
    """Write every row of ``df`` to the open text ``handle``, one JSON object per line."""
    # to_dict(orient='records') walks the rows by position, so this also works
    # when the index is not 0..n-1 (e.g. after filtering rows); the previous
    # df.loc[i] lookup raised KeyError in that case.
    for record in df.to_dict(orient='records'):
        # ndjson.dumps expects an iterable of objects, hence the one-element list
        handle.write('{}\n'.format(ndjson.dumps([record])))


def datasaver_to_zip(df, name):
    """Save ``df`` as gzip-compressed NDJSON.

    Args:
        df: DataFrame to save; each row becomes one line of the form
            {'col1': value, 'col2': value, ...}.
        name: target path without extension; '.ndjson.gz' is appended.
    """
    filename_zip = str(name) + '.ndjson.gz'

    with gzip.open(filename_zip, 'wt', encoding='UTF-8') as z:
        _write_ndjson_records(df, z)


def datasaver_to_ndjson(df, name):
    """Save ``df`` as plain NDJSON.

    Args:
        df: DataFrame to save; each row becomes one line of the form
            {'col1': value, 'col2': value, ...}.
        name: target path without extension; '.ndjson' is appended.
    """
    filename = str(name) + '.ndjson'

    # One JSON document per line; see
    # https://stackoverflow.com/questions/21058935/python-json-loads-shows-valueerror-extra-data
    # The encoding is explicit so the file matches what the reload helpers expect.
    with open(filename, mode='w', encoding='UTF-8') as f:
        _write_ndjson_records(df, f)

## Analyse Dataset

Look at the occurrences of HTML tags

In [None]:
all_tags = []

# Only a sample of the documents is inspected to keep this cell fast; raise
# SAMPLE_SIZE (or set it to len(data)) to scan more of them.
SAMPLE_SIZE = 1000

# min() guards against datasets that hold fewer than SAMPLE_SIZE documents
for i in range(min(SAMPLE_SIZE, len(data))):
    # parse each document once
    soup = BeautifulSoup(data[i][0]['html'], 'html.parser')
    # The set counts every tag at most once per document; collect the names in
    # a list instead to get the total number of occurrences.
    tags = {tag.name for tag in soup.find_all()}
    all_tags.extend(tags)

In [None]:
# Count how often each tag name was collected in all_tags
counted = Counter(all_tags)
# most_common() sorts by descending count; the OrderedDict keeps that order
# and, being the last expression, is displayed as the cell output
OrderedDict(counted.most_common())

# Methods for Preprocessing

In [None]:
def get_pure_text(soup):
    """Return the cleaned text of a parsed document.

    Reads ``soup.text`` and passes it through ``clean_text``.
    """
    raw_text = soup.text
    return clean_text(raw_text)


def get_lang_code(pure_text):
    """Detect the language of ``pure_text``.

    Args:
        pure_text: text handed to ``detect``.

    Returns:
        Whatever ``detect`` returns, or the string 'NaN' when detection fails
        for any reason.
    """
    try:
        return detect(pure_text)
    except Exception:
        # Best effort: a failed detection must not abort the whole run.
        # Exception (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long run can still be stopped.
        return 'NaN'


def get_img_alt(soup):
    """Collect the alt texts of all <img> elements that carry an alt attribute.

    Every alt text is prefixed with a single space and the pieces are
    concatenated, so a non-empty result starts with a space. Returns '' when
    no matching image exists.
    """
    images = soup.findAll('img', alt = True)
    pieces = [' ' + image['alt'] for image in images]
    return ''.join(pieces)


def concatenate_columns(df):
    """Add a 'concatenated' column that joins the feature columns of each row.

    All columns from position 3 onwards are treated as feature columns. Their
    non-missing values are converted to strings and joined with single spaces.
    An existing 'concatenated' column is excluded from the inputs, so calling
    the function again (e.g. when the cell is re-run) gives the same result
    instead of appending the previous result to itself.

    Args:
        df: DataFrame to extend; it is modified in place.

    Returns:
        The same DataFrame object, now containing 'concatenated'.
    """
    source_columns = [
        column for column in df.columns[3:] if column != 'concatenated'
    ]
    df['concatenated'] = df[source_columns].apply(
        lambda row: ' '.join(row.dropna().astype(str)),
        axis=1
    )
    return df


def get_sentiment(pure_text):
    """Return the TextBlob sentiment polarity of ``pure_text``, rounded to two decimals.

    NOTE(review): TextBlob is used with its default settings here, while the
    rest of the pipeline targets German text - confirm the scores are meaningful.
    """
    polarity = TextBlob(pure_text).sentiment.polarity
    return round(polarity, 2)

During preprocessing we noticed several frequent words that the standard stopword list does not remove.
<br /> We therefore extended the list with our own stopwords:

In [None]:
# Custom stopwords that clean_text adds to the standard German list.
EXTENTION_STOPWORDS = [
    'facebook',
    # single letters
    'w',
    'm',
    'd',
    # platform names
    'instagram',
    'youtube',
    'xing',
    'linkedin',
    'twitter',
    'snapchat',
    # German filler words
    'mehr',
    'dafür',
    'beim',
    'davon',
    'somit',
]

In [None]:
# Resources that are expensive to build and identical for every call.
# They are created on first use and then reused by clean_text.
_CLEAN_TEXT_CACHE = {}


def _get_german_stopwords():
    """Return the NLTK German stopwords as a set (read from disk only once)."""
    if 'stopwords' not in _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stopwords'] = set(stopwords.words('german'))
    return _CLEAN_TEXT_CACHE['stopwords']


def _get_tagger():
    """Return the HanTa tagger for German (model loaded only once)."""
    if 'tagger' not in _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['tagger'] = ht.HanoverTagger('morphmodel_ger.pgz')
    return _CLEAN_TEXT_CACHE['tagger']


def clean_text(text):
    """Normalize a text: lower-case, strip symbols/digits/stopwords, lemmatize.

    Args:
        text: raw text. Values that are not strings (None, float NaN, ...) and
            the literal string 'nan' are treated as missing.

    Returns:
        The lower-cased, space-separated lemmas, or None for missing input.
    """
    if not isinstance(text, str) or text == 'nan':
        return None

    # https://medium.com/analytics-vidhya/applying-text-classification-using-logistic-regression-a-comparison-between-bow-and-tf-idf-1f1ed1b83640
    # convert words to lower case
    content = text.lower()

    content = re.sub(r'&amp;', '', content)
    # '<br />' has to be handled before the punctuation step: that step turns
    # '/' into a space, after which this pattern could never match.
    content = re.sub(r'<br />', ' ', content)
    content = re.sub(r'[_"\-;%()–|„”®+&=¤*%.™,“!’€?:#$@\[\]/]', ' ', content)
    content = re.sub(r'\'', ' ', content)
    # stand-alone numbers (raw string; a stray '{}' used to disable the last alternative)
    content = re.sub(r'^\d+\s|\s\d+\s|\s\d+$', ' ', content)
    # any remaining digit, e.g. inside words
    content = re.sub(r'[0-9]', ' ', content)
    content = content.replace('{', '').replace('}', '')

    # remove stopwords: standard German list extended with the custom ones.
    # The union is rebuilt per call so edits to EXTENTION_STOPWORDS take effect.
    stops = _get_german_stopwords() | set(EXTENTION_STOPWORDS)
    kept_words = [w for w in content.split() if w not in stops]

    # tokenize each word
    tokens = nltk.WordPunctTokenizer().tokenize(' '.join(kept_words))

    # lemmatize each token in German; every token is tagged on its own
    tagger = _get_tagger()
    word_list = []
    for token in tokens:
        lemmas = [lemma for (_word, lemma, _pos) in tagger.tag_sent([token])]
        word_list.append(' '.join(lemmas))
    return ' '.join(word_list).lower()

In [None]:
def get_features_from_html(df, feature_list):
    """Extract the requested features from the 'html' column into new columns.

    Feature names fall into two groups:
      * 'pure_text', 'lang_code', 'img_alt' and 'sentiment_analysis' are
        computed by get_pure_text, get_lang_code, get_img_alt and get_sentiment;
      * every other name is treated as an HTML tag name, and the texts of all
        matching elements are joined with single spaces. Rows without a
        matching element keep a missing value.

    Args:
        df: DataFrame with an 'html' column.
        feature_list: names of the features/columns to create.

    Returns:
        A new DataFrame holding the original columns plus one column per
        feature; the DataFrame passed in is not modified.
    """
    derived_features = ('pure_text', 'lang_code', 'img_alt', 'sentiment_analysis')
    # Derived names are not HTML tags, so they are not searched for in the markup
    tag_features = [f for f in feature_list if f not in derived_features]

    # Add the features as new, empty object columns (object dtype so that
    # strings can be stored without a dtype change on assignment)
    new_columns = [f for f in dict.fromkeys(feature_list) if f not in df.columns]
    df = df.reindex(columns=[*df.columns, *new_columns])
    if new_columns:
        df = df.astype({column: object for column in new_columns})

    # The cleaned text is also the input of language detection and sentiment,
    # so it is computed whenever one of the three features is requested
    needs_pure_text = any(
        f in feature_list for f in ('pure_text', 'lang_code', 'sentiment_analysis')
    )

    for row in df.index:
        # The parser is named explicitly: without it BeautifulSoup picks
        # whichever parser is installed, so results could differ per machine.
        soup = BeautifulSoup(df.at[row, 'html'], 'html.parser')

        for feature in tag_features:
            elements = soup.find_all(feature)
            if elements:
                # .at writes into the frame itself; chained df[col][row] = ...
                # may write to a temporary copy and silently lose the value
                df.at[row, feature] = ' '.join(element.text for element in elements)

        pure_text = get_pure_text(soup) if needs_pure_text else None
        if 'pure_text' in feature_list:
            df.at[row, 'pure_text'] = pure_text
        if 'lang_code' in feature_list:
            df.at[row, 'lang_code'] = get_lang_code(pure_text)
        if 'img_alt' in feature_list:
            df.at[row, 'img_alt'] = get_img_alt(soup)
        if 'sentiment_analysis' in feature_list:
            df.at[row, 'sentiment_analysis'] = get_sentiment(pure_text)
    return df

In [None]:
# union of all processes so that only one function can be called
def execute_preprocessing(df, feature_list):
    """Run the whole preprocessing: extract features, clean them, concatenate.

    Args:
        df: DataFrame with the raw documents (needs an 'html' column).
        feature_list: features to extract, see get_features_from_html.

    Returns:
        DataFrame with the extracted and cleaned feature columns plus the
        'concatenated' column added by concatenate_columns.
    """
    # Columns that must not go through clean_text:
    #  - url/html/industry/industry_label/sentiment_analysis are not feature text
    #  - lang_code is a language code; clean_text would drop codes that are
    #    also German stopwords (e.g. 'es', 'da')
    #  - pure_text is already cleaned by get_pure_text
    no_pre_cols = ['url', 'html', 'industry', 'industry_label',
                   'sentiment_analysis', 'lang_code', 'pure_text']
    df = get_features_from_html(df, feature_list)

    # clean features
    for column in df.columns:
        if column in no_pre_cols:
            continue
        for row in df.index:
            value = df.at[row, column]
            # Missing values (feature not found in the document) stay untouched
            if not isinstance(value, str):
                continue
            try:
                df.at[row, column] = clean_text(value)
            except Exception as err:
                # Keep going, but report where cleaning failed and why
                print('clean_text failed for column {!r}, row {!r}: {!r}'.format(column, row, err))
    df = concatenate_columns(df)
    return df

# Execution

In [None]:
# Features to extract: HTML tag names plus the derived features
# pure_text, img_alt, lang_code and sentiment_analysis.
# NOTE(review): 'bold' is searched as a tag name, but bold text in HTML is
# normally marked up with <b> - confirm this column is ever filled.
feature_list = ['title', 'h1', 'h2', 'h3', 'figcaption', 'pure_text', 'img_alt', 'strong', 'bold', 'lang_code', 'sentiment_analysis']
df_preprocessed = execute_preprocessing(df_train, feature_list)
# Show the first rows of the result as the cell output
df_preprocessed.head()

# Archive

### We originally prepared the train and test data with the following code. We later rewrote it in a cleaner form; see the sections above.

Get the whole text between the tags

In [None]:
def parse_to_text(html):
    """Return the visible text of an HTML document, one text chunk per line.

    <script> and <style> elements are removed first. The remaining text is
    split into lines, every line is split again at double spaces, all pieces
    are stripped and the non-empty ones are joined with newlines.
    """
    soup = BeautifulSoup(html, features="html.parser")

    # remove script and style elements from the tree
    for element in soup(["script", "style"]):
        element.extract()

    raw_text = soup.get_text()

    chunks = []
    for raw_line in raw_text.splitlines():
        stripped_line = raw_line.strip()
        # a double space separates several headlines placed on one line
        for phrase in stripped_line.split("  "):
            piece = phrase.strip()
            if piece:
                chunks.append(piece)

    return '\n'.join(chunks)

In [None]:
# Strip the HTML markup from every document and keep only the text.
# Timing noted by the authors: about 21 minutes for the train dataset and
# about 7 minutes for the test dataset.
#
# The column is assigned as a whole: writing single cells through
# df.html_to_text[line] = ... is chained assignment and may end up in a
# temporary copy instead of the DataFrame.

# on train dataset
df_train['html_to_text'] = df_train['html'].map(parse_to_text)

# on test dataset
df_test['html_to_text'] = df_test['html'].map(parse_to_text)

**The `clean_text` method is the same as in the section above!**

Specific tags and features

In [None]:
def getHTML(url):
    """Parse markup with BeautifulSoup's 'html.parser'.

    Note: despite its name, ``url`` is parsed directly as markup; nothing is
    downloaded here.
    """
    return BeautifulSoup(url, 'html.parser')


def _get_tag_texts(url, tag_name):
    """Return the stripped, non-empty texts of all ``tag_name`` elements."""
    soup = getHTML(url)
    texts = (element.text.strip() for element in soup.find_all(tag_name))
    return [text for text in texts if text]


## Img-Description from IMG-Tag
def getImgDescriptionHTMLtag(url):
    """Return the non-empty alt texts of all <img> elements with an alt attribute."""
    soup = getHTML(url)
    return [image['alt'] for image in soup.find_all('img', alt=True) if image['alt']]


## Title
def getTitleHTMLtag(url):
    """Return the text of the <title> element, or "" when there is none.

    get_text() is used instead of str(title.string): ``.string`` is None for
    an empty title or one with nested elements, which used to produce the
    literal string 'None'.
    """
    title = getHTML(url).title
    if title is None:
        return ""
    return title.get_text()


## h1
def getH1HTMLtag(url):
    """Return the stripped, non-empty texts of all <h1> elements."""
    return _get_tag_texts(url, 'h1')


## h2
def getH2HTMLtag(url):
    """Return the stripped, non-empty texts of all <h2> elements."""
    return _get_tag_texts(url, 'h2')


## h3
def getH3HTMLtag(url):
    """Return the stripped, non-empty texts of all <h3> elements."""
    return _get_tag_texts(url, 'h3')


## strong - questionable
def getStrongHTMLtag(url):
    """Return the stripped, non-empty texts of all <strong> elements."""
    return _get_tag_texts(url, 'strong')


## bold
def getBoldHTMLtag(url):
    """Return the stripped, non-empty texts of all <bold> elements.

    NOTE(review): bold text in HTML is normally marked up with <b>, not
    <bold> - confirm this tag name is intended.
    """
    return _get_tag_texts(url, 'bold')


## language code
def getLangHTMLtag(url):
    """Detect the language of the <body> text; return "NaN" when that fails."""
    try:
        soup = getHTML(url)
        body_text = soup.body.get_text()
        return detect(body_text)
    except Exception:
        # Best effort (missing <body>, undetectable text, ...). Exception
        # instead of a bare except keeps KeyboardInterrupt working.
        return "NaN"


## figcaption
def getFigCaptionHTMLtag(url):
    """Return the stripped, non-empty texts of all <figcaption> elements."""
    return _get_tag_texts(url, 'figcaption')

In [None]:
# fill extra features

def retrieve_features(df):
    """Fill the extra feature columns of ``df`` from its 'html' column.

    The target columns are addressed by position (4 to 12); each position is
    paired with the extractor that produces its values. ``df`` is modified in
    place and nothing is returned.
    """
    if len(df) == 0:
        return

    # column position -> extractor applied to every document
    extractors = {
        4: getImgDescriptionHTMLtag,
        5: getTitleHTMLtag,
        6: getH1HTMLtag,
        7: getH2HTMLtag,
        8: getH3HTMLtag,
        9: getStrongHTMLtag,
        10: getBoldHTMLtag,
        11: getLangHTMLtag,
        12: getFigCaptionHTMLtag,
    }

    for position, extractor in extractors.items():
        values = [extractor(html) for html in df['html']]
        # Assign the whole column at once: df.iloc[:, i][j] = ... is chained
        # assignment and may write into a temporary copy. dtype=object keeps
        # the list results as single cells.
        df[df.columns[position]] = pd.Series(values, index=df.index, dtype=object)

In [None]:
# convert list elements to string, in order to clean text in each column
def convert_features_toString(df):
    """Turn list-valued feature cells into space-joined strings, in place.

    Only lists and tuples are joined. Cells that already hold a string (the
    title extractor returns one) are left unchanged, because ' '.join() on a
    string would insert a space between every character. Missing values are
    left unchanged as well.
    """
    def _join_if_list(value):
        if isinstance(value, (list, tuple)):
            return ' '.join(value)
        return value

    columns = ['img_alt', 'title', 'h1', 'h2', 'h3', 'strong', 'bold', 'figcaption']
    for column in columns:
        # whole-column assignment instead of chained df.col[x] = ...
        df[column] = df[column].map(_join_if_list)

In [None]:
def convert_features_toString_old(df):
    """Older variant of the list-to-string conversion, kept for reference.

    Works column by column (img_alt, title, h1, h2, h3, strong, bold,
    figcaption) and modifies ``df`` in place. List or tuple cells become
    space-joined strings; strings and missing values stay as they are, since
    joining a string would put a space between every character.
    """
    for column in ('img_alt', 'title', 'h1', 'h2', 'h3', 'strong', 'bold', 'figcaption'):
        converted = [
            ' '.join(value) if isinstance(value, (list, tuple)) else value
            for value in df[column]
        ]
        # whole-column assignment instead of chained df.col[x] = ...
        df[column] = pd.Series(converted, index=df.index, dtype=object)

## TODO: Which variant should we use here? For some reason this only works when I run it outside the function - is that only the case for me, and why?

In [None]:
# Send all extra features through clean_text (the language feature is left out)
def clean_dataframe(df):
    """Apply clean_text to every cell of the extra feature columns, in place.

    Each column is replaced as a whole. The earlier cell-by-cell form
    df[x][y] = ... is chained assignment: pandas may write into a temporary
    copy, so the change can get lost depending on how ``df`` was created.
    """
    columns = ['img_alt','title','h1','h2','h3','strong','bold','figcaption']
    for column in columns:
        df[column] = df[column].map(clean_text)