<a href="https://colab.research.google.com/github/princessivy/course/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs & Imports

In [None]:
!pip install ndjson --quiet
!pip install beautifulsoup4 --quiet
!pip install html2text --quiet
!pip install nltk --quiet
!pip install HanTa --quiet
!pip install langdetect --quiet

In [None]:
import ndjson
import requests
import pandas as pd
from bs4 import BeautifulSoup
import gzip
from pathlib import Path
# Uncomment the following line if working in Google Colab
# from google.colab import drive
from collections import Counter, OrderedDict
import html2text
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from HanTa import HanoverTagger as ht
from langdetect import detect
import gc
gc.enable()

#for sentiment analysis
from textblob import TextBlob


# Dataloader 
### Test-Dataset

In [None]:
# Mount Google Drive
# NOTE(review): `drive` only exists if the commented-out
# `from google.colab import drive` import has been enabled.
drive.mount('/gdrive')
data_path = Path('/gdrive/MyDrive/industry_data/')
file_name = 'test_small.ndjson.gz'

# Open the gzip archive in text mode and parse the whole NDJSON stream at once.
with gzip.open(data_path/file_name, "rt", encoding='UTF-8') as file:
    data = ndjson.load(file)
# One DataFrame row per parsed record.
df_test = pd.DataFrame(data)

Mounted at /gdrive


In [None]:
   
    # check for null entries
# NOTE(review): this cell repeats the null check of the following cell.
if not df_test.isnull().any(axis=None):
    print('No null entries found')
else:
    # show at most three of the affected rows
    print('\nPreview of data with null values:\nxxxxxxxxxxxxx')
    print(df_test.loc[df_test.isnull().any(axis=1)].head(3))
    # optional visual check: missingno.matrix(df_test); plt.show()

In [None]:
# Report rows of the test set that contain missing values.
if not df_test.isnull().any(axis=None):
    print('No null entries found')
else:
    # show at most three of the affected rows
    print('\nPreview of data with null values:\nxxxxxxxxxxxxx')
    print(df_test.loc[df_test.isnull().any(axis=1)].head(3))
    # optional visual check: missingno.matrix(df_test); plt.show()

No null entries found


In [None]:
# Count fully identical rows in the test set and preview them side by side.
if not df_test.duplicated().any():
    print('No duplicated entries found')
else:
    print('Number of duplicated entries: ', int(df_test.duplicated().sum()))
    # keep=False marks every member of a duplicate group; sorting puts the copies next to each other
    every_copy = df_test.loc[df_test.duplicated(keep=False)]
    print(every_copy.sort_values(by=list(df_test.columns)).head())

No duplicated entries found


### Train-Dataset

In [None]:
# NOTE(review): `drive` only exists if the commented-out
# `from google.colab import drive` import has been enabled.
drive.mount('/gdrive')
data_path = Path('/gdrive/MyDrive/industry_data/')
file_name = 'train_small.ndjson.gz'
# Decode the archive line by line; each line yields its own list of records,
# so `data` ends up as a list of lists.
with gzip.open(data_path/file_name, "rt", encoding='UTF-8') as file:
    # this empty list is overwritten by the comprehension on the next line
    data = []
    data = [ndjson.loads(line) for line in file]

In [None]:
# Unpack the nested list into a flat list so the records can be loaded into a DataFrame
flat_list = [item for sublist in data for item in sublist]
df_train = pd.DataFrame(flat_list)

In [None]:
# Report rows of the training set that contain missing values.
if not df_train.isnull().any(axis=None):
    print('No null entries found')
else:
    # show at most three of the affected rows
    print('\nPreview of data with null values:\nxxxxxxxxxxxxx')
    print(df_train.loc[df_train.isnull().any(axis=1)].head(3))
    # optional visual check: missingno.matrix(df_train); plt.show()

No null entries found


In [None]:
# Count fully identical rows in the training set and preview them side by side.
if not df_train.duplicated().any():
    print("No duplicated entries found")
else:
    print('Number of duplicated entries: ', int(df_train.duplicated().sum()))
    # keep=False marks every member of a duplicate group; sorting puts the copies next to each other
    every_copy = df_train.loc[df_train.duplicated(keep=False)]
    print(every_copy.sort_values(by=list(df_train.columns)).head())

# "Datasaver" & "data-reloader"
### To ndjson and ndjson.gz

These helpers save the preprocessed data. The data is written row by row; otherwise the runtime crashes.

In [None]:
# save to ndjson (either regular or gzip)

def datasaver_to_zip(df, name):
  """Write `df` to '<name>.ndjson.gz', one JSON-encoded row per line.

  Args:
    df: DataFrame to serialise; every row becomes a
      {'col1': value, 'col2': value, ...} dict.
    name: Target path without extension; '.ndjson.gz' is appended.

  Rows are addressed by position, so a frame whose index is not the default
  0..n-1 range (e.g. after filtering) is written completely and in order.
  """
  filename_zip = str(name) + '.ndjson.gz'

  # Stream each row straight into the archive instead of buffering all rows first.
  with gzip.open(filename_zip, 'wt', encoding='UTF-8') as z:
    for i in range(len(df)):
      # iloc is positional; loc[i] would look up the index *label* i
      line = df.iloc[i].to_dict()
      #line['industry'] = str(line['industry']) # use if the industry number (e.g. 13) should be quoted (e.g. '13')
      # the one-element list makes ndjson.dumps produce a single line for this row
      z.write('{}\n'.format(ndjson.dumps([line])))

def datasaver_to_ndjson(df, name):
  """Write `df` to '<name>.ndjson', one JSON-encoded row per line.

  Args:
    df: DataFrame to serialise; every row becomes a
      {'col1': value, 'col2': value, ...} dict.
    name: Target path without extension; '.ndjson' is appended.

  Rows are addressed by position, so a frame whose index is not the default
  0..n-1 range (e.g. after filtering) is written completely and in order.
  """
  filename = str(name) + '.ndjson'

  # One record per line avoids the "extra data" error on reload, see
  # https://stackoverflow.com/questions/21058935/python-json-loads-shows-valueerror-extra-data
  # The explicit encoding matches the UTF-8 that data_reloader_from_ndjson reads with.
  with open(filename, mode='w', encoding='UTF-8') as f:
    for i in range(len(df)):
      # iloc is positional; loc[i] would look up the index *label* i
      line = df.iloc[i].to_dict()
      #line['industry'] = str(line['industry']) # use if the industry number (e.g. 13) should be quoted (e.g. '13')
      # the one-element list makes ndjson.dumps produce a single line for this row
      f.write('{}\n'.format(ndjson.dumps([line])))

In [None]:
def data_reloader_from_zip(file_name):
  """Load a gzip-compressed NDJSON file into a DataFrame.

  Args:
    file_name: Path of the '.ndjson.gz' file to read (UTF-8 text).

  Returns:
    DataFrame with one row per decoded record.
  """
  records = []
  with gzip.open(file_name, 'rt', encoding='UTF-8') as handle:
    for raw_line in handle:
      # each line decodes to a list of records; splice them into one flat list
      records.extend(ndjson.loads(raw_line.strip()))

  return pd.DataFrame(records)

def data_reloader_from_ndjson(file_name):
  """Load a plain NDJSON file into a DataFrame.

  Args:
    file_name: Path of the '.ndjson' file to read (UTF-8 text).

  Returns:
    DataFrame with one row per decoded record.
  """
  records = []
  with open(file_name, 'rt', encoding='UTF-8') as handle:
    for raw_line in handle:
      # each line decodes to a list of records; splice them into one flat list
      records.extend(ndjson.loads(raw_line.strip()))

  return pd.DataFrame(records)

In [None]:
# Save the file to Google Drive.
# Make sure the shared folder is connected (open the URL below and create a link in your own Drive).
# Access: https://drive.google.com/drive/folders/1qR-9z3uFmp5Nvsb_1QrR9lU8yNE8hi6l?usp=sharing
drive.mount('/gdrive')

!cp test_html_to_text.ndjson.gz "/gdrive/MyDrive/industry_data_processed/" # exchange file name

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


# HTML Feature-Checkout

## Checking out Tag-Occurrence

In [None]:
all_tags = []  # tag names found in the first 1000 records (each name once per document)

# min() keeps the loop inside the data when fewer than 1000 records were loaded;
# use len(data) as the bound to scan everything.
for i in range(min(1000, len(data))):
  # parse each document once and reuse the tree (it was previously parsed twice)
  soup = BeautifulSoup(data[i][0]['html'], 'html.parser')
  # a set, so that a tag counts once per document no matter how often it occurs
  tags = set(tag.name for tag in soup.find_all())
  all_tags.extend(tags)

  #print(soup.get_text()[:1024])
  #print(tags)

In [None]:
# Count how often each tag name was collected and display the counts, most frequent first
counted = Counter(all_tags)
OrderedDict(counted.most_common())

## Get the whole Text between Tags

In [None]:
# Switches off pandas' chained-assignment warning (SettingWithCopyWarning) for the whole session.
# NOTE(review): that warning flags writes such as `df.col[i] = value`, which may land on a
# temporary copy; prefer fixing such writes over hiding the warning.
pd.options.mode.chained_assignment = None

In [None]:
def parse_to_text(html):
    """Strip the markup from an HTML document and return its text.

    <script> and <style> elements are removed before extraction. The
    remaining text is split into lines and, within a line, at double
    spaces; every piece is trimmed, empty pieces are dropped and the rest
    is joined with newlines.
    """
    document = BeautifulSoup(html, features="html.parser")

    # script/style bodies are code, not content: detach them from the tree
    for node in document(["script", "style"]):
        node.extract()

    pieces = []
    for raw_line in document.get_text().splitlines():
        # a double space separates several phrases sharing one line
        for phrase in raw_line.strip().split("  "):
            phrase = phrase.strip()
            if phrase:
                pieces.append(phrase)

    return '\n'.join(pieces)

In [None]:
# eliminate html elements from text, return text elements
# 21 minutes for train dataset

# Strip the markup from every training document and store the text in a new column.
# The column is assigned as a whole: the chained form
# `df_train.html_to_text[line] = ...` may write to a temporary copy and get lost.
df_train = df_train.assign(html_to_text=df_train['html'].map(parse_to_text))

In [None]:
# on test dataset
# 7 minutes

# Strip the markup from every test document and store the text in a new column.
# The column is assigned as a whole: the chained form
# `df_test.html_to_text[line] = ...` may write to a temporary copy and get lost.
df_test['html_to_text'] = df_test['html'].map(parse_to_text)

In [None]:
# Continue on a separate frame without the raw markup; drop() returns a new
# frame, so df_test itself stays untouched.
df_test_html_to_text = df_test.drop(columns='html')

In [None]:
# Run a garbage-collection pass now; the displayed value is the number of unreachable objects found.
gc.collect()

665

### Preprocessing

In [None]:
# 'stopwords' provides the German stop-word list that clean_text reads.
nltk.download('stopwords')
# NOTE(review): 'wordnet' is only referenced by the commented-out English lemmatizer in clean_text — confirm it is still needed.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# https://medium.com/analytics-vidhya/applying-text-classification-using-logistic-regression-a-comparison-between-bow-and-tf-idf-1f1ed1b83640

# we could work language-dependently here: clean_text_german(), clean_text_english(), applied according to the lang tag
 
# Heavy, read-only resources shared by all clean_text() calls (filled on first use).
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    '''Return the (German stop-word set, German HanTa tagger) pair, creating it once.'''
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stops'] = set(stopwords.words('german'))
        _CLEAN_TEXT_RESOURCES['tagger'] = ht.HanoverTagger('morphmodel_ger.pgz')
    return _CLEAN_TEXT_RESOURCES['stops'], _CLEAN_TEXT_RESOURCES['tagger']


def clean_text(mixed_text):
    '''Normalise a text and return its lemmatised tokens (German pipeline).

    Steps: lower-casing, removal of markup remnants and punctuation,
    removal of German stop words, tokenisation, lemmatisation with the
    German HanTa model.

    Args:
        mixed_text: The text to clean (str).

    Returns:
        list[str]: one lemma per remaining token; [] for empty input.
    '''
    # Loading the tagger model and the stop-word list is expensive, so both
    # are created once and reused instead of being rebuilt for every document.
    stops, tagger = _clean_text_resources()

    # convert words to lower case
    content = mixed_text.lower()

    # (English-only step, not active: expand contractions such as you've -> you have)

    # Markup remnants go first: the punctuation pass below replaces '/' and
    # '&' with spaces, after which '<br />' and '&amp;' could no longer match.
    content = re.sub(r'<br />', ' ', content)
    content = re.sub(r'&amp;', '', content)
    content = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', content)
    content = re.sub(r'\'', ' ', content)

    # remove stopwords
    content = ' '.join(w for w in content.split() if w not in stops)

    # tokenize each word
    tokens = nltk.WordPunctTokenizer().tokenize(content)

    # lemmatize each token in German (reduce words to their base form)
    word_list = []
    for w in tokens:
        lemma = [lemma for (word, lemma, pos) in tagger.tag_sent(w.split())]
        word_list.append(' '.join(lemma))

    # (English alternative, not active: nltk.stem.WordNetLemmatizer)

    return word_list

In [None]:
# Reload the text-only test frame from disk.
# NOTE(review): no visible cell writes 'test_html_to_text.ndjson.gz' — presumably it was created
# with datasaver_to_zip(df_test_html_to_text, 'test_html_to_text'); confirm.
df_test_html_to_text = data_reloader_from_zip('test_html_to_text.ndjson.gz')

In [None]:
# Placeholder column for the cleaned text, initialised with empty strings
df_test_html_to_text['html_cleaned'] = ''

In [None]:
# Number of rows in the reloaded test frame
len(df_test_html_to_text)

8396

In [None]:
# Clean the first 4000 documents only; all later rows keep the '' placeholder.
# NOTE(review): the frame holds more rows than that — raise the limit to clean everything.
# The column is built in one piece and assigned as a whole, because the chained
# form `df.html_cleaned[line] = ...` may write to a temporary copy and get lost.
raw_texts = df_test_html_to_text['html_to_text'].tolist()
cleaned_texts = [clean_text(raw) for raw in raw_texts[:4000]]
cleaned_texts += [''] * (len(raw_texts) - len(cleaned_texts))
df_test_html_to_text['html_cleaned'] = pd.Series(
    cleaned_texts, index=df_test_html_to_text.index, dtype=object
)

In [None]:
# Persist the frame including the cleaned column as 'df_test_cleaned_text.ndjson.gz'
datasaver_to_zip(df_test_html_to_text, 'df_test_cleaned_text')

## Specific Tags for additional Features

In [None]:
def getHTML(url):
    '''Parse an HTML document with the built-in 'html.parser' backend.

    Despite its name, `url` has to hold the markup itself: downloading a
    live page via requests.get(url).text is not active yet (to be adapted
    once live URLs are queried, possibly with a case distinction).

    Returns:
        The BeautifulSoup tree of the document.
    '''
    markup = url
    return BeautifulSoup(markup, 'html.parser')



## Img-Description from IMG-Tag
def getImgDescriptionHTMLtag(url):
    '''Return the non-empty alt texts of all <img> tags, in document order.

    Args:
        url: The HTML markup of the page (see getHTML).
    '''
    images = getHTML(url).find_all('img', alt=True)
    # images carrying an empty alt attribute contribute nothing
    return [image['alt'] for image in images if image['alt']]


## Title
def getTitleHTMLtag(url):
    '''Return the text of the page's <title>, or "" if there is none.

    Args:
        url: The HTML markup of the page (see getHTML).
    '''
    title = getHTML(url).title
    if title is None:
        return ""

    text = title.string
    if text is None:
        # .string is None for an empty <title> or one with nested tags;
        # str(None) would leak the literal 'None' into the feature column
        return title.get_text()
    return str(text)


def _collect_tag_texts(url, tag_name):
    '''Return the stripped, non-empty text of every <tag_name> element, in document order.'''
    soup = getHTML(url)
    stripped = (element.text.strip() for element in soup.find_all(tag_name))
    return [text for text in stripped if text]


## h1
def getH1HTMLtag(url):
    '''Return the texts of all <h1> headings of the given HTML markup.'''
    return _collect_tag_texts(url, 'h1')


## h2
def getH2HTMLtag(url):
    '''Return the texts of all <h2> headings of the given HTML markup.'''
    return _collect_tag_texts(url, 'h2')


## h3
def getH3HTMLtag(url):
    '''Return the texts of all <h3> headings of the given HTML markup.'''
    return _collect_tag_texts(url, 'h3')


## strong - questionable
def getStrongHTMLtag(url):
    '''Return the texts of all <strong> elements of the given HTML markup.'''
    return _collect_tag_texts(url, 'strong')


## bold
def getBoldHTMLtag(url):
    '''Return the texts of all bold (<b>) elements of the given HTML markup.'''
    # HTML's bold element is <b>; a search for 'bold' never matches standard markup
    return _collect_tag_texts(url, 'b')


## language code
def getLangHTMLtag(url):
    '''Return the language code langdetect assigns to the text of <body>.

    Returns the string "NaN" when no language can be determined.
    '''
    try:
        body_text = getHTML(url).body.get_text()
        return detect(body_text)
    except Exception:
        # Best effort: e.g. a document without <body> (AttributeError) or text
        # the detector rejects. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt through, so a long run can be interrupted.
        return "NaN"


## figcaption
def getFigCaptionHTMLtag(url):
    '''Return the texts of all <figcaption> elements of the given HTML markup.'''
    return _collect_tag_texts(url, 'figcaption')

In [None]:
# Add the nine extra feature columns, initialised with empty strings.
# NOTE(review): the 'perform preprocessing' cell further down repeats this exact assignment.
df_test = df_test.assign(img_alt='', title='', h1='', h2='', h3='', strong='', bold='', lang_code='', figcaption='')

In [None]:
# Fill the extra feature columns

def retrieve_features(df):
    '''Fill the extra feature columns of `df` from its 'html' column, in place.

    Columns are addressed by name, so the result does not depend on where
    they sit in the frame, and each column is assigned as a whole instead of
    through chained indexing, which may write to a temporary copy.

    Args:
        df: DataFrame with an 'html' column; the columns img_alt, title, h1,
            h2, h3, strong, bold, lang_code and figcaption are (over)written.
    '''
    extractors = {
        'img_alt': getImgDescriptionHTMLtag,
        'title': getTitleHTMLtag,
        'h1': getH1HTMLtag,
        'h2': getH2HTMLtag,
        'h3': getH3HTMLtag,
        'strong': getStrongHTMLtag,
        'bold': getBoldHTMLtag,
        'lang_code': getLangHTMLtag,
        'figcaption': getFigCaptionHTMLtag,
    }
    html_docs = df['html'].tolist()
    for column, extract in extractors.items():
        # dtype=object keeps list results as single cell values
        df[column] = pd.Series(
            [extract(doc) for doc in html_docs], index=df.index, dtype=object
        )

In [None]:
# Turn the list-valued cells into plain strings so they are easier to clean
def convert_features_toString(df):
    '''Join the list cells of the list-valued feature columns with spaces, in place.

    Affects img_alt, h1, h2, h3, strong, bold and figcaption. Each column is
    replaced as a whole, so the result does not depend on the frame's index
    labels and no chained assignment (which may write to a temporary copy)
    is involved.

    Args:
        df: DataFrame holding the feature columns named above.
    '''
    for column in ('img_alt', 'h1', 'h2', 'h3', 'strong', 'bold', 'figcaption'):
        df[column] = df[column].map(' '.join)

In [None]:
# Go through all extra features to see which ones still have to be run through clean_text
# NOTE(review): `df_train_reloaded` is not defined in any visible cell — confirm where it comes from.
columns = df_train_reloaded.columns.tolist()
# keep the entries at positions 4..12
columns = columns[4:13]

In [None]:
# Leave out the language code
columns = ['img_alt','title','h1','h2','h3','strong','bold','figcaption']

In [None]:
# Run all extra features through clean_text, except the language feature
def clean_dataframe(df):
    '''Replace every text feature cell of `df` by its clean_text() result, in place.

    Affects img_alt, title, h1, h2, h3, strong, bold and figcaption. Each
    column is replaced as a whole; the chained form `df[x][y] = ...` may
    write to a temporary copy and get lost.

    Args:
        df: DataFrame holding the feature columns named above as strings.
    '''
    columns = ['img_alt','title','h1','h2','h3','strong','bold','figcaption']
    for x in columns:
        print(x)  # progress marker: name of the column being cleaned
        df[x] = df[x].map(clean_text)

In [None]:
# perform preprocessing
# (Re-)create the nine extra feature columns as empty strings
df_test = df_test.assign(img_alt='', title='', h1='', h2='', h3='', strong='', bold='', lang_code='', figcaption='')
# Fill them from the raw markup, flatten the list cells to strings, then clean the text features
retrieve_features(df_test)
convert_features_toString(df_test)
clean_dataframe(df_test)
# The raw markup is not saved
df_test = df_test.drop(columns=['html'], axis=1)
# Written to 'df_test_preprocessed.ndjson'
datasaver_to_ndjson(df=df_test, name='df_test_preprocessed')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, i][j] = getImgDescriptionHTMLtag(df.html[j])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, i][j] = getTitleHTMLtag(df.html[j])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, i][j] = getH1HTMLtag(df.html[j])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, i][j] = ge

img_alt
title
h1
h2
h3
strong
bold
figcaption


# Export file(s)

In [None]:
# Export the frame to 'train_preprocessed.json'.
# NOTE(review): `df_train_work` is not defined in any visible cell — confirm where it comes from.
df_train_work.to_json('train_preprocessed.json')

# Sentiment Analysis

In [None]:
# Placeholder column for the sentiment score, initialised with empty strings
df_train['sentiment_analysis'] =''

In [None]:
# Cast the column the sentiment analysis is computed from to string
# NOTE(review): no visible cell creates 'pure_text' — confirm where the column comes from.
df_train['pure_text'] = df_train['pure_text'].astype(str)

In [None]:
# Polarity score per document, rounded to two decimals. The column is assigned
# as a whole: the chained form `df_train.sentiment_analysis[x] = ...` may write
# to a temporary copy, and it leaves the scores in a non-numeric object column.
# NOTE(review): TextBlob's default analyzer targets English, while clean_text
# treats the corpus as German — confirm the scores are meaningful.
df_train['sentiment_analysis'] = df_train['pure_text'].map(
    lambda doc: round(TextBlob(doc).sentiment.polarity, 2)
)

In [None]:
# Check the mean sentiment score per industry label
df_train.groupby('industry_label')['sentiment_analysis'].mean()