# Data Collection 

This notebook contains all steps needed to ingest the raw data (CRQ texts, books and patents) from local files and to preprocess it.

In [None]:
import nltk
import os
import PyPDF2
import pickle
import pandas as pd
import textract
import spacy
from stop_words import get_stop_words
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric, strip_multiple_whitespaces, strip_short

# The returned pipeline is not bound to a name, so the only visible effect of
# this call is to raise right here if the German model cannot be loaded.
spacy.load('de_core_news_sm')

In [None]:
# Run once and download sources
# nltk.download()

In [None]:
# Run once and download German language resources
# !  python3 -m spacy download de_core_news_sm

## Data Ingest

In [None]:
# Ingest raw unlabelled CRQ data to list of strings and pickle

# Context managers close both file handles deterministically, so the pickle is
# fully flushed to disk before any later cell reads it.
with open("../Data/CRQ/Unlabelled_crs.txt", 'r') as data:
    raw_data = data.read()

# Doubled quotes are collapsed to a single apostrophe first so that they do
# not act as separators when the text is split on the remaining quotes.
raw_data = raw_data.replace('""', "'")
raw_data_sep = raw_data.split('"')

# Drop the fragments that consist of a lone newline.
raw_data_sep_nonempty = [element for element in raw_data_sep if element != "\n"]

with open("../Data/CRQ/CRQ_raw.pkl", "wb") as crq_raw_file:
    pickle.dump(raw_data_sep_nonempty, crq_raw_file)

In [None]:
# Ingest labelled CRQ, convert to list of strings and pickle

test_data = pd.read_pickle("../Data/CRQ/testset-Copy1.pkl")
train_data = pd.read_pickle("../Data/CRQ/trainset-Copy1.pkl")

# Only the 'CR' column is kept; it is converted to a plain Python list.
CRQ_train_labelled = train_data['CR'].tolist()
CRQ_test_labelled = test_data['CR'].tolist()

# Write through context managers so both output files are closed (and
# therefore completely written) when the cell finishes.
with open("../Data/CRQ/CRQ_train.pkl", 'wb') as crq_train_file:
    pickle.dump(CRQ_train_labelled, crq_train_file)
with open("../Data/CRQ/CRQ_test.pkl", 'wb') as crq_test_file:
    pickle.dump(CRQ_test_labelled, crq_test_file)

In [None]:
# Ingest books: produces a list of strings with each book as one string.

# Relative to the notebook's working directory, like every other data path in
# this notebook, instead of an absolute path that only exists on one machine.
path = "../Data/Books"
extract = []

def book_to_string(path):
    """Extract the text of every page of a PDF and return it as one string.

    Parameters
    ----------
    path : str
        Location of the PDF file to read.

    Returns
    -------
    str
        The extracted text of all pages, concatenated in page order without
        any separator.
    """
    # The reader works on the open handle, so all page access happens inside
    # the ``with`` block; the handle is closed afterwards even on errors.
    with open(path, "rb") as file:
        book = PyPDF2.PdfFileReader(file)
        pages = (book.getPage(i) for i in range(book.getNumPages()))
        return "".join(page.extractText() for page in pages)

# Convert every file in the books directory and collect the resulting strings.
for filename in os.listdir(path):
    extract.append(book_to_string(os.path.join(path, filename)))

# Context manager ensures the pickle is flushed and closed before later cells
# read it back.
with open("../Data/Books_raw.pkl", 'wb') as books_raw_file:
    pickle.dump(extract, books_raw_file)

In [None]:
# Ingest patents: produces a list of strings with each patent as one string.

# Relative to the notebook's working directory, like every other data path in
# this notebook, instead of an absolute path that only exists on one machine.
path = "../Data/Patents"
extract = []

def patent_to_string(path):
    """Return the text that textract extracts from the file at ``path``.

    The bytes returned by ``textract.process`` are decoded as UTF-8 in strict
    mode, so undecodable bytes raise instead of being replaced.
    """
    raw_bytes = textract.process(path)
    return raw_bytes.decode(encoding="utf-8", errors="strict")

# Convert every file in the patents directory and collect the resulting strings.
for filename in os.listdir(path):
    extract.append(patent_to_string(os.path.join(path, filename)))

# Context manager ensures the pickle is flushed and closed before later cells
# read it back.
with open("../Data/Patents_raw.pkl", 'wb') as patents_raw_file:
    pickle.dump(extract, patents_raw_file)

In [None]:
# Sanity check of the raw pickles: size, container type and element type for
# books and patents, followed by one sample patent.
for raw_pickle in ("../Data/Books_raw.pkl", "../Data/Patents_raw.pkl"):
    with open(raw_pickle, 'rb') as file:
        data = pickle.load(file)
    print(len(data), type(data), type(data[0]))

# After the loop ``data`` holds the patents list (the last file loaded).
print(data[122])

## Preprocessing the data

In [None]:
def preprocess(path_in, path_out, lemmatize=False):
    """Clean a pickled list of strings and pickle the tokenised result.

    Pipeline, applied to every string in this order:

    1. remove punctuation
    2. remove numerics
    3. collapse multiple whitespace
    4. lowercase
    5. remove short words (``minsize=3``, i.e. words shorter than 3 characters)
    6. split the string into words
    7. remove German stop words
    8. optionally lemmatise with the spaCy German model

    Parameters
    ----------
    path_in : str
        Pickle file containing a list of strings.
    path_out : str
        Destination pickle file; receives a list of lists of words.
    lemmatize : bool, optional
        Run the spaCy lemmatiser as the final step. Off by default because it
        is slow; the spaCy model is only loaded when this is enabled.

    Returns
    -------
    None
        The result is written to ``path_out`` only.
    """

    def split_words(text):
        return text.split()

    def remove_stopwords(tokens):
        return [word for word in tokens if word not in stop_words]

    def lemmatize_tokens(tokens):
        # Each word is still analysed as its own document (a word may yield
        # more than one spaCy token); nlp.pipe only batches the calls.
        lemmas = []
        for doc in nlp.pipe(tokens):
            lemmas.extend(token.lemma_ for token in doc)
        return lemmas

    # A set gives constant-time membership tests in remove_stopwords.
    stop_words = set(get_stop_words('german'))
    # Loading the model is expensive, so skip it unless it is actually used.
    nlp = spacy.load("de_core_news_sm") if lemmatize else None

    # NOTE: pickle.load can execute arbitrary code; only feed this function
    # pickle files that were produced by a trusted source.
    with open(path_in, "rb") as file_in:
        documents = pickle.load(file_in)

    documents = [strip_punctuation(s) for s in documents]
    documents = [strip_numeric(s) for s in documents]
    documents = [strip_multiple_whitespaces(s) for s in documents]
    documents = [s.lower() for s in documents]
    documents = [strip_short(s, minsize=3) for s in documents]
    documents = [split_words(s) for s in documents]
    documents = [remove_stopwords(tokens) for tokens in documents]
    if lemmatize:
        documents = [lemmatize_tokens(tokens) for tokens in documents]

    # Context manager guarantees the output is flushed and closed on return.
    with open(path_out, 'wb') as file_out:
        pickle.dump(documents, file_out)

In [None]:
# Preprocess unlabelled CRQ.
# The input is the pickle written by the raw CRQ ingest cell ("CRQ_raw.pkl");
# no cell in this notebook creates "CRQ_1.pkl", so reading that file fails on
# a fresh Restart & Run All.
preprocess("../Data/CRQ/CRQ_raw.pkl", "../Data/CRQ_preprocessed.pkl")

In [None]:
# Run the preprocessing pipeline over the labelled CRQ train and test splits.
for source_pkl, target_pkl in (
    ("../Data/CRQ/CRQ_train.pkl", "../Data/CRQ_train_preprocessed.pkl"),
    ("../Data/CRQ/CRQ_test.pkl", "../Data/CRQ_test_preprocessed.pkl"),
):
    preprocess(source_pkl, target_pkl)

In [None]:
# Run the preprocessing pipeline over the raw patents pickle.
preprocess(
    path_in="../Data/Patents_raw.pkl",
    path_out="../Data/Patents_preprocessed.pkl",
)

In [None]:
# Run the preprocessing pipeline over the raw books pickle.
preprocess(
    path_in="../Data/Books_raw.pkl",
    path_out="../Data/Books_preprocessed.pkl",
)

In [None]:
# Spot-check the preprocessed patents by printing a single document.
with open("../Data/Patents_preprocessed.pkl", 'rb') as file:
    data = pickle.load(file)

# The list is fully loaded, so the sample can be printed after the file is closed.
print(data[210])