# Data Structures

## Pandas

In [1]:
import pickle

import pandas as pd

### Preparation

In [8]:
# Create an empty DataFrame with a fixed set of named columns (no rows yet).
corpus_df = pd.DataFrame(columns=['id', 'userurl', 'source', 'title', 'description', 'content', 'keywords'])

In [9]:
# Append one row, filled by column name.
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# is the supported replacement. ignore_index=True renumbers the rows 0..n-1.
corpus_df = pd.concat(
    [corpus_df,
     pd.DataFrame([{'id': 0, 'userurl': 'url', 'source': 'source', 'title': 'title',
                    'description': 'description', 'content': 'content',
                    'keywords': 'keywords'}])],
    ignore_index=True)

### Transformation

In [12]:
# Build the "text" column: title, description, content and keywords joined
# by single spaces. Only title and content are passed through str() first.
corpus_df["text"] = (
    corpus_df["title"].map(str)
    + ' ' + corpus_df["description"]
    + ' ' + corpus_df["content"].map(str)
    + ' ' + corpus_df["keywords"]
)

In [7]:
# Drop the listed columns by name (columns=... is the keyword form of axis=1).
corpus_df = corpus_df.drop(
    columns=['source', 'userurl', 'title', 'description', 'keywords', 'content'])

In [None]:
# Combine a list of DataFrames into a single DataFrame.
# See: https://stackoverflow.com/questions/32444138/combine-a-list-of-pandas-dataframes-to-one-pandas-dataframe
df = pd.concat(list_of_dataframes)

### Storing/ Reading

In [None]:
# write and read csv
# Each to_csv call without mode='a' overwrites file_name; the first three are
# alternative ways of writing the same frame.
file_name = 'test.csv'
# tab-separated
corpus_df.to_csv(file_name, sep='\t')
# diff encoding
corpus_df.to_csv(file_name, sep='\t', encoding='utf-8')
# without index values (same separator, so the read below parses it correctly)
corpus_df.to_csv(file_name, sep='\t', encoding='utf-8', index=False)
# append: mode='a' lets pandas open the file itself; separator, encoding and
# index settings must match what is already in the file, and the header is
# not repeated
df.to_csv(file_name, mode='a', sep='\t', encoding='utf-8', header=False, index=False)
# load
df_corpus = pd.read_csv(file_name, delimiter='\t')

In [None]:
# merging: inner join of the two result frames on their shared 'doc_id' column.
# If `on` is not specified (and no index flags are set), pandas merges on the
# columns the two frames have in common -- not on the index.
comparison = pd.merge(results_sap, results_doc2vec, how='inner', on=['doc_id'])

## Dictionaries

In [None]:
#https://stackoverflow.com/questions/4530611/saving-and-loading-objects-and-using-pickle
# Python 3 has no cPickle module; the standard pickle module picks up the
# C implementation automatically when it is available.
with open(r"someobject.pickle", "wb") as output_file:
    pickle.dump(d, output_file)

In [None]:
# Load the object back (Python 3: pickle replaces cPickle).
# NOTE: only unpickle files you trust -- pickle.load can execute arbitrary code.
with open(r"someobject.pickle", "rb") as input_file:
    e = pickle.load(input_file)

In [None]:
#https://stackoverflow.com/questions/3108042/get-max-key-in-dictionary
# Largest key of dict_m, with keys compared by their int() value.
max(dict_m, key=int)

In [None]:
# look up a key by (part of) its value, where the values hold tuples
def get_url_key(url_to_find):
    """Return the key in ``docs_dict`` whose ``value[0][0]`` equals ``url_to_find``.

    Every entry of the module-level ``docs_dict`` is inspected. If several
    entries match, the key of the last one in iteration order is returned.
    If none match, -1 is returned.
    """
    matching_keys = [
        candidate_key
        for candidate_key, entry in docs_dict.items()
        if entry[0][0] == url_to_find
    ]
    return matching_keys[-1] if matching_keys else -1

In [None]:
#https://stackoverflow.com/questions/32957708/python-pickle-error-unicodedecodeerror
#https://stackoverflow.com/questions/9415785/merging-several-python-dictionaries

# Network Communication
- Interacting with the outside world

## DB

In [None]:
# Install pyhdb with the pip that belongs to the interpreter running this
# kernel: sys.executable is that interpreter's path, and the IPython `!` shell
# escape interpolates it via {…}.
import sys
sys.executable
!{sys.executable} -m pip install pyhdb

In [None]:
# Connection template: every <...> value is a placeholder that must be replaced
# before this cell can run (port=<port> is not valid Python as written).
# NOTE(review): pyhdb is not imported in any visible cell -- add `import pyhdb`.
connection = pyhdb.connect(host="<mo-6770....>", port=<port>, user="<user>", password="<pass>") # dummy 

In [None]:
# Obtain a cursor from the open connection; the query cells execute through it.
cursor = connection.cursor()

In [None]:
# sample query construction
# The statement is kept as a plain string so it can be wrapped in a
# SELECT COUNT(*) sub-query as well as executed directly.
query = ''' SELECT "USERURL","TITLE", "CONTENT", "DESCRIPTION", "SOURCE","KEYWORDS"
                    FROM REPO_."T.CONTENT"   
                    '''

In [None]:
# count how many rows match the query
# NOTE(review): this chains fetchone() onto the return value of
# cursor.execute(...) -- confirm the driver in use returns the cursor there.
N = cursor.execute("SELECT COUNT(*) FROM (" + query + ")").fetchone()[0]
print('Fetching ', N, ' documents...')

In [None]:
# Run the query; rows are then retrieved with fetchone()/fetchall().
cursor.execute(query)

In [None]:
# work row by row
for i in range(N):
    try:
        row = cursor.fetchone()
        if row is None:
            # DB-API cursors return None once the result set is exhausted;
            # stop instead of failing on row[0] when fewer than N rows arrive.
            break

        if i % 10000 == 0:
            print('Processing document ', i)
        # Treat a NULL first column as an empty string.
        userurl = row[0] if row[0] is not None else ""
    except (UnicodeDecodeError, UnicodeEncodeError):
        # Skip rows whose text cannot be decoded/encoded and carry on.
        continue

In [None]:
# fetch all rows
results = cursor.fetchall()

In [16]:
#http://thepythonguru.com/fetching-records-using-fetchone-and-fetchmany/

## HTTP

In [13]:
import requests
import json

In [15]:
# Settings for the search API call: basic-auth user/password, endpoint URL
# and the JSON content-type header.
# NOTE(review): the values are placeholders; real credentials should come from
# the environment (e.g. os.environ) rather than being written into the notebook.
user = 'client'
passw = 'dummypass'
url = 'https://<search>.com/api/v1/search'
headers = {'Content-Type': 'application/json'}

In [None]:
# get_api_body is declared with a leading `self` parameter that its body never
# uses, so pass None for it when calling it as a plain function.
request_body = get_api_body(None, query)
resp = requests.post(url, data=request_body, auth=(user, passw), headers=headers)
# Fail with a clear HTTPError on 4xx/5xx instead of trying to parse an error
# response as a search result.
resp.raise_for_status()
resp_json = resp.json()
results = resp_json['result']

In [None]:
def get_api_body(self, query):
    """Build the JSON request body for a fuzzy search on CONTENT and TITLE.

    The body is assembled as a Python structure and serialized with
    ``json.dumps`` so that it is always well-formed JSON and the query text is
    escaped correctly (quotes, backslashes, non-ASCII characters).

    Args:
        self: Unused; kept so existing call sites keep working.
        query: Search text placed in the ``values`` list of both filters.

    Returns:
        str: The serialized JSON request body.
    """
    import json  # local import so the function works regardless of cell order

    body = {
        "repository": "srh",
        "type": "content",
        "filters": [
            {
                "field": "CONTENT",
                "type": "fuzzy",
                "values": [query],
                "fuzzy": {
                    "level": 0.9,
                    "weight": 0.2,
                    "parameters": "def"
                },
                "logicalOperator": "or"
            },
            {
                "field": "TITLE",
                "type": "fuzzy",
                "values": [query],
                "fuzzy": {
                    "level": 0.9,
                    "weight": 1,
                    "parameters": "def"
                }
            }
        ]
    }
    return json.dumps(body, indent=4)

In [None]:
# results being an array of separate JSON objects
def get_results_as_df(self, results):
    results_df = pd.DataFrame(columns=['doc_id', 'userurl', 'title'])
    # results[i] to access each subsequent/ separate JSON element
    for i in range(len(results)):
        results_df = results_df.append(
            {'doc_id': doc_id, 'userurl': results[i]['USERURL'], 'title': results[i]['TITLE'],
             'keywords': results[i]['KEYWORDS']}, ignore_index=True)
    return results_df

# Hardware Utilization
- A useful tool for checking the effects is htop; see, e.g., https://peteris.rocks/blog/htop/

In [None]:
# results being a list of tuple(any) elements
def get_pool_data(results):
    
    pool = mp.Pool()
    deserialized_results_list = list(map(deserialize, results))
    
    results_mp = pool.map(preprocess_row, deserialized_results_list)   
    df_global = pd.concat(results_mp)
    
    return df_global

# Web Deployment with Flask
- small search & table view example

In [None]:
import logging
# Configure root logging once: timestamp, left-aligned 8-character level name
# and the message, at INFO level.
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')
# Module-level logger used by the rest of this script.
logger = logging.getLogger(__name__)

import os
import json

from os.path import dirname, realpath
from doc2vec_search import Doc2VecSearch
from time import time
from flask import Flask, Response, request

# Locations of the trained Doc2Vec model and the pickled document dictionary
# handed to Doc2VecSearch below.
PATH_TO_MODEL = "data/models/model_dbow_100_10_no_ppl.d2v"
PATH_TO_DICT = "data/dict/docs_dict.pickle"

# Data directory: fixed path when the 'is_docker' environment variable is set,
# otherwise the "data" folder next to this file's parent directory.
# NOTE(review): CWD is not used anywhere in the visible code, and __file__ is
# normally undefined when this runs as a notebook cell rather than a script.
if 'is_docker' in os.environ:
    CWD = "/app/data"
else:
    CWD = dirname(dirname(realpath(__file__))) + "/data"

# Templates are looked up in the directory containing this file.
app = Flask(__name__, template_folder=dirname(realpath(__file__)))

# Search backend instance and the URL path of the JSON search endpoint.
doc2vec = Doc2VecSearch(PATH_TO_MODEL, PATH_TO_DICT)
url = "/unique/search"

@app.route(url, methods=['GET'])
def doc2vec_search():
    """JSON search endpoint.

    Reads the ``q`` query parameter, lower-cases it and asks ``doc2vec`` for
    10 results. For every hit ``i`` the response carries a ``data<i>`` list
    (userurl, title, keywords, source) and a ``metadata<i>`` dict. When ``q``
    is missing or nothing is found, a single empty ``data`` list and a
    ``metadata`` dict with ``itemCount`` 0 are returned. The HTTP status is
    always 200.
    """
    query_text = request.args.get('q')

    elapsed_ms = 0
    payload = {}

    if query_text:
        lowered = query_text.lower()
        began = time()
        hits = doc2vec.search(lowered, 10)
        # search duration in whole milliseconds
        elapsed_ms = int((time() - began) * 1000)

        if hits:
            for position, hit in enumerate(hits):
                suffix = str(position)
                payload['data' + suffix] = [{
                    'userurl': hit[0],
                    'title': hit[1],
                    'keywords': hit[2],
                    'source': hit[3],
                }]
                payload['metadata' + suffix] = {
                    'executionTime': elapsed_ms,
                    'status': 200,
                    'itemCount': 1,
                }
            return Response(json.dumps(payload, indent=2), status=200, mimetype='application/json')

    # No query given or no hits: empty result set.
    payload['data'] = []
    payload['metadata'] = {'executionTime': elapsed_ms, 'status': 200, 'itemCount': 0}
    return Response(json.dumps(payload, indent=2), status=200, mimetype='application/json')


@app.route('/healthcheck')
def healthckeck():
    """Liveness endpoint: always answers with a fixed plain-text message."""
    return "All is well"

@app.route('/' + 'pointer/testing')
def testing():
    """Render a minimal HTML search form and, when a query is given, a results table.

    Query parameters:
        query: Search text; lower-cased and stripped before searching.
        n: Optional number of results. Falls back to 10 when it is missing or
           not an integer.

    All text taken from the request or from the search results is HTML-escaped
    before being embedded in the page.

    Returns:
        str: The complete HTML page.
    """
    from html import escape  # stdlib; local import keeps this script self-contained

    page = """
    <!DOCTYPE html>
    <html>
    <head>
        <link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css">
    </head>
        <body>
            <form>
                <input name="query" autofocus>
                <input type="submit">
                Number of results to show:
                <input name="n">
            </form>
            <br>
    """

    q = request.args.get('query')
    n = request.args.get('n')

    if q:
        q = q.lower().strip()
        # q comes straight from the URL: escape it so it cannot inject markup.
        page += "Current query: <b><i>" + escape(q) + "</i></b><br>\n"

        # A non-numeric n used to raise ValueError (HTTP 500); use the default instead.
        try:
            n_results = int(n) if n else 10
        except ValueError:
            n_results = 10
        results = doc2vec.search(q, n_results)

        page += "<h2>Doc2Vec Search Results</h2>"
        page += """<table class="w3-table-all"><tr><th></th><th>Userurl</th><th>Title</th><th>Keywords</th><th>Source</th><th>Similarity Score</th></tr>"""

        if results:
            for i, row in enumerate(results):
                cells = [str(i)]
                # userurl, title, keywords, source -- escaped; str() also copes with None
                cells += [escape(str(row[col])) for col in range(4)]
                cells.append("{0:.2f}".format(row[4]))
                # each row is closed with </tr> (the original emitted a second <tr>)
                page += "<tr><td>" + "</td><td>".join(cells) + "</td></tr>"
        page += "</table>"
    return page + "</body></html>"

if __name__ == "__main__":
    # This is only called when starting file directly. Not in Docker container.
    # The hint points at the JSON search route registered above ("/unique/search");
    # the misspelled sample query is kept as in the original.
    logger.info("Api is ready. Try: http://localhost:5021/unique/search?q=mster%20data%20mannagemnt")
    app.run(port=5021)

# Useful things

- https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
- https://jakevdp.github.io/PythonDataScienceHandbook/01.05-ipython-and-shell-commands.html

In [None]:
# Resume pattern: skip the first 136000 iterations (e.g. items handled in an
# earlier run); the real per-item work would follow the `continue`.
for i in range(n):
    if i < 136000:
        print('skiping ',i)
        continue

In [None]:
# NOTE: `import time` binds the name `time` to the module; a cell that earlier
# ran `from time import time` must be re-run afterwards if it still needs the
# function under that name.
import time

# perf_counter() is the clock intended for measuring elapsed time: it is
# monotonic and has the highest available resolution, unlike time.time().
start = time.perf_counter()
print("hello")
end = time.perf_counter()
print(end - start)

In [None]:
import glob, os
# NOTE(review): count is assigned but not used in this cell.
count = 1
# Read in all files from the current directory, that match the prefix.
# The pattern is relative, so it is resolved against the current working directory.
for file in glob.glob("archive_sitemap_*"):
    print(file)

In [None]:
#https://stackoverflow.com/questions/53513/how-do-i-check-if-a-list-is-empty
if not a:
    print("List is empty")

In [None]:
# Screen and Sessions
# https://stackoverflow.com/questions/1509677/kill-detached-screen-session

In [None]:
# Remote connections
#https://superuser.com/questions/23911/running-commands-on-putty-without-fear-of-losing-connection