# Data Structures

## Pandas

In [1]:
import os
import re

import pandas as pd

### Preparation

In [8]:
# Start from an empty frame that already carries the corpus schema
# (column labels only, no rows yet).
corpus_df = pd.DataFrame(
    columns=[
        "id", "userurl", "source", "title",
        "description", "content", "keywords",
    ]
)

In [9]:
# Append one row, filled by column name.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# the row is wrapped in a one-row frame and concatenated instead.
new_row = {'id': 0, 'userurl': 'url', 'source': 'source', 'title': 'title',
           'description': 'description', 'content': 'content',
           'keywords': 'keywords'}
corpus_df = pd.concat([corpus_df, pd.DataFrame([new_row])], ignore_index=True)

### Transformation

In [12]:
# Define a new column as the space-separated concatenation of the text columns.
# Every part is cast with str: the original cast only "title" and "content",
# so a missing or non-string "description"/"keywords" value made the whole
# "text" cell NaN (or raised a TypeError). Missing values now show up as their
# string form, exactly as they already did for "title" and "content".
corpus_df["text"] = (
    corpus_df["title"].map(str)
    + ' ' + corpus_df["description"].map(str)
    + ' ' + corpus_df["content"].map(str)
    + ' ' + corpus_df["keywords"].map(str)
)

In [7]:
# Remove the listed columns; every other column of the frame is kept.
corpus_df = corpus_df.drop(
    columns=[
        'source', 'userurl', 'title',
        'description', 'keywords', 'content',
    ]
)

### Storing / Reading

In [None]:
# write and read csv (each to_csv call below overwrites the same file)
file_name = 'test.csv'
# tab-separated, default encoding, index written as the first column
corpus_df.to_csv(file_name, sep='\t')
# same, with an explicit encoding
corpus_df.to_csv(file_name, sep='\t', encoding='utf-8')
# without index values; sep='\t' is kept so the file left on disk matches the
# delimiter used by read_csv below (the original wrote a comma-separated file
# here, which read_csv(delimiter='\t') loads as a single column)
corpus_df.to_csv(file_name, sep='\t', encoding='utf-8', index=False)
# load
df_corpus = pd.read_csv(file_name, delimiter='\t')

In [None]:
# merging: how='inner' keeps only the doc_id values present in both frames
comparison = pd.merge(results_sap, results_doc2vec, how='inner', on=['doc_id']) # if `on` is omitted, pandas joins on the columns common to both frames (not on the index); joining on the index needs left_index=True / right_index=True

## Dictionaries

In [None]:
# get key by value (reverse lookup), where the values are sequences of tuples!
def get_url_key(url_to_find, docs=None):
    """Return the key of the entry whose URL equals ``url_to_find``.

    The URL of an entry is read from ``value[0][0]``, i.e. the first field of
    the first tuple stored under each key.

    Parameters
    ----------
    url_to_find
        URL to look for.
    docs : mapping, optional
        Mapping to search. Defaults to the notebook-level ``docs_dict`` so
        existing one-argument calls keep working; passing it explicitly avoids
        depending on hidden kernel state.

    Returns
    -------
    The key of the last matching entry in iteration order, or ``-1`` when no
    entry matches.
    """
    if docs is None:
        docs = docs_dict
    url_key = -1
    for key, value in docs.items():
        if value[0][0] == url_to_find:
            # no break: a later duplicate overrides an earlier one, as before
            url_key = key
    return url_key

# Network Communication
- Interacting with the outside world

## DB

In [None]:
import sys
sys.executable
!{sys.executable} -m pip install pyhdb

In [None]:
# Connection settings are read from environment variables. The original cell
# used bare <...> placeholders (port=<port> is a syntax error), and real
# credentials should never be typed into a saved notebook.
# Set HANA_HOST, HANA_PORT, HANA_USER and HANA_PASSWORD before starting the kernel.
connection = pyhdb.connect(
    host=os.environ["HANA_HOST"],
    port=int(os.environ["HANA_PORT"]),
    user=os.environ["HANA_USER"],
    password=os.environ["HANA_PASSWORD"],
)

In [None]:
# open a cursor on the connection; the cells below run their SQL through it
cursor = connection.cursor()

In [None]:
# sample query construction
# Selects six columns with no WHERE clause. The same string is later wrapped in
# "SELECT COUNT(*) FROM (...)" to count the rows and then executed on its own.
query = ''' SELECT "USERURL","TITLE", "CONTENT", "DESCRIPTION", "SOURCE","KEYWORDS"
                    FROM REPO_."T.CONTENT"   
                    '''

In [None]:
# count how many rows match the query (the query is wrapped as a sub-select)
N = cursor.execute("SELECT COUNT(*) FROM (" + query + ")").fetchone()[0]
# de-indented: the original print was indented without an enclosing block,
# which is an IndentationError
print('Fetching ', N, ' documents...')

In [None]:
# run the actual query; the loop below reads its rows one at a time with fetchone()
cursor.execute(query)

In [None]:
# Walk the result set one row at a time, N fetches in total.
for i in range(N):
    try:
        row = cursor.fetchone()

        # progress message every 10000 rows
        if i % 10000 == 0:
            print('Processing document ', i)

        # first selected column; a NULL (None) value is replaced by an empty string
        userurl = "" if row[0] is None else row[0]
    except (UnicodeDecodeError, UnicodeEncodeError):
        # rows that trigger a Unicode error are skipped
        continue

## HTTP

In [13]:
import requests
import json

In [15]:
# Credentials are read from the environment so real values never have to be
# typed into (and saved with) the notebook. The fallbacks are the original
# dummy values, so behaviour is unchanged when the variables are not set.
user = os.environ.get('SEARCH_API_USER', 'client')
passw = os.environ.get('SEARCH_API_PASSWORD', 'dummypass')
# search endpoint and the header announcing a JSON request body
url = 'https://onedx.find.sap.com/api/v1/search'
headers = {'Content-Type': 'application/json'}

In [None]:
# NOTE: run the cell defining get_api_body first (it sits below this one).
# get_api_body is declared as (self, query) and never reads self, so None is
# passed for it; the original one-argument call raised TypeError.
request_body = get_api_body(None, query)
# Send the body as UTF-8 bytes: a str body is encoded as latin-1 by the HTTP
# layer and fails on characters outside that range.
resp = requests.post(url, data=request_body.encode('utf-8'), auth=(user, passw), headers=headers)
# Fail here with the HTTP status instead of a confusing JSON/KeyError below.
resp.raise_for_status()
resp_json = resp.json()
# the list of hits is read from the 'result' key of the response
results = resp_json['result']

In [None]:
def get_api_body(self, query):
    """Build the JSON request body for a fuzzy search of ``query``.

    The body targets repository "srh", type "content", and carries two fuzzy
    filters with the same search value: one on CONTENT (weight 0.2, combined
    with "or") and one on TITLE (weight 1).

    The body is assembled as a dict and serialised with ``json.dumps``. The
    original hand-written template was not valid JSON (a doubled quote after
    "def", and the filter list and outer object were never closed) and did not
    escape quotes or backslashes inside ``query``.

    Parameters
    ----------
    self
        Unused; kept so existing two-argument calls keep working.
    query : str
        Text to search for.

    Returns
    -------
    str
        The request body as a JSON document.
    """
    body = {
        "repository": "srh",
        "type": "content",
        "filters": [
            {
                "field": "CONTENT",
                "type": "fuzzy",
                "values": [query],
                "fuzzy": {"level": 0.9, "weight": 0.2, "parameters": "def"},
                "logicalOperator": "or",
            },
            {
                "field": "TITLE",
                "type": "fuzzy",
                "values": [query],
                "fuzzy": {"level": 0.9, "weight": 1, "parameters": "def"},
            },
        ],
    }
    return json.dumps(body)

In [None]:
# results being an array of separate JSON objects
def get_results_as_df(self, results):
    """Convert a list of search hits into a DataFrame.

    Parameters
    ----------
    self
        Unused; kept for call compatibility.
    results : sequence of dict
        One dict per hit; the keys 'USERURL', 'TITLE' and 'KEYWORDS' are read.

    Returns
    -------
    pandas.DataFrame
        Columns 'doc_id', 'userurl', 'title', 'keywords', one row per hit.
        'keywords' is now part of the declared schema, so an empty ``results``
        yields the same columns as a non-empty one.
    """
    records = []
    for result in results:
        userurl = result['USERURL']
        records.append({
            # NOTE(review): the original referenced an undefined name `doc_id`
            # (NameError). Looking the URL up with get_url_key is the only
            # URL -> key mapping in this notebook; confirm this is the intended
            # source of doc_id. It is -1 when the URL is unknown.
            'doc_id': get_url_key(userurl),
            'userurl': userurl,
            'title': result['TITLE'],
            'keywords': result['KEYWORDS'],
        })
    # Build the frame once from the collected rows: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    return pd.DataFrame(records, columns=['doc_id', 'userurl', 'title', 'keywords'])

# Text

In [None]:
from nltk.corpus import stopwords

In [2]:
def preprocess(self, doc, stop_words=None):
    """Normalise a document to lowercase words without stop words.

    Every run of characters other than A-Z / a-z is replaced by a space, the
    text is lowercased and split on whitespace, and words that are stop words
    or only one character long are dropped.

    Parameters
    ----------
    self
        Unused; kept for call compatibility.
    doc : str
        Raw text.
    stop_words : iterable of str, optional
        Words to remove (compared after lowercasing). Defaults to
        ``stopwords.words('english')`` from NLTK.

    Returns
    -------
    str
        The remaining words joined by single spaces ('' if none remain).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # a set gives constant-time membership tests instead of a list scan per word
    stop_words = set(stop_words)

    words = re.sub('[^A-Za-z]+', ' ', doc).lower().split()
    return ' '.join(word for word in words if len(word) > 1 and word not in stop_words)

# Useful links

- https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
- https://jakevdp.github.io/PythonDataScienceHandbook/01.05-ipython-and-shell-commands.html
- 