In [1]:
# Library files import

import cerberus
import json
import os
import re
import numpy as np
import string
from collections import Counter 

In [2]:
# building validation dict based on the structure below

# {
#  "_id": "https://www.ekmhinnovators.com/ekmh-innovators-blog-beta/interview-ourcrowd-ceo-jon-medved-on-impact-investing-crowdfunding",
#  "title": "Interview: OurCrowd CEO Jon Medved on Crowdfunding, Beyond ...",
#  "body": "EKMH Innovators Interview Series An interview ...",
#  "origin": "google custom search",
#  "feedId": 103,
#  "jobId": "37b3e04c-cf7d-4032-82ad-a2bd89dc90ac",
#  "person": {
# 	 "id": "16",
# 	 "name": "Jon Medved"
#  }
# }

STRING_MANDATORY = {'type': 'string', 'empty': False, 'required': True}
INT_MANDATORY = {'type': 'integer', 'empty': False, 'required': True}
VALIDATION_SCHEMA = {
    '_id': STRING_MANDATORY, 
    'title': STRING_MANDATORY, 
    'body': STRING_MANDATORY, 
    'origin': STRING_MANDATORY, 
    'feedId': INT_MANDATORY, 
    'jobId': STRING_MANDATORY, 
    'person': {
        'type': 'dict', 'required': True, 'empty': False, 
            'schema': {
                'id': STRING_MANDATORY,
                'name': STRING_MANDATORY
            }
        }
    }

In [3]:
# displaying validation dictionary

VALIDATION_SCHEMA

{'_id': {'type': 'string', 'empty': False, 'required': True},
 'title': {'type': 'string', 'empty': False, 'required': True},
 'body': {'type': 'string', 'empty': False, 'required': True},
 'origin': {'type': 'string', 'empty': False, 'required': True},
 'feedId': {'type': 'integer', 'empty': False, 'required': True},
 'jobId': {'type': 'string', 'empty': False, 'required': True},
 'person': {'type': 'dict',
  'required': True,
  'empty': False,
  'schema': {'id': {'type': 'string', 'empty': False, 'required': True},
   'name': {'type': 'string', 'empty': False, 'required': True}}}}

In [4]:
# Path of the data files
# Add more files to the project folder or change this path to read from any 
# other directory path
DATA_FOLDER = os.path.join(os.path.curdir, 'data') # by default it takes the JSONs from project folder

# only considering JSON files at the moment
interview_files_list = [os.path.join(DATA_FOLDER, f) for f in os.listdir(DATA_FOLDER) \
                        if '.json' in os.path.splitext(f) ]


print(interview_files_list)
if len(interview_files_list) < 1:
    proceed = False
    error = "No data file available"

['./data/0.json', './data/1.json', './data/2.json', './data/3.json', './data/4.json', './data/5.json']


In [5]:
## Common Methods

In [6]:
"""
validate input data schema
"""
def validate_schema(entities, schema, filepath):
    checker = cerberus.Validator()
    checker.allow_unknown = True
    if checker.validate(entities, schema):
        print(f'Input schema validated for {filepath} !')
    else:
        errors = checker.errors
        proceed = False
        error = errors
        raise ValueError(f'Format mismatch for the input {errors}')

In [7]:
"""
Remove special character
"""
def remove_special_character(text_to_process, discarded_elements=[], strip=False):
    
    new_string = str(text_to_process)
    if strip:
        new_string = new_string.strip()
        
    special_chars = string.printable[62:]
    special_chars = [x for x in special_chars if x not in discarded_elements]
    for char in text_to_process:
        if char in special_chars:
            new_string = new_string.replace(char, '')
    return new_string

In [8]:
"""
Remove special character without spaces
"""
def remove_special_character_without_spaces(text_to_process, discarded_elements=[]):
    return remove_special_character(text_to_process, [' '], strip=True)

In [9]:
"""
Remove spaces
"""
def remove_spaces(text_to_process):
    return text_to_process.replace(" ", "")

In [10]:
"""
Validate and read the JSON data
file_path: path of the json file
"""
def read_and_validate_data(filepath):
    with open(filepath) as json_file:
        data = json.load(json_file)
        validate_schema(entities=data, schema=VALIDATION_SCHEMA,\
                        filepath=filepath)
    return data

In [11]:
"""
Encode Text
"""
def encode_text(text_to_encode, char_to_index):
    return np.array([char_to_index[c] for c in text_to_encode])

In [12]:
"""
Reverse the array
"""
def reverse_the_array(arr):
    return arr[::-1]

In [13]:
# Custom method and Processing

In [31]:
def predict_interviewee(name_doc_mapping={}):
    predicted_interviewee = {}
    for key, value in name_doc_mapping.items():
        item_list = max(value.items(), key=lambda x: x[1])
        labels = list()
        for k, v in value.items():
            if v == item_list[1]:
                predicted_interviewee[key] = k
    return predicted_interviewee

In [32]:
# Reading in the data input files while also 
# validating them
data = [read_and_validate_data(file) for file in interview_files_list]

Input schema validated for ./data/0.json !
Input schema validated for ./data/1.json !
Input schema validated for ./data/2.json !
Input schema validated for ./data/3.json !
Input schema validated for ./data/4.json !
Input schema validated for ./data/5.json !


In [33]:
# def create_interview_encoding(interview_contents):
#     total_chars = sorted(set(interview_contents))
#     char_to_index = {char:index for index, char in enumerate(total_chars)}
#     index_to_char = np.array(total_chars)
#     encoded_text = np.array([char_to_index[c] for c in interview_contents])
#     return char_to_index, index_to_char, encoded_text

In [34]:
# # Predict interviewer and interviewee name
# interview_content = str.lower(data[0]['body'])
# interview_content_no_special_chars = remove_special_character(interview_content, [':'])
# char_to_index, index_to_char, encoded_text = \
#     create_interview_encoding(interview_content_no_special_chars)


# interviewee = data[0]['person']['name']
# sub_names = interviewee.split()
# sub_names.append(interviewee)
# names = [encode_text(remove_special_character(x.lower())\
#                      , char_to_index) for x in sub_names]

# reverse_encoded_text=reverse_the_array(encoded_text)

In [35]:
# interview_content = str.lower(data[2]['body'])
# interview_content_no_special_chars = remove_special_character(interview_content, [':', '.', '?', ';', '"'])
# print(interview_content_no_special_chars[:100])
# reverse_encoded_text=reverse_the_array(interview_content_no_special_chars)
# print(reverse_encoded_text)

In [36]:
# reverse_encoded_text

In [37]:
# expression_one = '(?<=\:)(.*?)(?=\{})'

# all_matches = []
# count_dict = {}

# exprsns = ['.', '?', ";"]
# for ex in exprsns:
#     expression = expression_one.format(ex)
#     matches = re.findall(expression, reverse_encoded_text)
#     all_matches = all_matches + matches

# counter = Counter(all_matches)

# for elm in set(all_matches):
#     if len(elm) < 25:
#         count_dict[elm] = counter[elm]
# #         print(type(counter[elm]))
        
    
# # expression_two = r"(?<=\:)(.*?)(?=\?)"
# # matches_one = re.findall(expression_one, reverse_encoded_text)
# # matches_two = re.findall(expression_two, reverse_encoded_text)

# # matches = matches_one + matches_two
 

In [38]:

# d = Counter(matches_one)
# for elem in set(matches):
#     if len(elem) < 15:
#         print(elem, d[elem])

In [39]:
# print(matches_two) 

In [40]:
# s = reverse_encoded_text
# start = s.find(':')
# end = s.find('.')
# substring = s[start:end]
# print(substring)

In [41]:
# buckets = []

# def find_pattern(text_data, names=[]):
#     if not names:
#         raise Exception(f'Pattern could not be found, names = {names}')
#     text_split=text_data.split(':')
#     name_dict = {}
    
#     for name in names:
#         for text in text_split:
#             name_count = name_dict.get(name, 0)
#             name_dict[name] = text.count(name) + name_count
#             if text.find(name) == 0:
#                 print(name, text[:10], text.find(name))

        
    
# #     print(name_dict)
        

In [42]:

doc = {}
expression_one = '(?<=\{})(.*?)(?=\:)'
exprsns = ['.', '?', ";"]

doc_exprsn = {}


for idx, entity in enumerate(data):
    count_dict = {}
    all_matches = []
    interview_text = str.lower(entity.get('body'))
    interview_without_spl_chr = interview_text
    name_dict = {}
    interviewee = entity.get('person')['name']
    sub_names = interviewee.split()
    sub_names.append(interviewee)
    names = [x.lower() for x in sub_names]
    for name in names:
        name_dict[name] = len([pos.start() for pos in re.finditer(f'{name}:', interview_without_spl_chr)])
    doc[f'doc_{idx}'] = name_dict
    for ex in exprsns:
        expression = expression_one.format(ex)
        matches = re.findall(expression, interview_without_spl_chr)
        all_matches = all_matches + matches
    all_matches = list(map(remove_special_character_without_spaces, all_matches))
    counter = Counter(all_matches)
    for elm in set(all_matches):
        if len(elm) < 25:
            count_dict[elm] = counter[elm]
    doc_exprsn[f'doc_{idx}'] = count_dict


In [43]:
print(predict_interviewee(doc))

{'doc_0': 'joe', 'doc_1': 'srinivasan', 'doc_2': 'jon medved', 'doc_3': 'phil libin', 'doc_4': 'mackey', 'doc_5': 'tim cook'}


In [44]:
doc_exprsn

{'doc_0': {'joe': 16,
  'martin': 2,
  'i appreciate it martin': 1,
  '” then they say': 1},
 'doc_1': {'srinivasan': 12,
  'srinivisan': 1,
  'jackson': 2,
  'balaji s srinivasan': 1},
 'doc_2': {'jon medved': 8, 'ekmh': 1},
 'doc_3': {'phil libin': 53,
  'that too nicole torres': 1,
  'yeah nicole torres': 1,
  'laughter phil libin': 1,
  'nicole torres': 18},
 'doc_4': {' mackey': 1, 'john mackey': 1, 'mackey': 19, 'reason': 1},
 'doc_5': {'tim cook': 30,
  'angela ahrendts': 1,
  'jony ive': 8,
  ' tim cook': 2,
  'charlie rose': 14,
  'graham townsend': 1,
  'phil schiller': 1}}

In [50]:
def predict_interviewers(doc_exprsn):
    potential_interviewers = {}
    for key, value in doc_exprsn.items():
        print(key, value)
        
        
#         item_list = max(value.items(), key=lambda x: x[1])
#         labels = list()
#         for k, v in value.items():
#             if v == item_list[1]:
#                 predicted_interviewee[key] = k
#     return predicted_interviewee 

In [51]:
predict_interviewers(doc_exprsn)

doc_0 {'joe': 16, 'martin': 2, 'i appreciate it martin': 1, '” then they say': 1}
doc_1 {'srinivasan': 12, 'srinivisan': 1, 'jackson': 2, 'balaji s srinivasan': 1}
doc_2 {'jon medved': 8, 'ekmh': 1}
doc_3 {'phil libin': 53, 'that too nicole torres': 1, 'yeah nicole torres': 1, 'laughter phil libin': 1, 'nicole torres': 18}
doc_4 {' mackey': 1, 'john mackey': 1, 'mackey': 19, 'reason': 1}
doc_5 {'tim cook': 30, 'angela ahrendts': 1, 'jony ive': 8, ' tim cook': 2, 'charlie rose': 14, 'graham townsend': 1, 'phil schiller': 1}


In [46]:

    

    # print('Keys with maximum Value in Dictionary : ', listOfKeys)

In [47]:
predicted_interviewee

NameError: name 'predicted_interviewee' is not defined