In [1]:
from contextlib import closing
import json
import pandas as pd
import re
import requests
import snowflake.connector
import time

In [2]:
# Snowflake schema that get_results() hands to connection() when it opens a session.
schema = "ANALYST"

In [3]:
# Query text executed by get_results(): selects per-video IAS fields (channel id,
# video id, category aliased to IAS_CATEGORY, risk) from prod.ANALYST.IAS_Q122 and
# joins them to prod.REPORTING.LATEST_SCORES (taxonomy name, score, decision)
# on I.VIDEO_ID = S.CONTENT_ID.
# NOTE(review): the filter is the bare, unqualified column `decision` -- presumably
# the boolean S.DECISION, so only rows where it is true are kept; confirm.
sql = """
        SELECT I.CHANNEL_ID, I.VIDEO_ID, I.VIDEO_CATEGORY AS IAS_CATEGORY, I.IAS_RISK, S.TAXONOMY_NAME, 
        S.SCORE, S.DECISION
        FROM prod.ANALYST.IAS_Q122 I
        JOIN prod.REPORTING.LATEST_SCORES AS S 
            ON I.VIDEO_ID = S.CONTENT_ID 
        WHERE decision 
        """

In [4]:
def connection(schema):
    """Open and return a Snowflake connection scoped to ``schema``.

    The session is opened through ``snowflake.connector.connect`` against
    ``localhost:10015`` over plain HTTP, using the ``ENGINEER_WH`` warehouse
    and the ``PROD`` database. The caller owns the returned connection and is
    responsible for closing it.

    NOTE(review): the localhost endpoint with fixed "sdm"/"dummy" credentials
    looks like a local proxy that handles real authentication -- confirm.

    Parameters
    ----------
    schema : str
        Name of the schema to use for the session.

    Returns
    -------
    The connection object returned by ``snowflake.connector.connect``.
    """
    settings = {
        "host": "localhost",
        "port": "10015",
        "user": "sdm",
        "password": "sdm",
        "account": "dummy",
        "protocol": "http",
        "warehouse": "ENGINEER_WH",
        "database": "PROD",
        "schema": schema,
    }
    return snowflake.connector.connect(**settings)

In [5]:
def get_results(sql):
    """Run ``sql`` against Snowflake and return the full result set as a DataFrame.

    A fresh connection is opened with ``connection(schema)`` (``schema`` is the
    notebook-level global). The statement is executed on a single cursor, and
    both the cursor and the connection are closed before this function
    returns, including when the query raises.

    Parameters
    ----------
    sql : str
        SQL statement to execute.

    Returns
    -------
    pandas.DataFrame
        Every row of the result set, as produced by ``fetch_pandas_all()``.
    """
    with closing(connection(schema)) as conn, closing(conn.cursor()) as cursor:
        # Execute on the managed cursor (not a second, unmanaged one) and fetch
        # while it is still open; both resources are released on exit.
        cursor.execute(sql)
        return cursor.fetch_pandas_all()

In [6]:
# Execute the query text held in `sql` and keep the result as the working DataFrame.
df = get_results(sql)

In [7]:
# Preview the first rows of the query result as a sanity check on the columns.
df.head()

Unnamed: 0,CHANNEL_ID,VIDEO_ID,IAS_CATEGORY,IAS_RISK,TAXONOMY_NAME,SCORE,DECISION
0,UCYHLd3HtyhRYsD5QMxsjRJA,_25OArlHkYU,Film & Animation,Medium,4A - Crime - [Minimal],1.0,True
1,UCj1J3QuIftjOq9iv_rr7Egw,zwKnHMwVJ50,Gaming,Medium,4A - Online Piracy - [Minimal],1.0,True
2,UCRZamIB-DWuVPJuc0bKhwzg,3rB3HI9_MDA,Education,Medium,Japanese Brand Safety - V1,0.16567236,True
3,UCEth8Mr49WNJNdtn7Dy3rqA,vMU2sKAe0FY,People & Blogs,Medium,English - V1,0.9405212,True
4,UCvgfXK4nTYKudb0rFR6noLA,rZ5IgMydsT4,Sports,Medium,4A - Terrorism - [Minimal],1.0,True


In [8]:
import math
import re
from collections import Counter
import pandas as pd

# Tokenizer pattern used by text_to_vector(): each run of word characters is one token.
WORD = re.compile(r"\w+")


def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-count vectors.

    Parameters
    ----------
    vec1, vec2 : mapping
        Mappings from term to numeric weight (e.g. ``collections.Counter``).

    Returns
    -------
    float
        Dot product over the shared terms divided by the product of the two
        Euclidean norms, or ``0.0`` when either vector has zero magnitude
        (for example, when it is empty).
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    squared_norm_1 = sum(vec1[term] ** 2 for term in vec1.keys())
    squared_norm_2 = sum(vec2[term] ** 2 for term in vec2.keys())
    magnitude = math.sqrt(squared_norm_1) * math.sqrt(squared_norm_2)

    # A zero magnitude means at least one vector has no weight; avoid dividing by 0.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0


def text_to_vector(text):
    """Tokenize ``text`` with the ``WORD`` pattern and count each token.

    Parameters
    ----------
    text : str or None
        Text to tokenize. Any non-string value (``None``, or the float NaN
        pandas uses for a missing cell) is treated as having no tokens.

    Returns
    -------
    collections.Counter
        Token -> number of occurrences; empty for non-string input.
    """
    if not isinstance(text, str):
        # WORD.findall() raises TypeError on non-strings, which would abort a
        # whole column-wise .apply() over a column that contains missing values.
        return Counter()
    return Counter(WORD.findall(text))

In [9]:

    # Token-count vectors for the two label columns; the tokenizer is passed
    # directly instead of being wrapped in a pass-through lambda.
    df['vector1'] = df['IAS_CATEGORY'].apply(text_to_vector)
    df['vector2'] = df['TAXONOMY_NAME'].apply(text_to_vector)
    # Pair the two vector columns positionally rather than using
    # df.apply(..., axis=1): that builds a Series per row, and for a frame with
    # no rows it returns a DataFrame, which cannot be assigned to one column.
    df['simscore'] = [
        get_cosine(ias_vec, taxonomy_vec)
        for ias_vec, taxonomy_vec in zip(df['vector1'], df['vector2'])
    ]

In [13]:
# Rows whose IAS risk rating is 'High', saved to a pickle file.

ias_safety_flag = df.loc[df['IAS_RISK'].eq('High')]
ias_safety_flag.to_pickle('IAS_Safety_Flag_Risk_Data.pkl')

In [10]:
# Rows where zefr has a safety category, i.e. the taxonomy name contains "4A".
# na=False keeps the mask strictly boolean when a taxonomy name is missing
# (a NaN in the mask makes boolean indexing raise); regex=False matches the
# text literally.

zefr_safety_flag = df[df['TAXONOMY_NAME'].str.contains("4A", regex=False, na=False)]
zefr_safety_flag.to_pickle('ZEFR_Safety_Flag_Risk_Data.pkl')

In [15]:
# Rows where the IAS category and the zefr taxonomy name are textually similar
# (cosine similarity above 0.5), saved to a pickle file.
# NOTE(review): the variable and file names still say "DV" although the data
# compared here is IAS -- presumably carried over from another notebook; confirm.

dv_zefr_similar_concepts = df.loc[df['simscore'].gt(0.5)]
dv_zefr_similar_concepts.to_pickle('DV_Zefr_SimilarConcept_Risk_Data.pkl')