# Helpers

In [None]:
import os
import pickle
from pathlib import Path
from os import listdir
from os.path import isfile, join

def path_of(location):
    """Resolve ``location`` relative to the directory containing this file.

    Args:
        location: A relative or absolute path. ``os.path.join`` returns an
            absolute ``location`` unchanged.

    Returns:
        The joined path as a string.
    """
    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # ``__file__`` is not defined when this code runs as an interactive
        # cell rather than as a module; fall back to the working directory
        # instead of failing with NameError.
        base_dir = os.getcwd()
    return os.path.join(base_dir, location)


def get_all_files(location, get_full_path= False):
    """List the regular files directly inside ``location``.

    ``location`` is first resolved through ``path_of``. Sub-directories are
    skipped. When ``get_full_path`` is truthy each entry is returned joined
    with the resolved directory, otherwise only the bare file name.
    """
    directory = path_of(location)
    collected = []
    for entry in listdir(directory):
        full_entry = join(directory, entry)
        if not isfile(full_entry):
            continue
        collected.append(full_entry if get_full_path else entry)
    return collected


def load_pkl(filename):
    """Unpickle and return the object stored at ``filename``.

    ``filename`` is resolved through ``path_of`` before opening.
    NOTE: ``pickle.load`` can execute arbitrary code; only load trusted files.
    """
    resolved = path_of(filename)
    # The ``with`` block closes the stream; no explicit close is required.
    with open(resolved, "rb") as stream:
        return pickle.load(stream)

def store_pkl(object, filename):
    """Pickle ``object`` into ``filename`` using the highest pickle protocol.

    ``filename`` is used exactly as given (it is not resolved via ``path_of``).
    """
    # The context manager closes the stream on exit.
    with open(filename, "wb") as stream:
        pickle.dump(object, stream, protocol=pickle.HIGHEST_PROTOCOL)

def is_valid_file(filename):
    """Return True when ``filename`` (resolved via ``path_of``) is an existing regular file."""
    return Path(path_of(filename)).is_file()


def to_same_shape(arr_of_items, required_shape):
    """Truncate or cyclically repeat ``arr_of_items`` to ``required_shape`` items.

    * Empty input: prints an error message and calls ``exit()``.
    * More items than required: returns the leading slice.
    * Otherwise: returns a new list that repeats the items in order,
      wrapping around, until it holds ``required_shape`` entries.
    """
    available = len(arr_of_items)
    if available == 0:
        print("Error: tried to make shape: ", required_shape, ", but item is empty...")
        exit()
    if available > required_shape:
        return arr_of_items[:required_shape]
    padded = []
    while len(padded) < required_shape:
        # The next source position is the current output length, wrapped.
        padded.append(arr_of_items[len(padded) % available])
    return padded

# Kg_encoder

In [None]:
from tqdm import tqdm
import numpy as np

#import helpers

# Path of the word-vector text file that get_vector() scans line by line.
# Presumably 300-dimensional vectors (avg_vec allocates np.zeros(300)) — TODO confirm.
vector_file= "/content/drive/MyDrive/sony_IITPatna/word_vector_300d.vec"

def get_unique_words(ctnr:set, phases:list):
    """Add every non-empty word found in ``phases`` to the set ``ctnr`` in place.

    ``phases`` is an iterable of word sequences; nothing is returned.
    """
    ctnr.update(
        token
        for phrase in phases
        for token in phrase
        if len(token) > 0
    )

def get_vector(word_vec:dict, words:set):
    """Fill ``word_vec`` with vectors for the requested ``words``.

    Scans ``vector_file`` line by line (with a tqdm progress bar). Each line
    is split on single spaces; the first field is the word and the remaining
    fields are parsed as floats. Only words present in ``words`` are stored,
    as ``numpy`` float arrays, and ``word_vec`` is mutated in place.
    """
    # The ``with`` block closes the file; no explicit close is needed.
    with open(vector_file) as source:
        for row in tqdm(source):
            token, *raw_values = row.split(" ")
            if token not in words:
                continue
            word_vec[token] = np.array([float(v) for v in raw_values], dtype=float)

def split_by_words(arr:list):
    """Split each string of ``arr`` on single spaces, dropping empty pieces.

    Returns a list with one word list per input string, in input order.
    """
    return [
        [piece for piece in text.split(" ") if len(piece) > 0]
        for text in arr
    ]
                    
def avg_vec(words_arr:list, word_vec:dict):
    """Average the vectors of the words in ``words_arr`` that exist in ``word_vec``.

    Words without an entry are ignored. When no word is found, a zero vector
    of length 300 is returned.
    """
    total = np.zeros(300)
    hits = 0
    for token in words_arr:
        vector = word_vec.get(token)
        if vector is None:
            continue
        total = total + vector
        hits += 1
    if hits > 0:
        return total / hits
    return total


def transE_single(head, relation, tail, word_vec):
    """Encode one (head, relation, tail) triple as ``head + relation - tail``.

    Each argument is a word list that is reduced to a single vector with
    ``avg_vec`` before the arithmetic.
    """
    head_v, relation_v, tail_v = (
        avg_vec(part, word_vec) for part in (head, relation, tail)
    )
    return head_v + relation_v - tail_v


def transE(head:list, relation:list, tail:list, word_vec:dict):
    """Encode parallel lists of triples with ``transE_single``.

    Args:
        head, relation, tail: Equal-length lists; element ``i`` of each list
            forms one triple.
        word_vec: Mapping from word to vector, forwarded to ``transE_single``.

    Returns:
        A ``numpy`` array holding one encoding per triple.

    Raises:
        ValueError: If the three lists do not have the same length.
    """
    if not (len(head) == len(relation) == len(tail)):
        # Raising a plain string is invalid in Python 3 (it produces an
        # unrelated TypeError), so raise a real exception with the details.
        raise ValueError(
            "head, relation and tail must have the same length "
            f"(got {len(head)}, {len(relation)}, {len(tail)})"
        )
    return np.array([
        transE_single(h, r, t, word_vec)
        for h, r, t in zip(head, relation, tail)
    ])




def encode(head:list, relation:list, tail:list):
    """Encode parallel lists of head / relation / tail phrases.

    Each phrase is split into words, the vectors for all distinct words are
    loaded with ``get_vector``, and the triples are encoded with ``transE``.
    """
    head, relation, tail = (
        split_by_words(part) for part in (head, relation, tail)
    )

    vocabulary = set()
    for phrases in (head, relation, tail):
        get_unique_words(vocabulary, phrases)

    embeddings = {}
    get_vector(embeddings, vocabulary)

    return transE(head, relation, tail, embeddings)



In [None]:
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm


import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Shared spaCy pipeline used by get_entities() and get_relation().
nlp = spacy.load('en_core_web_sm')




In [None]:
def get_entities(sent):
    """Extract a subject phrase and an object phrase from a sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline. Compound
    tokens and ``*mod`` tokens seen before a subject/object token are
    prepended to it as prefix and modifier.

    Args:
        sent: The sentence text.

    Returns:
        ``[subject, object]`` as stripped strings; either may be empty when
        no matching token is found.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""
    prv_tok_text = ""

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        if tok.dep_ == "punct":
            continue

        if tok.dep_ == "compound":
            prefix = tok.text
            # Chain consecutive compound tokens together.
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Substring membership is the intended check. The previous
        # ``find(...) == True`` only matched when the substring started at
        # index 1 (because ``True == 1``), which works for labels such as
        # "nsubj"/"dobj" purely by accident.
        if "subj" in tok.dep_:
            ent1 = modifier + " " + prefix + " " + tok.text
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = ""

        if "obj" in tok.dep_:
            ent2 = modifier + " " + prefix + " " + tok.text

        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]


def get_relation(sent):
    """Return the text of the last matcher hit anchored on the sentence ROOT.

    The pattern is the ROOT token optionally followed by a ``prep`` token,
    an ``agent`` token and an adjective. Raises ``IndexError`` when the
    matcher finds nothing.
    """
    doc = nlp(sent)

    root_pattern = [
        {'DEP':'ROOT'},
        {'DEP':'prep','OP':"?"},
        {'DEP':'agent','OP':"?"},
        {'POS':'ADJ','OP':"?"},
    ]
    matcher = Matcher(nlp.vocab)
    # NOTE(review): the three-argument ``add(key, None, pattern)`` form looks
    # like the spaCy v2 call signature — confirm against the installed version.
    matcher.add("matching_1", None, root_pattern)

    hits = matcher(doc)
    last_hit = hits[len(hits) - 1]
    return doc[last_hit[1]:last_hit[2]].text


def get_elements(sent_arr):
    """Extract (head, relation, tail) for every sentence in ``sent_arr``.

    Extraction is best-effort: when ``get_entities`` or ``get_relation``
    fails for a sentence, the placeholder string ``"None"`` is used for all
    three parts so the output lists stay aligned with the input.

    Args:
        sent_arr: Iterable of sentence strings.

    Returns:
        A tuple ``(head, relation, tail)`` of three equally long lists.
    """
    head= []
    tail= []
    relation= []

    for sen in tqdm(sent_arr):
        try:
            entity= get_entities(sen)
            rel= get_relation(sen)
        except Exception:
            # Catch ordinary failures only: a bare ``except`` would also
            # swallow KeyboardInterrupt/SystemExit and make a long run
            # impossible to interrupt.
            entity= ["None", "None"]
            rel= "None"
        head.append(entity[0])
        tail.append(entity[1])
        relation.append(rel)
    return head, relation, tail


def draw_graph(head, relation, tail):
    """Plot the triples as a directed multigraph.

    ``head`` and ``tail`` become source and target nodes, and ``relation``
    is stored as the ``edge`` attribute. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    edges = pd.DataFrame({'source':head, 'target':tail, 'edge':relation})
    graph = nx.from_pandas_edgelist(
        edges,
        "source",
        "target",
        edge_attr=True,
        create_using=nx.MultiDiGraph(),
    )

    plt.figure(figsize=(12,12))
    layout = nx.spring_layout(graph)
    nx.draw(
        graph,
        with_labels=True,
        node_color='skyblue',
        edge_cmap=plt.cm.Blues,
        pos=layout,
    )
    plt.show()


def inner_preprocess_sen(sen:str):
    """Strip trailing newlines, then surrounding spaces, then surrounding double quotes."""
    without_newline = sen.rstrip("\n")
    without_spaces = without_newline.strip(" ")
    return without_spaces.strip("\"")
def preprocess_sen(sen):
    """Clean a sentence, or every sentence of a list, with ``inner_preprocess_sen``.

    Args:
        sen: A string or a list of strings.

    Returns:
        The cleaned string, or a new list of cleaned strings.

    Raises:
        TypeError: If ``sen`` is neither a string nor a list.
    """
    if isinstance(sen, str):
        return inner_preprocess_sen(sen)
    if isinstance(sen, list):
        return [inner_preprocess_sen(s) for s in sen]
    # Raising a plain string is invalid in Python 3 and hid this message
    # behind an unrelated TypeError; raise a real exception instead.
    raise TypeError("Preprocess sen takes string or list of string...")


def build_and_encode(sentence_arr:list):
    """Clean the sentences, extract their triples and return the encodings.

    Pipeline: ``preprocess_sen`` -> ``get_elements`` -> ``encode``.
    """
    cleaned = [preprocess_sen(raw) for raw in sentence_arr]
    heads, relations, tails = get_elements(cleaned)
    return encode(heads, relations, tails)

In [None]:
# sen_arr= []
# lines_to_run_for, count= 5000, 0
# with open("./kg_ip.txt") as file:
#         for line in file:
#             count+=1
#             if count == lines_to_run_for: break
#             sen= preprocess_sen(line)
#             sen_arr.append(sen)
# file.close()
# h,r,t= get_elements(sen_arr)
# draw_graph(h,r,t)

# 01_extract_meta

In [None]:
from cgi import print_directory
from os import path
import csv
'''
0  :  
1  :  Unnamed: 0
2  :  Unnamed: 0.1
3  :  Unnamed: 0.1.1
4  :  Unnamed: 0.1.1.1
5  :  Unnamed: 0.1.1.1.1
6  :  Unnamed: 0.1.1.1.1.1
7  :  movie id
8  :  movie title
9  :  release date
10  :  video release date
11  :  IMDb URL
12  :  unknown
13  :  Action
14  :  Adventure
15  :  Animation
16  :  Children's
17  :  Comedy
18  :  Crime
19  :  Documentary
20  :  Drama
21  :  Fantasy
22  :  Film-Noir
23  :  Horror
24  :  Musical
25  :  Mystery
26  :  Romance
27  :  Sci-Fi
28  :  Thriller
29  :  War
30  :  Western
31  :  Summary
32  :  Cast
33  :  Director
34  :  Rating
35  :  Runtime
36  :  No. of ratings
37  :  YT-Trailer ID
'''
#items_main_file= path_of("/content/drive/MyDrive/sony_IITPatna/folder_1/items.csv")

# w2v_file= helpers.path_of("../word_vector_300d.vec")

# Parsed movie metadata keyed by the integer movie id (column 7 of items.csv).
op_dict= dict()

with open(r"/content/drive/MyDrive/sony_IITPatna/folder_1/items.csv") as file:
    for line_no, line in enumerate(file):
        # The first line is the header row.
        if line_no==0: continue
        # Each physical line is parsed as one CSV record.
        items= list(csv.reader([line]))[0]

        # Named ``movie_id`` so the builtin ``id`` is not shadowed at module scope.
        movie_id= int(items[7])
        # Columns 12..30 hold the 19 genre flags.
        genres= [int(x) for x in items[12:31]]

        director= items[33].strip(" ")
        if len(director)==0:
            print("skiping_for_director")
            continue

        rating= items[34].strip(" ")
        if len(rating)==0:
            print("skiping_for_ratings")
            continue
        try:
            rating= float(rating)
        except ValueError:
            # Only a non-numeric rating should trigger the fallback; a bare
            # ``except`` would also hide interrupts and unrelated errors.
            # NOTE(review): 8.7 is a hard-coded fallback value — confirm it is intended.
            rating = 8.7

        casts= items[32].strip(" ").split("|")
        # NOTE(review): ``str.split`` always returns at least one element, so
        # this branch cannot trigger as written; kept to preserve behavior.
        if len(casts) == 0:
            print("skiping_for_casts")
            continue

        summary= items[31]
        yt_id= items[37]

        op_dict[movie_id]= {
            "genre": genres,
            "director": director,
            "rating": rating,
            "casts": casts,
            "summary": summary,
            "yt-id": yt_id,
        }

# print(len(op_dict))
# object_op_path= path_of("../objs/meta_raw.obj")
# store_pkl(op_dict, object_op_path)

skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_director
skiping_for_d

In [None]:
# Number of movies kept in op_dict after the skipped rows.
len(op_dict)

1484

# Process Meta

In [None]:


# lines= ["how are you", "i am cool", "what about you"]

# res= kg_builder.build_and_encode(lines)
# print(res.shape)

#meta_raw_path= path_of("../objs/meta_raw.obj")
# Raw per-movie metadata; an alias of the op_dict built from items.csv.
meta_raw= op_dict

# Output path handed to store_pkl() by encode_everything().
meta_processed_op_oath= "/content/drive/MyDrive/sony_IITPatna/movielens_meta_processed.obj"

# Word-vector text file scanned by encode_director().
w2v_file_path = "/content/drive/MyDrive/sony_IITPatna/word_vector_300d.vec"

# Number of encoded summary lines kept per movie (passed to encode_description()).
lines_to_take= 3


def filter_and_preprocess_lines(lines_arr):
    """Trim surrounding spaces and keep only lines with at least four space-separated parts.

    Empty lines are dropped. Returns a new list in input order.
    """
    trimmed = (raw.strip(" ") for raw in lines_arr)
    return [
        text
        for text in trimmed
        if len(text) != 0 and len(text.split(" ")) >= 4
    ]

def concat_encoded(encoded_arr):
    """Flatten one level: concatenate the items of ``encoded_arr`` into a single list."""
    return [value for item in encoded_arr for value in item]

def avg_encode(encoded_arr):
    """Return the element-wise mean of the vectors in ``encoded_arr`` as a list.

    The output length is taken from the first vector. An empty
    ``encoded_arr`` raises ``IndexError``.
    """
    vector_count = len(encoded_arr)
    sums = [0] * len(encoded_arr[0])
    for vector in encoded_arr:
        for position, value in enumerate(vector):
            sums[position] += value
    return [total / vector_count for total in sums]



def encode_description(meta_raw, lines_to_take):
    """Encode the first ``lines_to_take`` usable summary lines of every movie.

    Each summary is split on ".", filtered with
    ``filter_and_preprocess_lines`` and padded or truncated with
    ``to_same_shape``. All lines are encoded in one ``build_and_encode``
    call, and each movie's encodings are concatenated into one flat list.

    Only ``lines_to_take`` lines per movie are encoded. Previously a fixed 8
    lines per movie were encoded and all but the first ``lines_to_take``
    discarded; that wasted work and, for ``lines_to_take > 8``, made the
    slice run into the next movie's lines. Each line is encoded
    independently, so results for ``lines_to_take <= 8`` are unchanged.

    Args:
        meta_raw: Mapping of movie id to a metadata dict with a "summary" key.
        lines_to_take: Number of summary lines to encode per movie.

    Returns:
        Dict mapping movie id to the concatenated line encodings.
    """
    all_movie_id= list(meta_raw.keys())
    # A negative request yields empty encodings rather than a negative slice.
    lines_per_movie= max(lines_to_take, 0)

    all_lines= []
    for movie_id in all_movie_id:
        description= meta_raw.get(movie_id).get("summary")
        lines_filtered= filter_and_preprocess_lines(description.split("."))
        all_lines.extend(to_same_shape(lines_filtered, lines_per_movie))

    all_encoded= build_and_encode(all_lines)

    res= dict()
    for ind, movie_id in enumerate(all_movie_id):
        # Movie ``ind`` owns one contiguous run of ``lines_per_movie`` rows.
        start= ind*lines_per_movie
        res[movie_id]= concat_encoded(all_encoded[start : start+lines_per_movie])
    return res



# encode_description(meta_raw, lines_to_take)



def encode_director(meta_raw):
    """Encode each movie's director name as the mean of two word vectors.

    The director name is split on spaces and lower-cased. Vectors for those
    tokens are read from ``w2v_file_path``; tokens without a vector are
    skipped. The vectors found are brought to exactly two with
    ``to_same_shape`` and averaged with ``avg_encode``.

    Returns a dict mapping movie id to the averaged vector (a list).
    """
    movie_ids = list(meta_raw.keys())

    def name_tokens(movie_id):
        # Lower-cased, space-separated parts of the director name.
        full_name = meta_raw.get(movie_id).get("director")
        return [part.lower() for part in full_name.split(" ")]

    # ``False`` marks a token whose vector has not been found (yet).
    token_vectors = {
        token: False
        for movie_id in movie_ids
        for token in name_tokens(movie_id)
    }

    # The ``with`` block closes the file; no explicit close is needed.
    with open(w2v_file_path) as source:
        for row in source:
            fields = row.rstrip("\n").split(" ")
            key = fields[0].lower()
            if key in token_vectors:
                token_vectors[key] = [float(x) for x in fields[1:]]

    director_encoding = {}
    for movie_id in movie_ids:
        found = [
            token_vectors.get(token)
            for token in name_tokens(movie_id)
            if token_vectors.get(token)
        ]
        director_encoding[movie_id] = avg_encode(to_same_shape(found, 2))

    return director_encoding

        

# encode_director(meta_raw)


def encode_genre(meta_raw):
    """Map every movie id to the value stored under its "genre" key."""
    return {
        movie_id: meta_raw.get(movie_id).get("genre")
        for movie_id in list(meta_raw.keys())
    }
        
# encode_genre(meta_raw)


def encode_rating(meta_raw):
    """Map every movie id to a one-element list holding its rating divided by 10."""
    return {
        movie_id: [meta_raw.get(movie_id).get("rating") / 10]
        for movie_id in list(meta_raw.keys())
    }

# encode_rating(meta_raw)


def encode_everything():
    """Build, pickle and return the full feature vector of every movie in ``meta_raw``.

    Each movie's vector is the concatenation, in this order, of its
    description, genre, director and rating encodings. The length of each
    vector is printed, the result is written to ``meta_processed_op_oath``
    with ``store_pkl``, and the dict is returned.
    """
    enc_desc = encode_description(meta_raw, lines_to_take)
    print("[*] Description encoding completed...\n")
    enc_genre = encode_genre(meta_raw)
    print("[*] Genre encoding completed...\n")
    enc_direc = encode_director(meta_raw)
    print("[*] Director encoding completed...\n")
    enc_rate = encode_rating(meta_raw)
    print("[*] Rating encoding completed...\n")

    metadata_encoded = {}
    for movie_id in list(meta_raw.keys()):
        merged = [
            *enc_desc.get(movie_id),
            *enc_genre.get(movie_id),
            *enc_direc.get(movie_id),
            *enc_rate.get(movie_id),
        ]
        print(len(merged))
        metadata_encoded[movie_id] = merged

    print(metadata_encoded.get(1))
    store_pkl(metadata_encoded, meta_processed_op_oath)
    return metadata_encoded

    #store_pkl(metadata_encoded, meta_processed_op_oath)
      

In [None]:
# Run the full encoding pipeline; this also pickles the result to meta_processed_op_oath.
a = encode_everything()

100%|██████████| 11872/11872 [04:07<00:00, 47.89it/s]
999995it [00:38, 25913.41it/s]


[*] Description encoding completed...

[*] Genre encoding completed...

[*] Director encoding completed...

[*] Rating encoding completed...

1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
1220
122