# Movie Title Generation

In [22]:
import jieba
# import gensim
import pickle
import numpy as np
import os
import json
import pprint as pp
import re
from scipy import spatial
from TranslationTool.langconv import *
from hanziconv import HanziConv
from pycorenlp import StanfordCoreNLP

### Loading Models
**Don't run this block twice!!**

In [23]:
def loading():
    # load stopwords
    with open("stopwords.txt", encoding='utf8') as fp:
        dat = fp.read()
    global stop_words
    stop_words = dat.split('\n')
    del stop_words[-1]

    # load word2vec model
    global model
    # model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary = True, unicode_errors = 'ignore')
    model = pickle.load(open("model.pkl", "rb"))

    # load idf list
    global idf
    with open('idf.txt', encoding='utf8') as fp:
        dat = fp.read()
    lines = dat.split('\n')
    del lines[-1]
    idf = [(l.split()[0], float(l.split()[1])) for l in lines]
    
    # connect to StanfordCoreNLP
    global nlp
    nlp = StanfordCoreNLP('http://140.113.193.76:9000')
#     nlp = StanfordCoreNLP('http://140.113.208.179:8002')
    
    # feature database of evaluation part
    global f1_dict, mv_list_vec, f3_dict
    f1_dict = pickle.load(open("f1_dict.pkl", "rb"))
    mv_list_vec = pickle.load(open("f2_dict.pkl", "rb"))
    f3_dict = pickle.load(open("f3_dict.pkl", "rb"))
    
    return

# global variable with empty value as initialization
model = None
# model_path = "cna_asbc_cbow_d300_w10_n10_hs0_i15.vectors.bin"
nlp = None

stop_words = []
idf = []

f1_dict = {}
mv_list_vec = []
f3_dict = {}

# loading
loading()

### Reading File and Preprocessing

In [24]:
def check_contain_chinese(check_str):
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False

def readFile(path):
    
    segs = []
    
    with open(path, 'r', encoding="utf8") as f:
        for line in f:
            if check_contain_chinese(line):
                line = re.sub(r'[^\w]','',line) 
                seg_list = jieba.cut(line)
                if seg_list:
                    for s in seg_list:
                        if check_contain_chinese(s):
                            segs.append(s)
    
#     with open(path, encoding='utf8') as fp:
#         dat = fp.read()
    
#     lines = dat.split('\n')
#     del lines[-1]
    
#     # segmentation
#     segs = []
#     for line in lines:
#         tmp = [Converter('zh-hant').convert(w) for w in line.split('/')[:-1]]
#         segs.extend(tmp)
#         # jieba.cut(line, cut_all=False)
    
    return segs

### TFIDF Computing

In [25]:
def tfGen(segs):
    
    # compute tf (word count and frequency)
    words, counts = np.unique(segs, return_counts=True) # default: axis=None
    frequency = counts / len(segs)
    tf = list(zip(words, frequency))
    tf = sorted(tf, key=lambda x: x[1], reverse=True)
    
    return tf

def tfidfGen(tf):
    
    idf_dict = {word: value for word, value in idf}
    
    words = []
    values = []
    for word, value in tf:
        words.append(word)
        if word in idf_dict:
            values.append(value * idf_dict[word])
        else:
            values.append(value * 8)

    tfidf = list(zip(words, values))
    tfidf = sorted(tfidf, key=lambda x: x[1], reverse=True)
    
    return tfidf

### Keyword Generation

In [26]:
def keywordExt(tfidf):
    
    global model, stop_words, num_keywords
    
    i = 0
    word_ls = []
    for word, count in tfidf:
        if i == num_keywords:
            break
        if word in stop_words:
            continue
        if word not in model:
            continue
        word_ls.append(word)
        i += 1
    
    return word_ls

def keywordSel(word_ls):
    
    # remove NR
    global nlp, stop_words
    
    new_word_ls = []
    for word in word_ls:
        
        if len(word) < 2 or len(word) > 3:
            continue
        
        flag = False
        for w in stop_words:
            if w in word:
                flag = True
        if flag:
            continue
        
        output = nlp.annotate(word, properties={
            'annotators': 'pos',
            'outputFormat': 'json'
        })
        pos = output['sentences'][0]['tokens'][0]['pos']
        if pos != 'NR':
            new_word_ls.append((word, pos))
            
    return new_word_ls

### Genre Classification (feature base, which is not used now)
 - feature generation
 - SVM classification model

In [27]:
# global variable with predefined value
num_keywords = 40

def featureGen(word_ls):
    
    # add feature from word2vec
    
    feature = list(np.zeros(300))
    for word in word_ls:
        feature += model.word_vec(word)
    
    return feature

def featureGen2(tfidf):
    
    global word_ls
    
    tfidf_dict = {word: value for word, value in tfidf}
    feature = []
    for word in word_ls:
        if word in tfidf_dict:
            feature.append(tfidf_dict[word])
        else:
            feature.append(0)
    
    
    return feature

### Genre Classification
 - rule base
 - with corresponding to keywords

In [35]:
# global variable with predefined value
# keyword definition: [[keyword, ...], [key alphabet, ...]]
script_keyword = {
    'action': [['特工', '殺手'], ['警', '賭']],
    'comedy': [['嘿咻'], ['裸', '妓', '屁']],
    'crime': [['暴力', '受害者', '罪犯', '犯罪'], ['毒']],
    'drama': [[], []],
#     'fantasy': [[], ['獸']],
    'super': [['征服', '變身', '換裝', '英雄', '超人', '拯救', '超能力'], []],
    'horror': [[], ['怪', '屍', '鬼']],
    'romance': [[], ['愛', '戀']],
    'sci_fi': [['星球', '星際', '太空', '時空', '星艦', '宇宙'], []],
    'war': [['坦克', '地雷'], ['軍', '戰']]
}

def ruleBaseClassify(word_ls):
    # print(word_ls)
    
    for word in word_ls:
        for key, content in script_keyword.items():
            if word in content[0]:
                return key
    
    for word in word_ls:
        for key, content in script_keyword.items():
            for sw in content[1]:
                if sw in word:
                    return key
    
#     for key, content in script_keyword.items():
#         if content[0] == []:
#             continue
#         for sw in content[0]:
#             if sw in word_ls:
#                 return key
    
#     word_ls_str = ''.join(word_ls)
#     for key, content in script_keyword.items():
#         if content[1] == []:
#             continue
#         for sw in content[1]:
#             if sw in word_ls_str:
#                 return key
    
    return 'drama'

### Title Candidate Generation

In [52]:
# global variable with predefined value
# rule definition: [POS, SPECIAL_WORD, ReverseOrNot]
special_rule = {
    'action': [['NN', '玩命', True], ['NN', '啟動', False], ['NN', '神鬼', True], ['NN', '遊戲', False]],
    'comedy': [['NN', '行不行', False]],
    'crime': [['NN', '檔案', False], ['NN', '風暴', False], ['NN', '風雲', False]],
    'drama': [['NN', '神鬼', True], ['NN', '佚事', False]],
#     'fantasy': [['NN', '神鬼', True]],
    'super': [['NN', '聯盟', False]],
    'horror': [['NN', '絕命', True], ['NN', '失控', True], ['VV', '鬼', True]],
    'romance': [['NN', '真愛', True]],
    'sci_fi': [['NN', '星際', True]],
    'war': [['NN', '重生', False], ['NN', '救援', False]]
}

def block(s1, s2):
    
    # too similar
    if s1 in model and s2 in model:
        if model.wv.similarity(s1, s2) > 0.5:
            # print('Too similar: %s, %s' %(s1, s2))
            return True
    
    # contain the same alphabet
    for ele in s1:
        if ele in s2:
            # print('Contain the same alphanet: %s, %s' %(s1, s2))
            return True
        
    return False

def titleCanGen(genre, word_pos_ls):
    
    global special_rule
    
    pos_dict = {}
    for word, pos in word_pos_ls:
        try:
            pos_dict[pos].append(word)
        except:
            pos_dict[pos] = [word]
    
    candidates = []
    
    for rule in special_rule[genre]:
        pos = rule[0]
        for word in pos_dict[pos]:
            if block(rule[1], word):
                continue
            if '之' in word:
                candidates.append(rule[1]+word)
            elif rule[2]:
                candidates.append(rule[1]+word)
            else:
                candidates.append(word+rule[1])
                
    
#     if genre is "drama":
#          for word1 in pos_dict['NN']:
#                 for word2 in pos_dict['NN']:
#                     if block(word1, word2):
#                         continue
#                     candidates.append(word1+word2)
    
    return candidates

### Title Evaluation

In [30]:
# global variable with predefined value
special_word = [HanziConv.toSimplified(rule[1]) for key, content in special_rule.items() for rule in content]
pos_simpify_dic = {'NN': 'N', 'NR': 'N', 'NT': 'N', 'VE': 'V', 'VV': 'V'}

############################## parse.py ##############################

def Simpify(pos):
    global pos_simpify_dic
    if pos in pos_simpify_dic.keys():
        return pos_simpify_dic[pos]
    return pos

def Parse_String(mvname):
    parse_results = []
    # convert to simple chinese
    text = HanziConv.toSimplified(mvname)
    # parse by core nlp
    global nlp
    output = nlp.annotate(text, properties={'annotators': 'tokenize, ssplit, pos', 'outputFormat': 'json'})
    parse_results.append(output)
    return parse_results

def Get_Parse_Result(parse_results):
    # print(parse_results[0]['sentences'][0]['parse'])
    l = []
    for i in range(len(parse_results)):
        for word in parse_results[i]['sentences'][0]['tokens']:
            global special_word
            if word['word'] in special_word:
                l.append((word['word'], 'N'))
            else:
                l.append((word['word'], Simpify(word['pos'])))
    return l

def POStag(name):
    parse_results = Parse_String(name)
    l = Get_Parse_Result(parse_results)
    return l

############################## feature2.py ##############################

def cosine(v1, v2):
    res = 1 - spatial.distance.cosine(v1, v2)
    return res

def Create_mv_vector(mv_name):
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # model = models.Word2Vec.load('med250.model.bin')
    global model
    
    sum_vec = [0] * 250
    for word in mv_name:
        try:
            v = model.wv[word]
            sum_vec = sum_vec + v
        except KeyError:
            sum_vec = sum_vec
    
    return sum_vec

def Most_similar(mv_name):
    mv_name = [HanziConv.toTraditional(x) for x in mv_name]
    mv_vec = Create_mv_vector(mv_name)

    # caculate mv_name_vector
    global mv_list_vec
    # mv_list_vec = pickle.load(open("f2_dict.pkl", "rb"))
    
    # Similarity with those movies
    cosine_list = []
    for mv in mv_list_vec:
        cosine_list.append(cosine(mv_vec, mv))
        # print(cosine(mv_vec, mv))
    return max(cosine_list)

############################## main.py ##############################

def evaluation(mv_name):

    # print(mv_name)
    mv_name = HanziConv.toSimplified(mv_name)
    
    global f1_dict, f3_dict, model
    
    # POS tag
    parse_result = POStag(mv_name)
    # print(parse_result)

    # f1 score
    mv_words = [x[0] for x in parse_result]
    pos_form = [x[1] for x in parse_result]
    pos_form = tuple(pos_form)
    
    if pos_form in f1_dict:
        f1 = f1_dict[pos_form]
    else:
        f1 = 0

    # f2 score
    f2 = Most_similar(mv_words)
    if np.isnan(f2):
        f2 = 0

    # f3 score
    # words = [x[0] for x in parse_result]
    f3 = 0
    for word in mv_words:
        if word in f3_dict:
            f3 = f3 + 0.2 + f3_dict[word]
        else:
            f3 = 0

    # print("f1: %f, f2: %f, f3: %f" %(f1, f2, f3))
    score = f1 + f2 + f3
    # print("score = ", score)
    
    return score

### Title Generation
 - main process of title generation
 - input: a file path
 - output: a sorted title list

In [43]:
def titleGen(path):
    
    # read file
    segs = readFile(path)
    
    # tfidf
    tf = tfGen(segs)
    tfidf = tfidfGen(tf)

    # keyword
    word_ls = keywordExt(tfidf)
    word_pos_ls = keywordSel(word_ls)
    word_ls = [word for word, pos in word_pos_ls]
#     pp.pprint(word_ls)

    # get genre
    genre = ruleBaseClassify(word_ls)
#     print("%-10s" %genre, end='')

    # title candidate generation
    title_candidates = titleCanGen(genre, word_pos_ls)
#     pp.pprint(title_candidates)

    # evaluation
    title_score = [(title, evaluation(title)) for title in title_candidates]
    title_score = sorted(title_score, key=lambda x: x[1], reverse=True)

    return title_score, genre

### Title Selection
Give some random probability when choosing the final title.

In [58]:
def titleSel(title_score, genre):
    
    global special_rule
    
    lottery = []
    
    for rule in special_rule[genre]:
        for i, (title, score) in enumerate(title_score):
            if i > 10:
                break
            if rule[1] in title:
                lottery.append(i)
                break
    
    rand = np.random.choice(len(lottery))
    
    return lottery[rand]

### Task 1 Main Process
Put all the testing script file in one folder, and then find the best title for each movie.

In [61]:
# folder path
folder = "input"
files = os.listdir(folder)
output_ls = []

for file in files:
    path = folder + '/' + file
    title_score, genre = titleGen(path)
    idx = titleSel(title_score, genre)
    print("%-10s%-50s%-10s%-10f%-5d" %(genre, file, title_score[idx][0], title_score[idx][1], idx))
    output_ls.append(file.replace('.srt', '')+'\t'+title_score[idx][0])

filename = "task1_group7.txt"
with open(filename, 'w', encoding="utf8") as fp:
    fp.write('\n'.join(output_ls)+'\n')

  if sys.path[0] == '':
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


drama     1.srt                                             黑道佚事      0.910647  0    
romance   2.srt                                             真愛電影      0.951503  0    
drama     3.srt                                             乙班佚事      0.780227  10   


### Task 2 Main Process

In [33]:
filename = "task2_input.txt"
with open(filename, 'r', encoding="utf8") as fp:
    dat = fp.read()

lines = [line.split('\t') for line in dat.split('\n') if line != '']

for i, (idx, title) in enumerate(lines):
    score = evaluation(title)
    lines[i].append(score)

title_sorted = sorted(lines, key=lambda x: x[2], reverse=True)
pp.pprint(title_sorted)

filename = "task2_group7.txt"
with open(filename, 'w') as fp:
    tmp = [ele[0] for ele in title_sorted]
    fp.write('\n'.join(tmp)+'\n')

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


[['1', '美女與野獸', 1.6192695806755202],
 ['4', '遠山戀人', 1.5851929310885704],
 ['3', '正義聯盟', 1.4546812367109343],
 ['5', '英雄傳說', 1.4096095531229911],
 ['16', '神鬼城市', 1.3952205212761872],
 ['2', '死亡筆記本', 1.3409252141755643],
 ['8', '小蘋果', 0.64399005491279393],
 ['9', '食物好吃', 0.50193247611417313],
 ['7', '可愛的威廉', 0.45059613577058788],
 ['10', '你好嗎', 0]]


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [62]:
folder_ls = os.listdir("../11")
file_ls_ls = [os.listdir("../11/"+folder) for folder in folder_ls]   # default: path=None
# print([len(file_ls) for file_ls in file_ls_ls])

filename_ls = []
for i, folder in enumerate(folder_ls):
    for filename in file_ls_ls[i]:
        if filename in filename_ls:
            continue
        path = "../11/" + folder + "/" + filename
        try:
            title_score, genre = titleGen(path)
            idx = titleSel(title_score, genre)
            print("%-10s%-50s%-10s%-10f%-5d" %(genre, filename, title_score[idx][0], title_score[idx][1], idx))
        except:
            print("Empty!! ", filename)
        filename_ls.append(filename)

  if sys.path[0] == '':
  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


war       Afanda.txt                                        騎士救援      1.359132  0    


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


action    Alien.txt                                         神鬼大夫      1.181068  10   
drama     Baahubali 2_ The Conclusion.txt                   神鬼勞爾      0.812281  0    
drama     Big Hero 6.txt                                    芥末佚事      0.876833  1    
action    Bonnie and Clyde.txt                              女服遊戲      1.256678  0    
war       Captain America_ Civil Wa.txt                     戰士重生      0.971355  9    
action    Chugyeogja.txt                                    混蛋遊戲      1.452126  0    
Empty!!  Dangal.txt
super     Deadpool.txt                                      死侍聯盟      1.503168  0    
action    Die Hard.txt                                      牛仔遊戲      1.403586  0    
war       Dunkirk.txt                                       漲潮救援      1.156566  0    
war       Edge of Tomorrow.txt                              戰士救援      1.278029  0    
sci_fi    Guardians of the Galaxy Vol. 2.txt                星際地球      1.358349  0    
sci_fi    Guardians of the Galaxy.

drama     Dead Poets Society.txt                            俱樂部佚事     1.027244  1    
Empty!!  In Bruge.txt
drama     Jab We Me.txt                                     女孩佚事      0.846770  2    
romance   Kal Ho Naa Ho.txt                                 真愛初戀      1.004945  0    
drama     La Dolce Vita.txt                                 外套佚事      0.868729  1    
drama     La La Land.txt                                    佚事之城      1.317270  3    
drama     Life Is Beautiful.txt                             公主佚事      1.053171  3    
drama     Manhattan.txt                                     公司佚事      0.856618  1    
Empty!!  Mr. Smith Goes to Washington.txt
Empty!!  My Sassy Girl.txt
sci_fi    PK.txt                                            星際醉漢      1.043975  0    
drama     Sing Stree.txt                                    音樂佚事      1.014978  2    
war       The Big Lebowsk.txt                               安息日重生     0.789014  6    
drama     The Intouchable.txt                    

horror    Gremlin.txt                                       失控怪物      1.228125  0    
crime     I Am Legend.txt                                   蝴蝶風雲      1.370370  0    
sci_fi    I.txt                                             星際密碼      1.363598  0    
crime     Invasion of the Body Snatche.txt                  脊髓炎風暴     1.296159  3    
drama     Let the Right One In.txt                          頭豬佚事      0.847702  2    
drama     Marty.txt                                         神鬼殉道者     0.856085  1    
horror    Night of the Living Dead.txt                      絕命怪物      1.193925  2    
crime     Only Lovers Left Alive.txt                        毒蠅風暴      1.280215  1    
drama     Poltergei.txt                                     內褲佚事      0.864489  1    
drama     Psycho.txt                                        倫敦佚事      0.959405  2    
war       Re-Animato.txt                                    風暴重生      0.984047  8    
drama     Rosemary's Baby.txt                         