In [None]:
import pandas as pd
import numpy as np
import statistics
import gensim
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
import scipy

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

In [None]:
# Load the pretrained 't5-small' checkpoint and its tokenizer; both are used by
# the summary-generation loop later in the notebook.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [None]:
# Run gensim's glove2word2vec on the GloVe text file at `path`, writing the
# result to ./glove.6B.200d.txt in the working directory.
# `final` holds the call's return value; it is not referenced again in the visible cells.
path = "../input/glove-global-vectors-for-word-representation/glove.6B.200d.txt"
final = glove2word2vec(path ,"./glove.6B.200d.txt")

In [None]:
# Load the converted file as gensim KeyedVectors. `glove` is the word -> vector
# lookup used by cosine_distance_wordembedding_method.
glove = gensim.models.KeyedVectors.load_word2vec_format("./glove.6B.200d.txt")

In [None]:
def cosine_distance_wordembedding_method(sent1, sent2, embeddings=None):
    """Return the cosine similarity between two tokenised sentences.

    Each sentence is represented by the element-wise mean of the embedding
    vectors of its in-vocabulary tokens. Despite the function's name, the
    value returned is a similarity (``1 - cosine distance``), not a distance.

    Parameters
    ----------
    sent1, sent2 : iterable of str
        Tokens of the two sentences. Tokens missing from the vocabulary are
        ignored.
    embeddings : mapping, optional
        Anything supporting ``word in embeddings`` and ``embeddings[word]``.
        Defaults to the notebook-level ``glove`` vectors.

    Returns
    -------
    float
        The cosine similarity, or ``nan`` when either sentence has no
        in-vocabulary token (there is no vector to compare in that case).
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.spatial` is loaded on every SciPy version.
    from scipy.spatial import distance

    if embeddings is None:
        embeddings = glove

    known_1 = [embeddings[word] for word in sent1 if word in embeddings]
    known_2 = [embeddings[word] for word in sent2 if word in embeddings]

    # np.mean of an empty list is a scalar NaN (plus a RuntimeWarning), which
    # scipy's cosine() either rejects or propagates depending on the version.
    if not known_1 or not known_2:
        return float("nan")

    vector_1 = np.mean(known_1, axis=0)
    vector_2 = np.mean(known_2, axis=0)
    return 1 - distance.cosine(vector_1, vector_2)

In [None]:
# Read the two raw CSVs: movies_metadata.csv into df1 and
# wiki_movie_plots_deduped.csv into df2.
df1 = pd.read_csv("../input/the-movies-dataset/movies_metadata.csv")
df2 = pd.read_csv("../input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv")
# Preview the first rows of df1.
df1.head()

In [None]:
# Rename df1's "title" column to "Title", the key the merge with df2 joins on.
df1 = df1.rename({"title": "Title"}, axis="columns")
df1.head(5)

In [None]:
# Preview the first rows of df2 (read from wiki_movie_plots_deduped.csv).
df2.head()

In [None]:
# Inner-join df1 and df2 on their shared "Title" column.
# NOTE(review): joining on the title alone may pair different films that share
# a title — confirm this is intended.
df3 = df1.merge(df2, how="inner", on="Title")
df3.head(5)

In [None]:
# Number of missing values in each column of the merged frame.
df3.isna().sum()

In [None]:
# Keep only the rows that have an "overview" value, then report (rows, columns).
df3 = df3.dropna(axis="index", how="any", subset=["overview"])
df3.shape

In [None]:
# Word count (split on single spaces) of every overview in df3.
# Non-string entries are skipped with an explicit type check rather than a
# bare `except:`, which would also hide unrelated errors and KeyboardInterrupt.
lengths = [
    len(text.split(" "))
    for text in df3.overview
    if isinstance(text, str)
]

In [None]:
# Average number of space-separated words per overview.
statistics.mean(lengths)

In [None]:
# Word count (split on single spaces) of every plot in df3.
# Non-string entries are skipped with an explicit type check rather than a
# bare `except:`, which would also hide unrelated errors and KeyboardInterrupt.
lengths1 = [
    len(text.split(" "))
    for text in df3.Plot
    if isinstance(text, str)
]

In [None]:
# Average number of space-separated words per plot.
statistics.mean(lengths1)

In [None]:
# Keep the rows whose "Release Year" is 1995.
# The comparison is numeric so it works whether the column was parsed as
# integers or as strings; a string-only `isin(["1995"])` does not match
# integer years on current pandas. Values that cannot be parsed become NaN
# and are excluded.
df_1995 = df3[pd.to_numeric(df3["Release Year"], errors="coerce").eq(1995)]
df_1995.head()

In [None]:
# (rows, columns) of the 1995 subset.
df_1995.shape

In [None]:
# Collect every plot in df3 whose word count (split on single spaces) is
# strictly between 300 and 400.
# Non-string entries are skipped with an explicit type check rather than a
# bare `except:`, which would also hide unrelated errors and KeyboardInterrupt.
plots = [
    plot
    for plot in df3.Plot
    if isinstance(plot, str) and 300 < len(plot.split(" ")) < 400
]

In [None]:
# Restrict the 1995 subset to the plots selected in `plots`.
# .copy() makes this an independent frame: later cells add columns to it, and
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
df_1995_final = df_1995[df_1995["Plot"].isin(plots)].copy()
df_1995_final.shape

In [None]:
# Generate one T5 summary per plot in df_1995_final, in row order.
generated_overview = []
device = torch.device('cpu')
for position, plot_text in enumerate(df_1995_final.Plot):
    try:
        # Every input is prefixed with the "summarize: " task prompt.
        t5_prepared_Text = "summarize: " + plot_text
        tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)
        summary_ids = model.generate(tokenized_text,
                                    num_beams=4,
                                    no_repeat_ngram_size=2,
                                    min_length=40,
                                    max_length=100,
                                    early_stopping=True)
        output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as exc:
        # Silently skipping a row would leave this list shorter than
        # df_1995_final, and the later column assignment needs exactly one
        # summary per row. Fail here, with the offending position, instead.
        # (A bare `except:` would also swallow KeyboardInterrupt.)
        raise RuntimeError(
            f"T5 summarisation failed for the plot at position {position}"
        ) from exc
    generated_overview.append(output)
    

In [None]:
# Number of summaries produced; the next cell assigns this list as a column of
# df_1995_final, so it must equal the frame's row count.
len(generated_overview)

In [None]:
# Attach the generated summaries as a new column.
# assign() returns a new frame, so nothing is written into the filtered slice
# df_1995_final was created from (no SettingWithCopyWarning), and re-running
# the cell gives the same result.
df_1995_final = df_1995_final.assign(generated_overview=generated_overview)
df_1995_final.head()

In [None]:
# Split the reference overview and the generated overview into tokens on
# single spaces. assign() builds a new frame rather than writing into a
# filtered slice, which avoids pandas' SettingWithCopyWarning.
df_1995_final = df_1995_final.assign(
    overview_tokens=df_1995_final["overview"].apply(lambda text: text.split(" ")),
    generated_overview_tokens=df_1995_final["generated_overview"].apply(
        lambda text: text.split(" ")
    ),
)
df_1995_final.head()

In [None]:
# Score each generated overview against its reference overview.
# Zipping the two token columns avoids a row-wise DataFrame.apply(axis=1),
# which builds a Series per row and returns a DataFrame when the frame is
# empty. assign() builds a new frame rather than writing into a filtered slice.
df_1995_final = df_1995_final.assign(
    cos_sim=[
        cosine_distance_wordembedding_method(generated_tokens, reference_tokens)
        for generated_tokens, reference_tokens in zip(
            df_1995_final["generated_overview_tokens"],
            df_1995_final["overview_tokens"],
        )
    ]
)
df_1995_final.head()

In [None]:
# Mean of the cos_sim column over all rows of df_1995_final.
df_1995_final["cos_sim"].mean()

In [None]:
# First row: reference overview, then the generated overview.
print(df_1995_final["overview"].iloc[0], "\n\n", df_1995_final["generated_overview"].iloc[0])

In [None]:
# Second row: reference overview, then the generated overview.
print(df_1995_final["overview"].iloc[1], "\n\n", df_1995_final["generated_overview"].iloc[1])

In [None]:
# Third row: reference overview, then the generated overview.
print(df_1995_final["overview"].iloc[2], "\n\n", df_1995_final["generated_overview"].iloc[2])