In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, RegexpTokenizer 
from nltk.stem import WordNetLemmatizer 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '#', printEnd = "\r"):
    """
    Print a one-line text progress bar to stdout.

    Parameters
    ----------
    iteration : current step
    total     : number of steps that make up 100%; 0 is reported as a
                fully completed bar instead of raising ZeroDivisionError
    prefix    : text printed before the bar
    suffix    : text printed after the percentage
    decimals  : number of decimals shown in the percentage
    length    : width of the bar in characters
    fill      : character used for the completed part of the bar
    printEnd  : terminator passed to print(); the default carriage return
                makes the next call overwrite the same line

    A newline is printed once iteration equals total.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Empty job: there is nothing left to do, so show the bar as complete.
        fraction = 1.0
        filledLength = length

    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [None]:
def tokenization(raw_text, tokenizer, stop_words):
    """
    Split raw text into lowercase tokens, leaving out stop words.

    raw_text   -- text to tokenize
    tokenizer  -- object with a tokenize(text) method returning the tokens
    stop_words -- collection of words to discard; tokens are compared after
                  lowercasing, so entries are expected to be lowercase

    Returns the lowercased tokens as a list, in their original order.
    """
    kept = []
    for token in tokenizer.tokenize(raw_text):
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return kept

def lemmatization(tokens, lemmatizer):
    """
    Map every token to its lemma.

    tokens     -- iterable of tokens
    lemmatizer -- object with a lemmatize(token) method

    Returns a list with one lemma per token, in the same order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def bag_of_wordization(lemmas):
    """
    Count how many times each lemma occurs.

    lemmas -- iterable of hashable lemmas

    Returns a dict mapping each distinct lemma to its number of occurrences,
    with keys in order of first appearance.
    """
    counts = {}
    for lemma in lemmas:
        counts[lemma] = counts.get(lemma, 0) + 1
    return counts

In [None]:
# The pattern matches runs of ASCII letters only (assumes RegexpTokenizer's default of
# treating the pattern as the token rather than the separator — TODO confirm).
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
# NLTK's English stop-word list, stored as a set for the membership test in tokenization().
stop_words = set(stopwords.words('english')) 
# Lemmatizer instance that is passed to lemmatization().
lemmatizer = WordNetLemmatizer()

In [None]:
# Load the MPST movie plot synopses dataset; later cells read its 'title', 'plot_synopsis' and 'tags' columns.
input_df = pd.read_csv('/kaggle/input/mpst-movie-plot-synopses-with-tags/mpst_full_data.csv')

In [None]:
# Keep only the rows whose plot_synopsis is shorter than the 90th-percentile length.
# NOTE: input_df is overwritten, so re-running this cell trims the data again.
synopsis_lengths = input_df.plot_synopsis.str.len()
length_cutoff = synopsis_lengths.quantile(0.9)
input_df = input_df[synopsis_lengths < length_cutoff]

In [None]:
# Preview the rows whose title contains "Paranormal" (case-sensitive).
paranormal_mask = input_df['title'].str.contains('Paranormal')
input_df.loc[paranormal_mask]

In [None]:
# Collects one (title, bag_of_words) tuple per selected movie; reset every time this cell runs.
bag_of_words_list = []


def cond(i, row, bag_of_words_list,
         keywords=('harry', 'paranormal', 'ring', 'war', 'scary', 'star'),
         index_limit=400):
    """
    Decide whether a movie row should be turned into a bag of words.

    i                 -- row identifier as yielded by DataFrame.iterrows()
                         (the index label, not the position)
    row               -- mapping with at least a 'title' entry
    bag_of_words_list -- list of (title, bag_of_words) tuples already collected
    keywords          -- lowercase substrings; a title containing any of them
                         is selected. Matching is plain substring matching, so
                         'ring' also matches e.g. 'spring'.
    index_limit       -- rows with i below this value are always selected

    Returns False when the title is already in bag_of_words_list; otherwise
    True when i < index_limit or the lowercased title contains a keyword,
    and False in every other case.
    """
    # Never process the same title twice.
    if row['title'] in [title for title, _ in bag_of_words_list]:
        return False

    # Rows below the limit are kept regardless of their title.
    if i < index_limit:
        return True

    # Lower-case once and test all keywords against the same string.
    lowered_title = row['title'].lower()
    return any(keyword in lowered_title for keyword in keywords)
    
# n counts every row of input_df, although only the rows accepted by cond() are processed.
n = len(input_df)
print("Bag of wording {} movies".format(n))
# The bar is drawn only at 0% and 100%; nothing is reported while the loop runs.
printProgressBar(0, n, prefix = 'Progress:', suffix = 'Complete', length = 50)

for i, row in input_df.iterrows():
    # Skip rows rejected by the selection rule (this includes titles already collected).
    if not cond(i, row, bag_of_words_list):
        continue

    # The tags are appended to the synopsis so they are counted as words too.
    raw_text = row['plot_synopsis'] + " " + row['tags']
    lemmas = lemmatization(tokenization(raw_text, tokenizer, stop_words), lemmatizer)
    bag_of_words_list.append((row['title'], bag_of_wordization(lemmas)))

printProgressBar(n, n, prefix = 'Progress:', suffix = 'Complete', length = 50)

In [None]:
n = len(bag_of_words_list)
print("Loading {} movies".format(n))
printProgressBar(0, n, prefix = 'Progress:', suffix = 'Complete', length = 50)

# Build the movie-by-term count matrix with a single constructor call.
# The previous row-by-row DataFrame.append() was quadratic in the number of
# movies, and DataFrame.append() no longer exists in pandas 2.0+.
# Each dict becomes one row; columns are the union of all lemmas, and a lemma
# missing from a movie is NaN until the fillna(0) below.
movie_names = [name for name, _ in bag_of_words_list]
movie_bows = [bow for _, bow in bag_of_words_list]
df = pd.DataFrame(movie_bows, index=movie_names)

df = df.fillna(0)
printProgressBar(n, n, prefix = 'Progress:', suffix = 'Complete', length = 50)

In [None]:
# Display the movie-by-term count matrix (rows: movie titles, columns: lemmas).
df

In [None]:
import plotly.express as px

from sklearn import preprocessing
from sklearn.manifold import TSNE

In [None]:
# One 2-D t-SNE embedding of the count matrix per perplexity value, keyed by that value.
X_embedded = dict()
perplexity_range = [150,200,300]

for perplexity in perplexity_range:
    print(perplexity)
    # Use the loop's perplexity: it was previously hard-coded to 20, so every
    # embedding was computed with the same setting and the plot titles did not
    # describe the data shown. scikit-learn requires perplexity to be smaller
    # than the number of rows in df.
    # random_state makes the embeddings reproducible across runs.
    # The iteration count is left at the library default of 1000, the value
    # passed explicitly before; the keyword was renamed from n_iter to max_iter
    # in scikit-learn 1.5, so omitting it works on both sides of the rename.
    X_embedded[perplexity] = TSNE(perplexity=perplexity, random_state=42).fit_transform(df)

In [None]:
# Wrap each embedding in a DataFrame that carries the movie titles as its index.
df_embedded = {
    perplexity: pd.DataFrame(data=X_embedded[perplexity], index=df.index)
    for perplexity in perplexity_range
}

In [None]:
# (substring searched in the upper-cased title, saga label). Entries are applied in
# order, so a later entry wins when a title matches more than one.
saga_patterns = [
    ('HARRY POTTER', "Harry Potter"),
    ('LORD OF THE RING', "Lord of The Rings"),
    ('STAR WARS', "Star Wars"),
    ('SCARY MOVIE', "Scary Movie"),
    ('PARANORMAL ACT', "Paranormal Activity"),
    ('STAR TREK', "Star Trek"),
]

for perplexity in perplexity_range:
    # reset_index() moves the movie titles into a column named 'index'.
    df_plot = df_embedded[perplexity].reset_index()
    df_plot['Sagas'] = ""

    titles_upper = df_plot['index'].str.upper()
    for pattern, saga in saga_patterns:
        df_plot.loc[titles_upper.str.contains(pattern), 'Sagas'] = saga

    # One scatter plot per perplexity, coloured by saga.
    fig = px.scatter(df_plot, x=0, y=1, hover_name='index', color='Sagas', title='Perplexity: {}'.format(perplexity))
    fig.show()
    


In [None]:
# Character length of the first synopsis stored for "The Prestige".
prestige_synopses = input_df.loc[input_df['title']=='The Prestige', 'plot_synopsis'].to_list()
len(prestige_synopses[0])

In [None]:
# All synopses stored under the exact title "The A-Team", as a plain list.
a_team_mask = input_df['title']=="The A-Team"
input_df.loc[a_team_mask, 'plot_synopsis'].to_list()

In [None]:
# Rows whose title contains "Star Trek" (case-sensitive).
star_trek_mask = input_df['title'].str.contains("Star Trek")
input_df[star_trek_mask]