In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # dirname is the directory currently being visited; join it with the file name
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Configure logging so gensim's progress messages can be monitored
# Each record is formatted as "<LEVEL> - <HH:MM:SS>: <message>"; records at INFO level and above are emitted.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [None]:
# Read the dialogue CSV into a DataFrame and preview its first rows.
data = pd.read_csv("../input/southparklines/All-seasons.csv")
data.head()
# data.info()

***
## Preprocessing
The only two columns I want to keep are:
* "Character": that is the character who speaks;
* "Line": that is the raw text from the line of dialogue.

In [None]:
# Discard the "Season" and "Episode" columns in a single call;
# the remaining columns are the ones used for the rest of the notebook.
data = data.drop(columns=['Season', 'Episode'])
data.head()

In [None]:
# (number of rows, number of columns) of the frame after dropping columns
data.shape

In [None]:
# Count the missing (null) values in each column
data.isnull().sum()

### Cleaning data
First, I remove the stopwords and non-alphabetic characters from each line of dialogue.
Following a tutorial, I use the spaCy library to preprocess the data and speed up the cleaning process.

![NLP_Pipeline](http://spacy.io/pipeline-7a14d4edd18f3edfee8f34393bff2992.svg)

In [None]:
from IPython.display import Image
# Load the English language model. Named Entity Recognition ('ner') and the dependency
# 'parser' are disabled for speed (check the image); cleaning() only reads lemmas and stop-word flags.
# The 'en' shortcut link was removed in spaCy 3 and raises OSError there, so the full
# package name is tried first and the legacy shortcut is kept as a fallback for old installs.
# NOTE(review): assumes 'en' is linked to en_core_web_sm on the original environment - confirm.
try:
    nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
except OSError:
    nlp = spacy.load('en', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatize a document and drop its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        Expected to be a spaCy ``Doc``; each token must expose ``lemma_``
        (base form of the word) and ``is_stop`` (True for very common words
        such as "for" or "is").

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a word from its context words, so a sentence of only
    # one or two words brings very little benefit to the training: skip it.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Lazily normalise every dialogue line: each run of characters other than letters
# and apostrophes becomes a single space, then the text is lower-cased.
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in data['Line'])
# Stream the lines through nlp.pipe() in batches to speed up the cleaning process.
# The former `n_threads=-1` argument is not passed any more: it is deprecated in
# spaCy 2.1+ and no longer accepted by spaCy 3.
t = time()
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Put the cleaned sentences in a single "Clean" column, then remove the rows
# that cleaning() rejected (None) and any repeated sentences.
data_clean = (
    pd.DataFrame({'Clean': txt})
    .dropna()
    .drop_duplicates()
)
data_clean.shape

In [None]:
data_clean.head()

In [None]:
# Bigram detection: find common two-word phrases in the sentences, e.g. 'mrs_garrison'
from gensim.models.phrases import Phrases, Phraser
# Phrases() expects a list of token lists, so split every cleaned sentence on whitespace
sent = list(map(str.split, data_clean['Clean']))
# Learn the relevant phrases from the tokenised sentences
phrases = Phrases(sent, min_count=30, progress_per=10000)
# Wrap the trained phrase model in a Phraser (uses less RAM, faster processing)
bigram = Phraser(phrases)
# Apply the detected bigrams to the whole corpus
sentences = bigram[sent]

### Find most frequent words
Check the effectiveness of the lemmatization, stopword removal, and bigram detection by printing the most frequent words in the dataset.

In [None]:
# Count how many times each token (word or bigram) occurs in the corpus.
# A defaultdict(int) creates a missing key with value 0 on first access, so no
# KeyError is raised when a token is seen for the first time.
word_freq = defaultdict(int)
# The loop variables are deliberately not called `sent`: that name holds the
# tokenised corpus built in the bigram cell and must not be overwritten here.
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
# Number of distinct tokens
len(word_freq)

In [None]:
# The 10 tokens with the highest counts, most frequent first
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

***
## Training model
Separate the training in 3 steps:
1. Word2Vec() : set up the parameters of the model one-by-one.
2. build_vocab() : builds the vocabulary from a sequence of sentences and thus initializes the model.
3. train() : trains the model.

In [None]:
import multiprocessing

from gensim.models import Word2Vec

In [None]:
cores = multiprocessing.cpu_count() # Number of CPU cores in the computer; used to size the Word2Vec worker pool

### Set up parameters

In [None]:
# Create the (still untrained) model; the vocabulary is built and the training
# is run in the following cells.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4 renamed it to `vector_size`.
w2v_model = Word2Vec(min_count=20,        # minimum corpus frequency for a word to be kept
                     window=2,            # context window around the target word
                     size=300,            # dimensionality of the word vectors
                     sample=6e-5,         # down-sampling threshold for very frequent words
                     alpha=0.03,          # initial learning rate
                     min_alpha=0.0007,    # final learning rate
                     negative=20,         # number of negative samples
                     # Leave one core free, but never ask for fewer than one worker
                     # (cores - 1 would be 0 on a single-core machine).
                     workers=max(1, cores - 1))

### Building the vocabulary table

In [None]:
t = time()  # start of the timed section

# Build the vocabulary table from the bigram-transformed sentences.
# progress_per presumably sets how often progress is logged - see the gensim documentation.
w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

### Trains the model
* total_examples = int - Count of sentences;
* epochs = int - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [None]:
t = time()  # start of the timed section

# Train for 30 epochs over the corpus; total_examples reuses the sentence count
# stored on the model (w2v_model.corpus_count).
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

***
## Exploring the model
### Most similar to
Checking similarity between main characters of South Park and other words of dialogues.

In [None]:
# Words whose vectors are the most similar to "eric", each with its similarity score
w2v_model.wv.most_similar(positive=["eric"])

Let's see what the bigram "eric_cartman" gives by comparison:

In [None]:
# Same query for the bigram token "eric_cartman"
w2v_model.wv.most_similar(positive=["eric_cartman"])

About Kenny:

In [None]:
# Words whose vectors are the most similar to "kenny"
w2v_model.wv.most_similar(positive=["kenny"])

About Chef:

In [None]:
# Words whose vectors are the most similar to "chef"
w2v_model.wv.most_similar(positive=["chef"])

### Similarities
Similarity between two words:

In [None]:
# Similarity score between the vectors of "chef" and "singer"
w2v_model.wv.similarity("chef", 'singer')

In [None]:
# Similarity score between the vectors of "kyle" and "jewish"
w2v_model.wv.similarity("kyle", 'jewish')

### Odd-One-Out
Ask the model which word does not belong in the list.

Which of these characters is white?

In [None]:
# Returns the word of the list that goes least well with the others
w2v_model.wv.doesnt_match(['chef', 'token_black', 'stanley'])

Which of these is not a mother?

In [None]:
# Returns the word of the list that goes least well with the others
w2v_model.wv.doesnt_match(['liane', 'sheila', 'bebe'])

The answer is Bebe Stevens.

### Analogy difference
Create a kind of proposition between words.
For example if 'bebe' is 'popular' which word does 'chef' match?

In [None]:
# Top 3 words that are similar to the `positive` terms ("bebe", "popular")
# and dissimilar to the `negative` term ("chef").
# NOTE(review): the question in the text above asks which word matches 'chef'; that
# reading would put "chef" in `positive` and "bebe" in `negative` - confirm the intended direction.
w2v_model.wv.most_similar(positive=["bebe", "popular"], negative=["chef"], topn=3)

In this case Bebe Stevens seems to be the "popular" girls' secondary leader, after Wendy Testaburger.
The algorithm has found the adjective most frequently used for Chef.

## t-SNE visualizations
t-SNE is a non-linear dimensionality reduction algorithm that attempts to represent high-dimensional data and the underlying relationships between vectors in a lower-dimensional space.
To make the visualizations more relevant, we will look at the relationships between a query word (in **red**), its most similar words in the model (in **blue**), and other words from the vocabulary (in **green**).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
def tsnescatterplot(model, word, list_names):
    """Draw a 2-D t-SNE map of a query word, its most similar words and extra words.

    The query word is plotted in red, the words returned by
    ``model.wv.most_similar`` in blue and the words of ``list_names`` in green.
    The vectors are first reduced with PCA (at most 19 components) and then
    projected onto two dimensions with t-SNE.

    Parameters
    ----------
    model : object exposing ``wv``
        Trained embedding model; ``model.wv`` must support ``most_similar``
        and indexing with a list of words.
    word : str
        Query word; it has to be known to ``model.wv``.
    list_names : iterable of str
        Additional words to plot; each has to be known to ``model.wv``.

    Returns
    -------
    None
        The plot is drawn on a new 9x9 inch matplotlib figure.

    Notes
    -----
    The vector width comes from the looked-up vectors instead of being
    assumed to be 300, and both the PCA component count and the t-SNE
    perplexity are capped by the number of plotted words, so a short
    ``list_names`` no longer makes the reduction fail. With 19 or more words
    of at least 19 dimensions the settings are the historical ones
    (19 components, perplexity 15).
    """
    extra_words = list(list_names)

    # most_similar yields (word, score) pairs; only the words are needed here
    close_words = [neighbour for neighbour, _score in model.wv.most_similar([word])]

    # Labels and colours are built in the same order as the vectors below:
    # query word, then its neighbours, then the extra words.
    word_labels = [word] + close_words + extra_words
    color_list = ['red'] + ['blue'] * len(close_words) + ['green'] * len(extra_words)

    # One lookup for every label: a (n_words, vector_size) array
    arrays = np.asarray(model.wv[word_labels])
    n_samples, n_features = arrays.shape

    # PCA cannot return more components than there are samples or features
    reduc = PCA(n_components=min(19, n_samples, n_features)).fit_transform(arrays)

    # Kept from the original implementation: switches numpy printing to
    # non-scientific notation (global setting).
    np.set_printoptions(suppress=True)

    # Finds t-SNE coordinates for 2 dimensions; the perplexity has to stay
    # below the number of samples
    perplexity = min(15, max(1, n_samples - 1))
    Y = TSNE(n_components=2, random_state=0, perplexity=perplexity).fit_transform(reduc)

    # Sets everything up to plot
    df = pd.DataFrame({'x': Y[:, 0],
                       'y': Y[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, ax = plt.subplots()
    fig.set_size_inches(9, 9)

    # Basic plot: points only (no regression line), coloured per word category
    sns.regplot(data=df,
                x="x",
                y="y",
                fit_reg=False,
                marker="o",
                scatter_kws={'s': 40,
                             'facecolors': df['color']},
                ax=ax)

    # Adds one annotation per point, in the colour of its category
    for x_pos, y_pos, label, color in zip(df['x'], df['y'], df['words'], df['color']):
        ax.text(x_pos,
                y_pos,
                '  ' + label.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=color,
                weight='normal')

    # Leave a margin of 50 units around the outermost points so labels stay visible
    ax.set_xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    ax.set_ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)

    ax.set_title('t-SNE visualization for {}'.format(word.title()))
    

## 10 Most similar words vs. 8 Random words
The t-SNE visualization below shows where the vector representation of 'eric', its 10 most similar words from the model, and 8 random words lie in a 2D graph.

In [None]:
# Plot 'eric' (red), its most similar words (blue) and 8 hand-picked comparison words (green)
tsnescatterplot(w2v_model, 'eric', ['dog', 'bird', 'ah', 'kill', 'bob', 'hat', 'drink', 'bebe'])