# Initalization

In [1]:
from glob import glob
import os

import gensim
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

In [13]:
from colour import Color

In [22]:
def get_color_gradient(num_colors, start_color, end_color):
    """
    Given a number of steps to generate a color list over, return a list of hex-valued colors from start_color to end_color in num_colors steps.
    Useful for visualizing data that is continuous by shading each unique value of that data by a color that can correspond to its value.
    Example usage:
        get_color_gradient(5, colour.Color('white'), colour.Color('green'))
    
    :param num_colors: int
        The number of colors to generate a gradient for
    :param start_color: colour.Color object
        The color to start from
    :param end_color: colour.Color object
        The color to end at
        
    :return: list
        List of num_colors hex-valued colors from start_color to end_color, using the colour.Color.range_to method
    """
    
    return [c.get_hex_l() for c in start_color.range_to(end_color, steps=num_colors)]

## Load data

In [2]:
cwd = os.getcwd()
data_dir = os.path.join(cwd, '../data/stanfordSentimentTreebank/')

In [3]:
data_dir

'/Users/dstone/projects/machine_learning/nlp/notebooks/../data/stanfordSentimentTreebank/'

In [4]:
dict_of_df = {}
for file in glob('{}/*.txt'.format(data_dir)):
    if 'readme' in file.lower():
        continue
    name = file.split('/')[-1].split('.')[0]
    dict_of_df[name] = pd.read_table(file, header=None)

In [9]:
dict_of_df = {}
# their data is in a different format for each txt file, so jeeze we have to parse each individually
# get phrases 
dict_of_df['phrases'] = pd.read_table('{}/dictionary.txt'.format(data_dir), header=None, delimiter='|', names=['phrase', 'phrase_id'], index_col=['phrase_id'])
dict_of_df['sentiments'] = pd.read_table('{}/sentiment_labels.txt'.format(data_dir), header=1, delimiter='|', names=['phrase_id', 'score'], index_col=['phrase_id'])

# now the sentences
dict_of_df['sentences'] = pd.read_table('{}/datasetSentences.txt'.format(data_dir), header=1, delimiter='\t', names=['sentence_id', 'sentence'])#, index_col=['sentence_id'])
dict_of_df['split'] = pd.read_table('{}/datasetSplit.txt'.format(data_dir), header=1, delimiter=',', names=['sentence_id', 'split'])#, index_col=['sentence_id'])

# now match phrases to sentences. I don't know how to use the parser yet. Do this later

In [10]:
# match the content of the phrases to their sentiment scores
df_phrases = dict_of_df['phrases'].merge(dict_of_df['sentiments'], left_index=True, right_index=True)

In [11]:
dict_of_df['sentences'].head()

Unnamed: 0,sentence_id,sentence
0,2,The gorgeously elaborate continuation of `` Th...
1,3,Effective but too-tepid biopic
2,4,If you sometimes like to go to the movies to h...
3,5,"Emerges as something rare , an issue movie tha..."
4,6,The film provides some great insight into the ...


In [12]:
# merge sentences to phrases, forget parsing (this loses a lot of data)
# we need the sentence ID because this gives us the split to use for training/dev/test
df_phrases.merge(dict_of_df['sentences'], left_on='phrase', right_on='sentence')#.drop('sentence', axis=1)

Unnamed: 0,phrase,score,sentence_id,sentence
0,", The Sum of All Fears is simply a well-made a...",0.888890,4860,", The Sum of All Fears is simply a well-made a..."
1,", `` They 're out there ! ''",0.611110,7251,", `` They 're out there ! ''"
2,", is a temporal inquiry that shoulders its phi...",0.694440,5477,", is a temporal inquiry that shoulders its phi..."
3,- I also wanted a little alien as a friend !,0.694440,5576,- I also wanted a little alien as a friend !
4,"- West Coast rap wars , this modern mob music ...",0.763890,2338,"- West Coast rap wars , this modern mob music ..."
5,- greaseballs mob action-comedy .,0.361110,7166,- greaseballs mob action-comedy .
6,- spy action flick with Antonio Banderas and L...,0.166670,11305,- spy action flick with Antonio Banderas and L...
7,- style cross-country adventure ... it has spo...,0.708330,11409,- style cross-country adventure ... it has spo...
8,-- but certainly hard to hate .,0.611110,7163,-- but certainly hard to hate .
9,-- but it makes for one of the most purely enj...,0.819440,2732,-- but it makes for one of the most purely enj...


In [None]:
gensim.

# t-SNE clustering with word2vec by sentiment

In [None]:
# Need to embed the words first 

In [47]:
model = gensim.models.Word2Vec(X, size=100)
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

NameError: name 'X' is not defined