In [None]:
import numpy as np
import pandas as pd

In [None]:
%%time
# Read ../input/all_data.csv into a DataFrame; the %%time cell magic above
# reports how long the read takes.
data = pd.read_csv("../input/all_data.csv")

In [None]:
# Print a summary of the frame: column dtypes, non-null counts, memory usage.
data.info()

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place and print the memory saved.

    * Integer columns (signed or unsigned NumPy dtypes) are cast to the
      narrowest integer type of the same signedness that can hold the
      column's minimum and maximum. A column is never widened.
    * Float columns wider than 32 bits are cast to float32 when their minimum
      and maximum lie inside float32's finite range. This trades precision for
      memory. Columns are never narrowed to float16.
    * Everything else (bool, datetime/timedelta, object/string, categorical and
      other extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable integers, strings, ...) are not
        # NumPy dtypes and cannot be range-checked the same way; skip them.
        if not isinstance(col_type, np.dtype):
            continue

        # Dispatch on dtype.kind rather than the dtype's name so that bool,
        # datetime and timedelta columns are not mistaken for numbers.
        if col_type.kind in 'iu':
            if df[col].empty:
                # min()/max() of an empty column are NaN; nothing to measure.
                continue
            # Plain Python ints keep the bound comparisons exact for every width.
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break  # no narrower type fits; keep the current dtype
                bounds = np.iinfo(candidate)
                # Inclusive bounds: the extreme values themselves are representable.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > np.dtype(np.float32).itemsize:
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN extremes (empty or all-NaN column) and +/-inf fail these
            # comparisons, which leaves the column at its current dtype.
            if c_min >= float32_info.min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero starting size.
    decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decrease_pct))
    return df

In [None]:
# Downcast the frame's numeric columns; the helper prints the before/after
# memory usage and returns the frame, which is rebound to `data`.
data = reduce_mem_usage(data)

In [None]:
# (rows, columns) of the frame.
data.shape

In [None]:
# Column labels of the frame.
data.columns

In [None]:
# First rows of the frame, as a quick look at the contents.
data.head()

In [None]:
# The "tags" column; later cells split each entry on whitespace into tags.
data["tags"]

In [None]:
tags = set()
for tag in data["tags"]:
    tags.update(tag.split())

In [None]:
print("tag length: {}".format(len(tags)))

In [None]:
from gensim.models import word2vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# ref: https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne
def tsne_plot(model, perplexity=40, max_iter=2500, random_state=23, figsize=(16, 16)):
    """Project the model's word vectors to 2-D with t-SNE and plot them with labels.

    Parameters
    ----------
    model : gensim Word2Vec model, or its KeyedVectors
        Source of the vocabulary and the word vectors.
    perplexity : int or float, default 40
        t-SNE perplexity. Clamped to ``vocabulary size - 1`` because newer
        scikit-learn releases reject a perplexity that is not smaller than
        the number of samples.
    max_iter : int, default 2500
        Number of t-SNE optimisation iterations.
    random_state : int, default 23
        Seed passed to t-SNE so the layout is repeatable.
    figsize : tuple, default (16, 16)
        Size of the matplotlib figure.

    Raises
    ------
    ValueError
        If the vocabulary holds fewer than two words.
    """
    import inspect  # local import: only needed for the keyword check below

    # Accept either a full model (vectors live on `.wv`) or bare KeyedVectors.
    wv = getattr(model, "wv", model)
    # gensim >= 4 exposes the vocabulary as `key_to_index`; gensim 3.x as `vocab`.
    labels = list(wv.key_to_index) if hasattr(wv, "key_to_index") else list(wv.vocab)
    if len(labels) < 2:
        raise ValueError(
            "tsne_plot needs at least two words in the vocabulary, got {}".format(len(labels)))
    tokens = np.array([wv[word] for word in labels])

    # scikit-learn renamed TSNE's `n_iter` keyword to `max_iter`; pass
    # whichever one the installed version accepts.
    tsne_params = inspect.signature(TSNE.__init__).parameters
    iter_kw = "max_iter" if "max_iter" in tsne_params else "n_iter"
    tsne_model = TSNE(
        perplexity=min(perplexity, len(labels) - 1),
        n_components=2,
        init='pca',
        random_state=random_state,
        **{iter_kw: max_iter}
    )
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=figsize)
    # One vectorised scatter call instead of one call per point.
    plt.scatter(new_values[:, 0], new_values[:, 1])
    for label, (x, y) in zip(labels, new_values):
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
# One token list ("sentence") per row for Word2Vec training. Rows with a
# missing "tags" value (NaN) are dropped, since NaN has no .split().
corpus = [tag_string.split() for tag_string in data["tags"].dropna()]

In [None]:
# gensim 4 renamed the embedding-dimension keyword from `size` to
# `vector_size`. Try the current name first; gensim 3.x rejects the unknown
# keyword with a TypeError before any training starts, so fall back to `size`.
w2v_kwargs = dict(window=20, min_count=200, workers=4)
try:
    model = word2vec.Word2Vec(corpus, vector_size=100, **w2v_kwargs)
except TypeError:
    model = word2vec.Word2Vec(corpus, size=100, **w2v_kwargs)

In [None]:
# tsne_plot(model)

In [None]:
model.most_similar("smile", topn=30)

In [None]:
model.most_similar("angry", topn=30)

In [None]:
model.most_similar("happy", topn=30)

In [None]:
model.most_similar("^_^", topn=30)

In [None]:
model.most_similar("sad", topn=30)

In [None]:
model.most_similar("surprised", topn=30)

In [None]:
model.most_similar("shy", topn=30)

In [None]:
model.most_similar("crying", topn=30)

In [None]:
face_exp_tags = ["smile", "angry", "happy", "sad", "surprised", "shy", "crying"]

In [None]:
tags.intersection(set(face_exp_tags))

In [None]:
# For each expression tag, count the rows that carry it as a whole tag.
# `str.contains(tag)` would run a regex substring search, so "sad" or "smile"
# would also count rows that only have longer tags containing those letters.
# Rows with a missing "tags" value are dropped and so never counted.
row_tag_lists = data["tags"].dropna().str.split()
for tag in face_exp_tags:
    print(tag)
    print(row_tag_lists.map(lambda row_tags: tag in row_tags).sum())