In [1]:
import os
import sys
from gensim.models.fasttext import FastText
import pandas as pd
import numpy as np
import re
import time
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

print("==================================")
print("Libraries are imported.")
print("FastText commentaries started to fit .")


def get_directory_content(path, extension):
    """ Returns directory content of a particular extension """
    matches = []
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith(('.{}'.format(extension))):
                matches.append(os.path.join(root, filename))
    return matches


def load_file(filename):
    """ Loads the files with comments.
        Returns label and the dataset. """
    label = filename.split('/')[-1][:-12]
    return label, pd.read_json(filename)


def word_to_vec_to_fisher(sentence, model, gmm):
    """ Transforms list of words of a comment into a """
    storage = []
    for word in sentence:
        try:
            if word:
                word = re.sub('r[^a-zA-Z]+', "", str(word)).lower()
            else:
                continue
            model[word].any()
            storage.append(model[word])
        except Exception as e:
            pass
    if not storage:
        print(sentence, storage)
        return -999 # changed here
        
    xx = np.atleast_2d(storage)
    N = xx.shape[0]

    # Compute posterior probabilities.
    Q = gmm.predict_proba(xx)  # NxK

    # Compute the sufficient statistics of descriptors.
    Q_sum = np.sum(Q, 0)[:, np.newaxis] / N
    Q_xx = np.dot(Q.T, xx) / N
    Q_xx_2 = np.dot(Q.T, xx ** 2) / N

    # Compute derivatives with respect to
    # mixing weights, means and variances.
    d_pi = Q_sum.squeeze() - gmm.weights_
    d_mu = Q_xx - Q_sum * gmm.means_
    d_sigma = ( - Q_xx_2 - Q_sum * gmm.means_ ** 2 + Q_sum * gmm.covariances_ + 2 * Q_xx * gmm.means_)

    # Merge derivatives into a
    # vector.
    return np.hstack((d_pi, d_mu.flatten(), d_sigma.flatten()))


def clean(lst):
    if lst:
        return lst
    else:
        return 0

Libraries are imported.
FastText commentaries started to fit .


In [2]:
# filename = 'Insult'
# scalernames = ["SS", "MM"]
# stime = time.time()
# model = FastText.load("Fasttext100/FastText100.bin")
# print("Model Loaded.")
# path = os.getcwd() + '/balanced/'
# filenames = get_directory_content(path, 'json')
# for idx, f in enumerate(filenames):
#     # Fast Text (word2vec)
#     label = f.split('/')[-1][:-12]
#     print(filename, f)
#     if filename not in f:
#         continue
#     fstime = time.time()
#     label, df = load_file(f)

#     print(label, 'has columns: ', df.columns)
#     print("File {label} loaded.".format(label=label))
#     # Fisher's Vector
#     # for K in [32, 64]:
#     for K in [32]:#, 64]:
#         gmm = GaussianMixture(n_components=K, covariance_type='diag', reg_covar=1e-4,verbose=1)
#         gmm.fit(model.wv.vectors)
#         target = df[df['comment_text'].map(len) > 0].iloc[:, -1] # removed empty rows
#         # word to vect - > to fisher
#         comments = df['comment_text'][df['comment_text'].map(len) > 0].apply(
#             lambda lst: word_to_vec_to_fisher(lst, model, gmm))   # remove empty rows
#         # now remove -999 !!!!
#         # and remove the same rows from target
#         indeces = comments[comments.apply(type) != np.ndarray].index
#         target = target.drop(indeces)
#         comments = comments.drop(indeces)
#         print('Length target:', target.shape[0])
#         print('Length comments:', comments.shape[0])
#         comments = np.stack(comments)
#         for idx, scaler in enumerate([StandardScaler, MinMaxScaler]):
#             # normalise
#             scaled = pd.DataFrame(scaler().fit_transform(comments))
#             pca = PCA(n_components=1000)
#             pca_data = pd.DataFrame(pca.fit_transform(scaled))
#             # save the file
#             pd.concat([pca_data.reset_index(drop=True), target.reset_index(drop=True)],
#                       axis=1).to_csv("FT100FV{K}/{filename}{K}{scaler}ft100.csv".format(
#                           filename=filename,
#                           K=K,
#                           scaler=scalernames[idx]))
#         print("{label} K={K} Scaler={scaler} csv file saved. Time:{time}".format(
#                   label=label, K=K, scaler=scalernames[idx], time=(time.time()-fstime)))
#         print("Time spent: {time}".format(time=(time.time()-stime)))
#         print("All files saved.")

In [2]:
# settings
filename = 'Toxic'
fasttextmodelpath = "Fasttext100/FastText100.bin"
K = 32
scaleri = 0

# code
scalers = [StandardScaler, MinMaxScaler]
scaler = scalers[scaleri]
scalernames = ["SS", "MM"]
stime = time.time()
model = FastText.load(fasttextmodelpath)
print("Model Loaded.")
path = os.getcwd() + '/balanced/'
filenames = get_directory_content(path, 'json')
for idx, f in enumerate(filenames):
    # Fast Text (word2vec)
    label = f.split('/')[-1][:-12]
    print(filename, f)
    if filename not in f:
        continue
    fstime = time.time()
    label, df = load_file(f)

    print(label, 'has columns: ', df.columns)
    print("File {label} loaded.".format(label=label))
    # Fisher's Vector
    gmm = GaussianMixture(n_components=K, covariance_type='diag', reg_covar=1e-4,verbose=1)
    gmm.fit(model.wv.vectors)
    target = df[df['comment_text'].map(len) > 0].iloc[:, -1] # removed empty rows
    # word to vect - > to fisher
    comments = df['comment_text'][df['comment_text'].map(len) > 0].apply(
        lambda lst: word_to_vec_to_fisher(lst, model, gmm))   # remove empty rows
    # now remove -999 !!!!
    # and remove the same rows from target
    indices = comments[comments.apply(type) != np.ndarray].index
    del df
    target = target.drop(indices)
    comments = comments.drop(indices)
    print('Length target:', target.shape[0])
    print('Length comments:', comments.shape[0])
    print("Before stacking: ", comments.isnull().values.any())
    comments = np.stack(comments).astype('float16')
    # normalise
    print("Before normaliztion: ", np.isnan(comments).sum())
    comments = pd.DataFrame(scaler().fit_transform(comments)).fillna(0)
    print("After normaliztion", comments.isnull().values.any())
    pca = PCA(n_components=1000)

Model Loaded.
Toxic /home/ao2u17/Desktop/FinalProject/Anton/balanced/ThreatOptimal.json
Toxic /home/ao2u17/Desktop/FinalProject/Anton/balanced/InsultOptimal.json
Toxic /home/ao2u17/Desktop/FinalProject/Anton/balanced/IdentityOptimal.json
Toxic /home/ao2u17/Desktop/FinalProject/Anton/balanced/ObsceneOptimal.json
Toxic /home/ao2u17/Desktop/FinalProject/Anton/balanced/ToxicOptimal.json
Toxic has columns:  Index(['comment_text', 'toxic'], dtype='object')
File Toxic loaded.
Initialization 0
  Iteration 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
Initialization converged: True




['194', '249'] []
['160'] []
['8235'] []
['160'] []
['122'] []
Length target: 183473
Length comments: 183473
Before stacking:  False
Before normaliztion:  0
After normaliztion False
Toxic /home/ao2u17/Desktop/FinalProject/Anton/balanced/SevereOptimal.json


In [3]:
print("After normaliztion", comments.isnull().values.any())
comments = comments.replace(np.inf, 0) # replaces inf with 0
comments = pd.DataFrame(pca.fit_transform(comments))
# save the file
pd.concat([comments.reset_index(drop=True), target.reset_index(drop=True)],
          axis=1).to_csv("FT100FV{K}/{filename}{K}{scaler}ft100.csv".format(
              filename=filename,
              K=K,
              scaler=scalernames[scaleri]))
print("{label} K={K} Scaler={scaler} csv file saved. Time:{time}".format(
          label=label, K=K, scaler=scalernames[scaleri], time=(time.time()-fstime)))
print("Time spent: {time}".format(time=(time.time()-stime)))
print("All files saved.")


After normaliztion False
Severe K=32 Scaler=SS csv file saved. Time:1159.9676609039307
Time spent: 1170.92329454422
All files saved.


In [26]:
# for iix, iin in enumerate(comments.max(axis=0)):
#     if iin == np.inf:
#         print(iix, iin)

In [5]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('comments', 1467784104),
 ('target', 2935592),
 ('GaussianMixture', 2000),
 ('PCA', 1464),
 ('MinMaxScaler', 1184),
 ('scaler', 1184),
 ('FastText', 1056),
 ('StandardScaler', 1056),
 ('clean', 136),
 ('get_directory_content', 136),
 ('load_file', 136),
 ('word_to_vec_to_fisher', 136),
 ('filenames', 128),
 ('f', 116),
 ('path', 98),
 ('np', 80),
 ('pd', 80),
 ('scalernames', 80),
 ('scalers', 80),
 ('fasttextmodelpath', 76),
 ('indices', 64),
 ('gmm', 56),
 ('model', 56),
 ('pca', 56),
 ('label', 55),
 ('filename', 54),
 ('K', 28),
 ('idx', 28),
 ('scaleri', 28),
 ('fstime', 24),
 ('stime', 24)]

In [6]:
1467784104 / 1000000000

1.467784104