In [35]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import zero_one_loss as J01
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

import scipy

import sklearn.tree as tree
import math

from pathlib import Path

# Seed NumPy's global random number generator so repeated runs give the same numbers.
# !! Important !! : do not change this
seed = 1234
np.random.seed(seed)


# Dataset root; every file used by the notebook lives underneath it.
DATA_PATH = Path('data/original-dataset')
# Vocabulary file and the labeled bag-of-words files for the train / test splits.
VOCAB_PATH = DATA_PATH / 'imdb.vocab'
TRAIN_BOW_PATH = DATA_PATH / 'train' / 'labeledBow.feat'
TEST_BOW_PATH = DATA_PATH / 'test' / 'labeledBow.feat'
# Colormap name intended for the `cmap=` argument of the matrix plots.
COLORMAP = 'seismic'

In [36]:
# Load the vocabulary, one term per line. VOCAB[k] is looked up later with the
# column index k of the SVD components, so entry order must match the file order.
with open(VOCAB_PATH, 'r', encoding='utf8') as vocab_file:
    # Iterate line by line and strip only the newline: unlike read().split('\n'),
    # this does not append a phantom '' term when the file ends with a newline.
    VOCAB = np.array([line.rstrip('\n') for line in vocab_file])

print(VOCAB[:10])

['the' 'and' 'a' 'of' 'to' 'is' 'it' 'in' 'i' 'this']


In [58]:
# Read BOW file into X, Y
def get_data_from_bow(bow_file_path, n_features=None):
    """
    Parse a labeled bag-of-words file into a sparse matrix and a label vector.

    Every non-blank line describes one review in the form
    ``<rating> <term_idx>:<count> <term_idx>:<count> ...`` (whitespace separated).

    Parameters:
        bow_file_path: path of the bag-of-words file to read (opened as UTF-8).
        n_features: optional number of columns n of X. When None (default) it is
            inferred as the largest term index seen plus one. Pass the training
            matrix's column count when loading another split so both matrices
            have the same width.

    Returns a tuple (X, y):
        X is a sparse CSR float32 document term matrix of size m reviews by n terms
        y is a numpy array of shape (m,) of review labels: 1 when the rating
          is greater than 5, otherwise -1

    Raises:
        ValueError: if a rating, index or count is not an integer, a term entry
            is not of the form ``idx:count``, or a term index does not fit in
            ``n_features`` columns.
    """
    # to build sparse matrix X (COO-style triplets)
    row = []
    col = []
    data = []
    # to build labels y
    y = []

    with open(bow_file_path, 'r', encoding='utf8') as bow_file:
        for line in bow_file:
            # split() without arguments drops the trailing newline and tolerates
            # repeated or trailing spaces.
            review = line.split()
            if not review:
                # Blank line: not a review, so it must not consume a row index.
                continue

            doc_idx = len(y)
            rating = int(review[0])
            y.append(1 if rating > 5 else -1)

            for bow_count in review[1:]:
                vocab_idx, count = bow_count.split(':')
                row.append(doc_idx)
                col.append(int(vocab_idx))
                data.append(int(count))

    if n_features is None:
        n_features = max(col) + 1 if col else 0

    # Give the shape explicitly: left to inference, reviews without any term at
    # the end of the file would be dropped from X and X would no longer line up
    # with y.
    X = scipy.sparse.csr_matrix(
        (data, (row, col)), shape=(len(y), n_features), dtype=np.float32
    )
    y = np.array(y)

    return (X, y)

In [59]:
# PCA (PCA-like: tf-idf weighting followed by a truncated SVD; no centering step is added here)
Xtr, Ytr = get_data_from_bow(TRAIN_BOW_PATH)
print(Xtr.shape)


pipe = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=10, algorithm='arpack'))
    # add learners here to the pipeline here. Don't call .fit() on test data
]).fit(Xtr, Ytr)

# Inspect the fitted steps: idf weights, feature count, singular values, components.
print(pipe['tfidf'].idf_)
print(pipe['tfidf'].n_features_in_)
print(pipe['svd'].singular_values_)
print(pipe['svd'].components_)
# Project the training matrix onto the SVD components. Xtr is the only data
# matrix defined at this point, so the cell also runs on a fresh kernel.
transform = pipe.transform(Xtr)

# 


(25000, 89527)
[ 1.0083544  1.0340933  1.0336381 ... 10.433524  10.433524  10.433524 ]
89527
[55.329884  14.397222  10.538151  10.043274   9.404477   8.429554
  8.235899   8.091376   7.6170015  7.269477 ]
[[ 5.0814509e-01  2.4986804e-01  2.5067049e-01 ...  6.1058035e-06
   1.7402563e-05  1.1357053e-05]
 [-3.3597150e-01 -8.2165316e-02 -8.0620795e-02 ... -3.0703446e-05
  -2.8154551e-05 -1.8037717e-05]
 [-1.2522189e-01  6.9059923e-02  8.6233623e-02 ...  3.6079331e-05
  -2.9601859e-05 -1.0663361e-05]
 ...
 [-6.9671266e-02  5.9843354e-02  1.8856047e-01 ... -2.8348524e-05
   5.9734521e-05  1.8816822e-06]
 [-2.6120454e-02 -3.6962113e-01  2.4091801e-01 ... -6.2934703e-07
  -5.8884074e-05 -8.2857459e-06]
 [ 1.0918891e-01 -2.1676689e-01 -3.9587843e-01 ...  1.2467788e-05
   6.7808309e-05  2.4440062e-05]]


In [None]:
# PCA Analysis
# Show the projected documents, then for each SVD component list the vocabulary
# terms with the largest and the smallest (most negative) weights.
print(transform.shape)
print(transform)

svd_components = pipe['svd'].components_

N_TOP_TERMS = 10  # number of terms listed at each end of a component

for (pc_idx, component) in enumerate(svd_components):
    # Term indices ordered from the largest weight down to the most negative one.
    # argsort avoids building and sorting one Python tuple per vocabulary term.
    order = np.argsort(component)[::-1]
    top_terms = [(float(component[k]), str(VOCAB[k])) for k in order[:N_TOP_TERMS]]
    bottom_terms = [(float(component[k]), str(VOCAB[k])) for k in order[-N_TOP_TERMS:]]

    print(f'PRINCIPAL COMPONENT {pc_idx}')
    print('Highest-weighted terms')
    print(top_terms)
    print()
    # Strongly negative weights are large in magnitude, so these terms also
    # characterise the component; they are not unimportant.
    print('Lowest-weighted (most negative) terms')
    print(bottom_terms)

# # first = np.array(X[:10])
# # second = np.array(X[-10:])

# # print(first)
# # print('-------------------------------')
# # print(second)

# # Xsmall = np.concatenate((first, second))[:,:50]
# # Xsmall = Xsmall / Xsmall.sum(axis=1,keepdims=1)

# k = 2
# U, S, Vh = scipy.sparse.linalg.svds(Xsparse, k=k) # X0 = U * diag(S) * Vh
# S = np.diag(S)

# print(U)
# print('--------')
# print(S)
# print('--------')
# print(Vh)

# Usmall = np.delete(U, np.s_[30:-30], axis=0)
# Vhsmall = np.delete(Vh, np.s_[50:-50], axis=1)

# print(Usmall.shape)
# print(Vhsmall.shape)

# fig, ax = plt.subplots()
# ax.imshow(Usmall.dot(S), cmap=COLORMAP)

# fig, ax = plt.subplots()
# ax.imshow(Vhsmall, cmap=COLORMAP)


# Xhatsmall = Usmall.dot( S ).dot( Vhsmall ) # approx using k largest eigendir
# print(Xhatsmall.shape)
# # Xsmall = first_and_last(X, 20)
# # print(Xsmall.shape)
# # Xhat_small = first_and_last(Xhat, 20)
# # print(Xsmall.shape)

# Xsmall = np.delete(np.delete(X, np.s_[30:-30], axis=0), np.s_[50:-50], axis=1)
# print(Xsmall.shape)

# fig, ax = plt.subplots()
# im = ax.imshow(Xsmall, cmap=COLORMAP, vmax=0.1, vmin=-0.1)

# fig, ax = plt.subplots()
# im = ax.imshow(Xhatsmall, cmap=COLORMAP, vmax=1, vmin=-1)
