## SVD--singular value decomposition

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize
from sklearn import decomposition

In [2]:
data = pd.read_csv("../input/IMDB_Dataset-experimented_folds.csv", nrows=10000)
data.head()

Unnamed: 0,review,sentiment,kfold
0,Some Plot Spoilers Ahead.<br /><br />The Nashv...,0,0
1,"Well, the big money machine has done it again!...",0,0
2,I had the opportunity to see this last evening...,1,0
3,This was a great movie. Something not only for...,1,0
4,"After the return of ""horror movies"" (come on S...",1,0


In [4]:
data.shape

(10000, 3)

In [5]:
# to perform word-tokenization & tf-idf i.e. assign a number to each word and float to represent 
# count for the given word..
tfid_vec = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

tfid_vec.fit(data.review)
corpus_transformed = tfid_vec.transform(data.review)

svd = decomposition.TruncatedSVD(n_components=10)

corpus_svd = svd.fit(corpus_transformed)

In [6]:
# total no.of features or say words in all sentences
len(tfid_vec.vocabulary_)

69754

In [7]:
# 10 dimension for 69754 words --- it's like representation of each word in 3D space
print(corpus_svd.components_.shape)  
corpus_svd.components_

(10, 69754)


array([[ 2.11891973e-05,  6.45000387e-02,  1.69998745e-03, ...,
         5.15878827e-05,  3.25089961e-05,  7.34884240e-05],
       [ 8.39443178e-06, -2.17267740e-02, -2.61033256e-04, ...,
        -1.69387208e-05, -1.06956835e-04,  1.41085496e-04],
       [-5.47600334e-05,  2.34816489e-01,  1.81292791e-03, ...,
        -2.72855516e-04,  1.12167476e-04, -3.82787008e-04],
       ...,
       [-5.32607522e-05,  7.88612972e-02,  6.95086579e-04, ...,
         7.00858233e-04,  1.33303500e-06, -9.62661935e-05],
       [ 1.10348374e-04,  1.04391578e-01, -1.40065782e-03, ...,
         1.16970106e-04,  3.07912080e-05, -2.81063281e-04],
       [ 1.06615395e-04,  4.22755842e-02, -1.78194026e-03, ...,
         6.50763307e-05,  3.42455534e-04, -3.00890894e-04]])

In [8]:
# gives list of words i.e features from the sentences
tfid_vec.get_feature_names()[0:10]

['\x10own', '!', '#', '$', '%', '&', "'", "''", "''the", "'."]

In [29]:
# both the features and 
sample_idx = 1
print(f"corpus_svd.components_ shape & type: \
      {corpus_svd.components_[sample_idx].shape, type(corpus_svd.components_[sample_idx])}")

print(f"  tfid_vec.get_feature_names() shape & type: \
      {len(tfid_vec.get_feature_names()), type(tfid_vec.get_feature_names())}")

corpus_svd.components_ shape & type:       ((69754,), <class 'numpy.ndarray'>)
  tfid_vec.get_feature_names() shape & type:       (69754, <class 'list'>)


***

***tagging each word with its corresponding svd.components_[idx] in a feature_scores dict***

****

In [12]:
sample_idx = 3
feature_scores = dict(
    zip(
        tfid_vec.get_feature_names(),
        corpus_svd.components_[sample_idx]
    )
)

In [23]:
feature_scores

{'\x10own': 8.195600980567362e-05,
 '!': 0.5251713159505206,
 '#': 0.004064304640979339,
 '$': 0.0034588294864140727,
 '%': 0.0012029170418493293,
 '&': 0.008195454485459958,
 "'": 0.01658464623009198,
 "''": 0.1435837383515021,
 "''the": 2.1587307689145e-05,
 "'.": -6.412729317463118e-05,
 "'01": 2.2497175452535033e-05,
 "'02": 0.0005281284965688834,
 "'03": 0.0005281284965688751,
 "'04": 0.0003919245718093165,
 "'06": -0.00022086633935236927,
 "'07": -0.0002625512251243801,
 "'10": 0.0008595350201676091,
 "'10'.": -5.457780764836273e-05,
 "'1408": 3.721574369114306e-06,
 "'20": -0.00021509123054375286,
 "'20s": 3.2489728920416526e-05,
 "'20th": 9.841079027915229e-05,
 "'24": -0.00018887875860509228,
 "'30": 0.00024405022558311187,
 "'30s": 0.00039866802927718183,
 "'30s-ray": 0.00010127058542892235,
 "'34": -0.00012684653063342447,
 "'36": 0.00010919846862000865,
 "'39": 0.0002545456912451885,
 "'3rd": -0.00014322932045092137,
 "'40": -0.0005242093493256617,
 "'40s": 0.00024143778332

In [137]:
N = 15
print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

['film', 'it', 'i', 'and', 'films', 'of', 'a', "''", '``', '-', 'is', 'as', 'my', 'great', 'very']


## understanding sorting

In [139]:
alx = {"'adventurer": -4.516184501441493e-05,
 "'afraid": -8.381190990525459e-05,
 "'after": 2.5228609727361848e-05,
 "'aftermath": 0.00011406752449731716,}

In [140]:
scores = sorted(alx, key=alx.get, reverse=True)
scores

["'aftermath", "'after", "'adventurer", "'afraid"]

### Looping through multiple idxs

In [144]:
for sample_idx in range(0,8):
    feature_scores = dict(
        zip(
            tfid_vec.get_feature_names(),
            corpus_svd.components_[sample_idx]
        )
    )
    
    print(sample_idx,"-" ,sorted(feature_scores, key=feature_scores.get, reverse=True)[:15])

0 - ['the', ',', '.', 'and', 'a', '>', '<', '/', 'br', 'of', 'to', 'i', 'is', 'it', 'in']
1 - ['>', '<', 'br', '/', ':', '*', 'it.', 'movie.', '-', 'film.', 'time.', 'one.', 'this.', 'me.', 'them.']
2 - ['i', 'movie', '!', 'it', 'was', 'this', '.', "n't", 'you', 'my', 'me', 'do', 'have', '...', 'did']
3 - [',', '!', "''", '``', '*', ')', '(', '...', '?', 'you', "'s", ':', "n't", 'if', 'do']
4 - ['!', 'the', 'of', '?', '...', ')', '-', '(', '``', "''", 'in', 'is', '..', 'and', '....']
5 - ['is', '.', 'a', 'he', 'her', 'she', '*', 'his', 'you', '!', 'and', "'s", 'has', 'to', 'him']
6 - ['*', "''", '``', 'was', 'i', ')', '(', '.', '?', 'were', 'did', '--', 'had', 'her', '1/2']
7 - ["''", '``', 'you', 'movie', 'is', 'are', 'this', 'bad', 'if', '.', 'movies', 'your', 'of', 'will', 'the']


## The above data doesn't make sense let's apply some cleaning..

***The clean_text function removes the unwanted whitespacing & unwanted punctuations from the text, as above we can see
so many punctuations which doesn't add any meaning to the context***

In [31]:
import string
import re
def clean_text(text):
    
    text = text.lower() 
       
    text = text.split()
    
    # to remove the unwanted whitespacing from the text
    # "Hello Sam,   How are you??"--->> 'Hello Sam, How are you??'
    text = " ".join(text)
    
    # to remove the punctuation from the text
    # 'Hello Sam, How are you??' -->> 'Hello Sam How are you'
    text = re.sub(f'[{re.escape(string.punctuation)}]', "", text)
    
    return text

s = "Hello Sam,    How are you??"
clean_text(s)

'hello sam how are you'

In [51]:
df = pd.read_csv("../input/IMDB_Dataset-experimented_folds.csv")

# apply to clean_text function
corpus = df.review.apply(clean_text)

tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

tfv.fit(corpus)

corpus_transformed = tfv.transform(corpus)

svd = decomposition.TruncatedSVD(n_components=10)

corpus_svd = svd.fit(corpus_transformed)

In [79]:
for sample_idx in range(0,10):
    
    feature_scores = dict(
        zip(
             tfv.get_feature_names(), corpus_svd.components_[sample_idx]
        )
    )
    
    N = 10
    print(f"{sample_idx} : {sorted(feature_scores, key=feature_scores.get, reverse=True)[:N]}")

0 : ['the', 'a', 'and', 'of', 'to', 'is', 'i', 'it', 'in', 'this']
1 : ['i', 'movie', 'it', 'was', 'this', 'you', 'my', 'me', 'have', 'if']
2 : ['a', 'is', 'her', 'she', 'he', 'you', 'his', 'and', 'to', 'br']
3 : ['her', 'was', 'she', 'i', 'he', 'his', 'and', 'him', 'had', 'to']
4 : ['br', 'they', 'movie', 'to', 'her', 'she', 'the', 'bad', 'he', 'was']
5 : ['movie', 'her', 'she', 'the', 'is', 'movies', 'bad', 'girl', 'this', 'acting']
6 : ['her', 'she', 'film', 'you', 'the', 'is', 'br', 'show', 'are', 'i']
7 : ['br', 'film', 'a', 'is', 'movie', 'this', 'very', 'was', 'good', 'great']
8 : ['and', 'i', 'show', 'is', 'br', 'love', 'great', 'my', 'movie', 'his']
9 : ['is', 'he', 'film', 'his', 'i', 'the', 'this', 'him', 'that', 'ever']
