## NLP using Co-occurrence Matrix and Singular Value Decomposition

In [None]:
import pandas as pd
import numpy as np
import itertools
import nltk
from nltk.corpus import stopwords
# from numpy.linalg import svd
from sklearn.decomposition import TruncatedSVD

In [None]:
# Load the review dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('200Reviews.csv')

In [None]:
# Preview the first ten rows as plain text, then show the (rows, columns) shape.
print(data.head(10))
data.shape 

   Unnamed: 0         id  sentiment  \
0           0   "5814_8"          1   
1           1   "2381_9"          1   
2           2   "7759_3"          0   
3           3   "3630_4"          0   
4           4   "9495_8"          1   
5           5   "8196_8"          1   
6           6   "7166_2"          0   
7           7  "10633_1"          0   
8           8    "319_1"          0   
9           9  "8713_10"          1   

                                              review  
0  "With all this stuff going down at the moment ...  
1  "\"The Classic War of the Worlds\" by Timothy ...  
2  "The film starts with a manager (Nicholas Bell...  
3  "It must be assumed that those who praised thi...  
4  "Superbly trashy and wondrously unpretentious ...  
5  "I dont know why people think this is such a b...  
6  "This movie could have been very good, but com...  
7  "I watched this video at a friend's house. I'm...  
8  "A friend of mine bought this film for £1, and...  
9  "<br /><br />This

(200, 4)

In [None]:
#Function to concatenate data in array into a single string
def concatenate_array_data(array, separator=''):
    """Join the string form of every element of ``array`` into one string.

    Parameters
    ----------
    array : iterable
        Elements to concatenate; each one is converted with ``str()``.
    separator : str, optional
        Text inserted between consecutive elements. Defaults to ``''``,
        which joins the elements back-to-back. Pass e.g. ``' '`` to keep
        the last word of one element from fusing with the first word of
        the next.

    Returns
    -------
    str
        The concatenated text; ``''`` when ``array`` is empty.
    """
    # str.join builds the result in one pass instead of growing a string
    # with repeated ``+=``.
    return separator.join(str(element) for element in array)


In [None]:
# Merge every review into one string; no separator is passed, so reviews are joined back-to-back.
para = concatenate_array_data(data.review.values)

In [None]:
#Step1: Sentence segmentation
sentences = nltk.sent_tokenize(para) #list of sentences

#Step2: Tokenization
# Build the tokenizer once and reuse it for every sentence; constructing it
# inside the comprehension would create a fresh tokenizer per sentence.
word_tokenizer = nltk.RegexpTokenizer(r'\w+')
# One inner list of word tokens per sentence (a list of lists of words).
list_of_list_words = [word_tokenizer.tokenize(sentence) for sentence in sentences]

In [None]:
# Flatten the per-sentence token lists into a single stream of words and keep
# only the first occurrence of each one. dict.fromkeys preserves insertion
# order, so the vocabulary is ordered by first appearance in the text.
list_of_words = list(dict.fromkeys(
    word
    for sentence_words in list_of_list_words
    for word in sentence_words
))

In [None]:
# Number of entries in the word list built above.
print(len(list_of_words))

8022


In [None]:
#Step3: Stop word removal
def stopwords_filter(list_oflists_tokens, stop_words=None):
    """Remove stop words from every tokenized sentence.

    A token is dropped when its lowercase form is exactly a stop word.
    (The earlier implementation kept any token that merely *contained* a
    stop word as a substring, which retained the stop words themselves and
    discarded tokens with no such substring.)

    Parameters
    ----------
    list_oflists_tokens : iterable of iterable of str
        One inner sequence of word tokens per sentence.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the English stop-word list from
        ``nltk.corpus.stopwords`` is used.

    Returns
    -------
    list of list of str
        The sentences in their original order, each with its stop words
        removed. A sentence made only of stop words becomes ``[]``.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Lowercase once so the comparison below is case-insensitive
    # ('The' and 'the' are both removed).
    stop_word_set = {stop_word.lower() for stop_word in stop_words}

    return [
        [word for word in word_list if word.lower() not in stop_word_set]
        for word_list in list_oflists_tokens
    ]

filtered_sentences = stopwords_filter(list_of_list_words)
# Preview only the first few sentences; displaying the whole nested list
# floods the notebook output.
filtered_sentences[:3]


[['With',
  'all',
  'this',
  'stuff',
  'going',
  'down',
  'at',
  'the',
  'moment',
  'with',
  'i',
  've',
  'started',
  'listening',
  'to',
  'his',
  'music',
  'watching',
  'the',
  'odd',
  'documentary',
  'here',
  'and',
  'there',
  'watched',
  'The',
  'Wiz',
  'and',
  'watched',
  'Moonwalker',
  'again'],
 ['Maybe',
  'i',
  'just',
  'want',
  'to',
  'get',
  'a',
  'certain',
  'insight',
  'into',
  'this',
  'guy',
  'who',
  'i',
  'thought',
  'was',
  'really',
  'cool',
  'in',
  'the',
  'eighties',
  'just',
  'to',
  'maybe',
  'make',
  'up',
  'my',
  'mind',
  'whether',
  'he',
  'is',
  'guilty',
  'or',
  'innocent'],
 ['Moonwalker',
  'is',
  'part',
  'biography',
  'part',
  'feature',
  'film',
  'which',
  'i',
  'remember',
  'going',
  'to',
  'see',
  'at',
  'the',
  'cinema',
  'when',
  'it',
  'was',
  'originally',
  'released'],
 ['Some',
  'of',
  'it',
  'has',
  'subtle',
  'messages',
  'about',
  's',
  'feeling',
  'towards'

In [None]:
# Step4: Creation of co-occurrence matrix
# Square integer matrix of zeros with one row and one column per entry of
# list_of_words.
length = len(list_of_words)
cooc = np.zeros(shape=(length, length), dtype=int)

In [None]:
###updating co-occurrence matrix for each sentence
def process_sentence(sentence, window=4, vocabulary=None, matrix=None):
    """Add one sentence's co-occurrence counts to the co-occurrence matrix.

    Two words co-occur when they are at most ``window`` positions apart
    *within the sentence*. (The earlier implementation compared vocabulary
    indices instead of sentence positions, so the counts depended on where
    words happened to sit in the vocabulary list.) A word is never counted
    against itself, so the diagonal stays zero.

    Parameters
    ----------
    sentence : sequence of str
        Tokens of one sentence, in order.
    window : int, optional
        Maximum distance, in token positions, on either side of a word.
        The default of 4 matches the original threshold.
    vocabulary : sequence of str, optional
        Word list whose positions index the matrix. Defaults to the
        module-level ``list_of_words``.
    matrix : numpy.ndarray, optional
        Square count matrix updated in place. Defaults to the module-level
        ``cooc``.

    Raises
    ------
    ValueError
        If a token of ``sentence`` is not present in ``vocabulary``.
    """
    if vocabulary is None:
        vocabulary = list_of_words
    if matrix is None:
        matrix = cooc

    # Word -> index of its first occurrence (same answer as list.index),
    # built once per call instead of scanning the vocabulary for every token.
    index_of = {}
    for vocab_position, vocab_word in enumerate(vocabulary):
        index_of.setdefault(vocab_word, vocab_position)

    try:
        word_ids = [index_of[word] for word in sentence]
    except KeyError as error:
        # Keep the exception type that list.index raised for unknown words.
        raise ValueError(f"{error.args[0]!r} is not in the vocabulary") from None

    token_count = len(word_ids)
    for position, row in enumerate(word_ids):
        first = max(0, position - window)
        last = min(token_count, position + window + 1)
        for col in word_ids[first:last]:
            # Skips the word itself and repeats of the same word in the window.
            if col != row:
                matrix[row, col] += 1

In [None]:
# Reset the counts first so re-running this cell rebuilds the matrix instead
# of adding a second copy of every count on top of the previous run.
cooc.fill(0)
for sentence in filtered_sentences:
    process_sentence(sentence)

In [None]:
cooc  # final co-occurrence counts; rows and columns follow the order of list_of_words

array([[ 0,  5,  4, ...,  0,  0,  0],
       [ 5,  0, 71, ...,  0,  0,  0],
       [ 4, 71,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [None]:
# Step 5: Applying Singular Value Decomposition
# SVD for word embeddings of size 100
EMBEDDING_SIZE = 100
# TruncatedSVD uses a randomized solver by default; fixing the seed makes the
# embeddings identical on every run of the notebook.
SVD_SEED = 0
svd = TruncatedSVD(n_components=EMBEDDING_SIZE, random_state=SVD_SEED)
svd.fit(cooc)
# Each row of `result` is the EMBEDDING_SIZE-dimensional vector for the word
# at the same row of cooc.
result = svd.transform(cooc)
print(result)

[[ 2.53059455e-01  4.18103618e-02 -3.87150785e-16 ...  2.60494853e-04
   5.85496172e-04  4.94173949e-04]
 [ 2.94589905e+00 -6.31961298e-01 -4.86920163e-15 ...  2.59339967e-04
   7.14376237e-04  6.12475264e-04]
 [ 2.10423946e+01  1.70633826e+01 -3.72091815e-15 ...  4.00626184e-05
   1.18220335e-04  7.98245129e-05]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-2.75555869e-24  3.69779255e-24  8.28426643e-25 ... -4.15429768e-06
  -4.12139309e-06  2.60904055e-06]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


In [None]:
# Shape of the transformed matrix: (rows of cooc, number of SVD components).
result.shape

(8022, 100)