In [1]:
import numpy as np
import matplotlib as plt
from sklearn import neighbors
from sklearn.cluster import KMeans
import re
from collections import Counter
%matplotlib inline

In [2]:
def remove_non_ascii(s):
    """Return `s` with every character whose code point is 128 or above dropped."""
    return "".join(ch for ch in s if ord(ch) < 128)


def pre_process(doc):
    """
    Pre-process a raw document (tweet) into a list of tokens.

    Steps, in order:
      * converts the text to lower case,
      * drops non-ASCII characters,
      * replaces URLs with the placeholder token 'url',
      * strips any run that starts at $, @, &, *, ~ or a digit and extends
        to the next whitespace (at least one following character required),
      * replaces punctuation with spaces,
      * splits on whitespace and drops tokens shorter than 3 characters.

    doc: the document as a single string

    returns the list of remaining tokens, in their original order
    """
    doc = remove_non_ascii(doc.lower())

    # Raw strings keep the regex escapes away from Python's own string-escape
    # processing (the non-raw form relies on invalid escape sequences).
    # The dots in "www." and "bit.ly" are escaped so that they only match a
    # literal dot; unescaped, "www." also matched ordinary words like "wwwabc".
    url_pattern = r"http://[^\s]+|https://[^\s]+|www\.[^\s]+|[^\s]+\.com|bit\.ly/[^\s]+"
    doc = re.sub(url_pattern, 'url', doc)

    # removing dollars and usernames and other unnecessary stuff
    userdoll_pattern = r"\$[^\s]+|\@[^\s]+|\&[^\s]+|\*[^\s]+|[0-9][^\s]+|\~[^\s]+"
    doc = re.sub(userdoll_pattern, '', doc)

    # punctuation becomes a space (not an empty string) so that words joined
    # by punctuation are split into separate tokens
    punctuation = r"\(|\)|#|\'|\"|-|:|\\|\/|!|\?|_|,|=|;|>|<|\.|\@"
    doc = re.sub(punctuation, ' ', doc)

    return [w for w in doc.split() if len(w) > 2]

In [3]:
def construct_termdoc(docs, vocab=None):
    """
    Construct a term-by-document matrix.

    docs: corpus, an iterable of tokenised documents (each an iterable of
          hashable terms)
    vocab: pre-defined vocabulary (a sequence of terms); if it is None or
           empty, the vocabulary is induced from the corpus as the sorted
           list of distinct terms

    returns (termdoc, vocab):
        termdoc is an integer numpy array of shape (n_docs, n_vocab) whose
        entry [j, k] is the number of times vocab[k] occurs in document j;
        vocab is the vocabulary used (the object passed in, or the induced
        list). Terms that are not in the vocabulary are ignored.
    """
    # per-document term frequencies
    termdoc_sparse = [Counter(doc) for doc in docs]

    # `None` replaces the former mutable `[]` default; an explicitly passed
    # empty vocabulary still triggers induction, as before.
    if vocab is None or len(vocab) == 0:
        vocab = sorted(set().union(*termdoc_sparse))

    # term -> column lookup, built once instead of a linear `vocab.index`
    # scan per term. `setdefault` keeps the first column when the vocabulary
    # contains duplicates, which is what `list.index` returned.
    column_of = {}
    for col, term in enumerate(vocab):
        column_of.setdefault(term, col)

    termdoc_dense = np.zeros((len(termdoc_sparse), len(vocab)), dtype=int)

    for j, doc_sparse in enumerate(termdoc_sparse):
        for term, freq in doc_sparse.items():
            col = column_of.get(term)
            # out-of-vocabulary terms are skipped explicitly rather than
            # through a bare `except` that would also hide real errors
            if col is not None:
                termdoc_dense[j, col] = freq

    return termdoc_dense, vocab

In [4]:
# YOU ARE REQUIRED TO INSERT YOUR CODE IN THIS CELL

def Euclidean_distance(x,y):
    '''
    Compute and return the Euclidean distance between two vectors x and y.

    x, y: array-likes of real numbers with compatible shapes

    returns sqrt(sum((x - y)**2)) as a float
    '''
    # Convert each input once, and to float: subtracting unsigned or narrow
    # integer arrays would wrap around instead of producing a negative
    # difference, silently giving a wrong distance.
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.sqrt((diff * diff).sum())

In [5]:
# YOU ARE REQUIRED TO INSERT YOUR CODE IN THIS CELL
def cosine_distance(x,y):
    '''
    Compute and return the cosine distance between two vectors x and y,
    i.e. 1 - (x . y) / (|x| * |y|).

    x, y: array-likes of real numbers with compatible shapes

    returns the distance as a float; NaN when either vector has zero norm,
    because the angle to a zero vector is undefined
    '''
    # Accept plain lists/tuples as Euclidean_distance does, and work in float
    # so the division is a float division and integer inputs cannot overflow.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    num = (x * y).sum()
    denom = np.sqrt((x * x).sum()) * np.sqrt((y * y).sum())

    # A zero vector would give 0/0: return NaN explicitly instead of letting
    # numpy emit a RuntimeWarning for the invalid division.
    if denom == 0:
        return np.nan
    return 1.0 - num/denom

In [6]:
# YOU ARE REQUIRED TO INSERT YOUR CODE IN THIS CELL
def compute_distance_matrices(termdoc):
    '''
    Compute the pairwise Euclidean and cosine distance matrices of a
    term-by-document matrix.

    termdoc: 2-D numpy array with one row per tweet/document

    returns (euclidean_distance_matrix, cosine_distance_matrix): two float
    numpy arrays of shape (n_rows, n_rows) whose element (i, j) stores the
    Euclidean distance, respectively the cosine distance, between the i-th
    and the j-th row of termdoc
    '''
    n_rows = termdoc.shape[0]
    euclidean_distance_matrix = np.zeros((n_rows, n_rows))
    cosine_distance_matrix = np.zeros((n_rows, n_rows))

    for i in range(n_rows):
        # Both distances are symmetric in their arguments, so each pair is
        # evaluated once (j >= i) and mirrored into the lower triangle.
        # The diagonal is still computed rather than assumed to be 0.
        for j in range(i, n_rows):
            euc = Euclidean_distance(termdoc[i, :], termdoc[j, :])
            cos = cosine_distance(termdoc[i, :], termdoc[j, :])
            euclidean_distance_matrix[i, j] = euc
            euclidean_distance_matrix[j, i] = euc
            cosine_distance_matrix[i, j] = cos
            cosine_distance_matrix[j, i] = cos

    return euclidean_distance_matrix, cosine_distance_matrix

##### KMeans

In [21]:
# Three points, three clusters; then ask which cluster the point (3, 4)
# is assigned to.
x = np.array([[8, 1], [0, 0], [2, 7]])
kmeans = KMeans(3).fit(x)  # fit() returns the fitted estimator
print(kmeans.predict(np.array([[3, 4]])))



[2]


##### KNN

In [8]:
# 1-nearest-neighbour classification of two test points.
X_train = np.array([[1, 2], [6, 3], [3, 7], [10, 4]])
Y_train = np.array([1, 3, 2, 4])
X_test = np.array([[6, 8], [10, 4]])

knn = neighbors.KNeighborsClassifier(1).fit(X_train, Y_train)  # fit() returns the fitted estimator
Y_test = knn.predict(X_test)
print(Y_test)

[2 4]


##### Linear Regression

In [9]:
# Evaluate the line y = 11.3 * x - 47.1 at x = 9.
slope, intercept, x_new = 11.3, -47.1, 9
y = slope * x_new + intercept
print(y)

54.6


##### Vector Space Model

|index|   vocab   |doc1|doc2|doc3|
|:---:|-----------|:--:|:--:|:--:|
|  1  |   goal    |  1 |  0 |  0 |
|  2  |   data    |  1 |  2 |  2 |
|  3  |information|  2 |  2 |  2 |
|  4  |  insight  |  1 |  0 |  0 |
|  5  |    you    |  0 |  2 |  2 |

In [10]:
# Fixed vocabulary for the vector space model, and the three-document corpus.
keywords = ['goal', 'data', 'information', 'insight', 'you']

doc1 = 'The goal is to turn data into information and information into insight.'
doc2 = 'You can have data without information, but you cannot have information without data.'
doc3 = 'You can have data without information, but can you have information without data?'

doc = [doc1, doc2, doc3]
doc

['The goal is to turn data into information and information into insight.',
 'You can have data without information, but you cannot have information without data.',
 'You can have data without information, but can you have information without data?']

In [11]:
# Tokenise every document of the corpus with pre_process.
doc_processed = [pre_process(text) for text in doc]

In [12]:
# Build the term-by-document matrix restricted to the fixed `keywords`
# vocabulary; tokens outside that vocabulary are not counted.
doc_termdoc, doc_vocab = construct_termdoc(doc_processed, keywords)

In [13]:
# (number of documents, vocabulary size)
doc_termdoc.shape

(3, 5)

In [14]:
# A vocabulary was supplied, so construct_termdoc hands it back unchanged.
doc_vocab

['goal', 'data', 'information', 'insight', 'you']

In [15]:
# One row per document, one column per term in doc_vocab order.
doc_termdoc

array([[1, 1, 2, 1, 0],
       [0, 2, 2, 0, 2],
       [0, 2, 2, 0, 2]])

##### Euclidean Distances

In [16]:
# Print the Euclidean distance for every ordered pair of documents.
n_rows = doc_termdoc.shape[0]
for i in range(n_rows):
    row_i = doc_termdoc[i, :]
    for j in range(n_rows):
        dist = Euclidean_distance(row_i, doc_termdoc[j, :])
        print("x: {}, y: {}".format(i, j))
        print('Euclidean: {}'.format(dist))
        print()

x: 0, y: 0
Euclidean: 0.0

x: 0, y: 1
Euclidean: 2.6457513110645907

x: 0, y: 2
Euclidean: 2.6457513110645907

x: 1, y: 0
Euclidean: 2.6457513110645907

x: 1, y: 1
Euclidean: 0.0

x: 1, y: 2
Euclidean: 0.0

x: 2, y: 0
Euclidean: 2.6457513110645907

x: 2, y: 1
Euclidean: 0.0

x: 2, y: 2
Euclidean: 0.0



##### Cosine Distances

In [17]:
# Print the cosine distance for every ordered pair of documents.
n_rows = doc_termdoc.shape[0]
for i in range(n_rows):
    row_i = doc_termdoc[i, :]
    for j in range(n_rows):
        dist = cosine_distance(row_i, doc_termdoc[j, :])
        print("x: {}, y: {}".format(i, j))
        print('Cosine: {}'.format(dist))
        print()

x: 0, y: 0
Cosine: 1.1102230246251565e-16

x: 0, y: 1
Cosine: 0.3453463292920228

x: 0, y: 2
Cosine: 0.3453463292920228

x: 1, y: 0
Cosine: 0.3453463292920228

x: 1, y: 1
Cosine: -2.220446049250313e-16

x: 1, y: 2
Cosine: -2.220446049250313e-16

x: 2, y: 0
Cosine: 0.3453463292920228

x: 2, y: 1
Cosine: -2.220446049250313e-16

x: 2, y: 2
Cosine: -2.220446049250313e-16



##### Distance Matrix

In [18]:
# Pairwise distance matrices between the rows (documents) of doc_termdoc:
# Euclidean first, cosine second.
doc_euc, doc_cos = compute_distance_matrices(doc_termdoc)

In [19]:
# Entry (i, j) is the Euclidean distance between document i and document j.
doc_euc

array([[0.        , 2.64575131, 2.64575131],
       [2.64575131, 0.        , 0.        ],
       [2.64575131, 0.        , 0.        ]])

In [20]:
# Entry (i, j) is the cosine distance between document i and document j.
# Values on the order of 1e-16 (of either sign) are floating-point rounding
# and should be read as 0.
doc_cos

array([[ 1.11022302e-16,  3.45346329e-01,  3.45346329e-01],
       [ 3.45346329e-01, -2.22044605e-16, -2.22044605e-16],
       [ 3.45346329e-01, -2.22044605e-16, -2.22044605e-16]])