<a href="https://colab.research.google.com/github/savinkova-mariia/NLP/blob/main/NLP_lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from os import getcwd
import numpy as np

filePath = f"{getcwd()}/content/utils.py"
nltk.data.path.append(filePath)
from utils import build_freqs

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
df = pd.read_table('/content/reviews.tsv')

df_new = df.drop(df[(df["rating"] == 0.0)].index)

In [79]:
X = df_new.review_text
y = df_new.rating

train_pos, test_pos, train_neg, test_neg = train_test_split(X, y, test_size=0.2)

In [80]:
train_x = pd.concat([train_pos, train_neg], axis=1)
test_x = pd.concat([test_neg, test_pos], axis=1)

In [81]:
train_y = np.append(np.ones((len(pos_train), 1)), np.zeros((len(neg_train), 1)), axis=0)
test_y = np.append(np.ones((len(pos_test), 1)), np.zeros((len(neg_test), 1)), axis=0)

print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

freqs = build_freqs(train_x, train_y)

train_y.shape = (525644, 1)
test_y.shape = (131412, 1)


In [82]:
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 2


# Logistic Regression

In [83]:
# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''

    h = h = 1 / (1 + np.exp(-z))
    return h

In [84]:
# Testing your function 
if (sigmoid(0) == 0.5):
    print('SUCCESS!')
else:
    print('Oops!')

if (sigmoid(4.92) == 0.9927537604041685):
    print('CORRECT!')
else:
    print('Oops again!')

SUCCESS!
CORRECT!


In [85]:
# verify that when the model predicts close to 1, but the actual label is 0, the loss is a large positive value
-1 * (1 - 0) * np.log(1 - 0.9999) # loss is about 9.2

# verify that when the model predicts close to 0 but the actual label is 1, the loss is a large positive value
-1 * np.log(0.0001) # loss is about 9.2

9.210340371976182

In [86]:
# UNQ_C2 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    Hint: you might want to print the cost to make sure that it is going down.
    '''

    m = len(x)
    
    for i in range(0, num_iters):
        
        # get z, the dot product of x and theta
        z = np.dot(x, theta)
        
        # get the sigmoid of z
        h = sigmoid(z)
        
        # calculate the cost function
        J = -float(1)/(m) * (np.dot(np.transpose(y),np.log(h)) + np.dot(np.transpose(1-y),np.log(1-h)))
        # update the weights theta
        theta = theta - (alpha / m) * (np.dot(x.transpose(),(h - y)))
        
    J = float(J)
    return J, theta

In [87]:
# Check the function
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


# Extracting the features

In [88]:
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def extract_features(review, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_review(review)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    
    # loop through each word in the list of words
    for word in word_l:
        
        if (word,1.0) in freqs:
          x[0,1] += freqs.get((word,1.0),0)
        
        if (word,0.0) in freqs:
          x[0,2] += freqs.get((word,0.0),0)

        else:
          #print("Key is not found in the dictionary")
          pass
          
        
    assert(x.shape == (1, 3))
    return x

In [89]:
# Check your function

# test 1
# test on training data
tmp1 = extract_features(train_x[0], freqs)
print(tmp1)

# test 2:
# check for when the words are not in the freqs dictionary
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

KeyError: ignored