In [1]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
data_path = '/Users/IlanaWeinstein/Desktop/Machine-Learning-Project/Data'
train = pd.DataFrame(pd.read_csv(data_path + '/' + 'train.csv', sep=','))
dev = pd.DataFrame(pd.read_csv(data_path + '/' + 'dev.csv', sep=','))

In [3]:
X_train = train.drop(['label'], axis=1)
y_train = train['label']

X_val = dev.drop(['label'], axis=1)
y_val = dev['label']

In [4]:
train_texts, train_labels = X_train['review'], y_train
val_texts, val_labels     = X_val['review'], y_val

## Data Processing

The task is to create bag-of-words features: tokenize the text, index each token, represent the sentence as a dictionary of tokens and their counts, limit the vocabulary to $n$ most frequent tokens. In the lab we use built-in `sklearn` function, `sklearn.feature_extraction.text.CountVectorizer`. 
**In this HW, you are required to implement the `Vectorizer` on your own without using `sklearn` built-in functions.**

Function `preprocess_data` takes the list of texts and returns list of (lists of tokens). 
You may use [spacy](https://spacy.io/) or [nltk](https://www.nltk.org/) text processing libraries in `preprocess_data` function. 

Class `Vectorizer` is used to vectorize the text and to create a matrix of features.


In [5]:
# !pip3 install spacy
!python3 -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [5]:
import spacy
import en_core_web_sm
# nlp = en_core_web_sm.load()

In [6]:
from tqdm import tqdm

def preprocess_data(data):
    
    preprocessed_data = []
#     nlp = spacy.load("en_core_web_sm")
    nlp = en_core_web_sm.load()
    
    #new to spacy, decided to not change any settings
    for text in tqdm(data):
        doc = nlp(text)
        new_list = []
        for token in doc:
            new_list.append(token.text)
        preprocessed_data.append(new_list)
    

    return preprocessed_data

In [29]:
test_train = train_texts[:int(.3 * len(train_texts))]
test_val = val_texts[:int(.3 * len(val_texts))]

In [30]:
# takes  while to run 

# train_data = preprocess_data(train_texts)
# print('Done train')
# val_data = preprocess_data(val_texts)

train_data = preprocess_data(test_train)
print('Done train')
val_data = preprocess_data(test_val)

100%|██████████| 75262/75262 [32:36<00:00, 38.47it/s]  


Done train


100%|██████████| 10775/10775 [04:40<00:00, 38.42it/s]


In [31]:
class Vectorizer():
    def __init__(self, max_features):
        self.max_features = max_features
        self.vocab_list = None
        self.token_to_index = None

    def fit(self, dataset):
        # Create a vocab list, self.vocab_list, using the most frequent "max_features" tokens
        
        vocab_master = np.array([ elem for row in dataset for elem in row])
        
        word_list,count = np.unique(vocab_master,return_counts = True)
        word_list_sorted = word_list[np.argsort(-count)]
        
        self.vocab_list = word_list_sorted[:self.max_features]

        # Create a token indexer, self.token_to_index, that will return index of the token in self.vocab_list
        self.token_to_index = {}

        for i,word in enumerate(self.vocab_list):
            self.token_to_index[word] = i
        
        pass

    def transform(self, dataset):
        # This function transforms text dataset into a matrix, data_matrix
        """
        YOUR CODE GOES HERE
        """
        #dictionary, append count of words only in that row
        data_matrix = np.zeros((len(dataset), len(self.vocab_list)))
        for i,row in enumerate(dataset):
            for word in row:
                data_matrix[i][self.token_to_index.get(word)] += 1
  
        
        return data_matrix

In [48]:
max_features = 2000 # TODO: Replace None with a number
vectorizer = Vectorizer(max_features=max_features)
vectorizer.fit(train_data)
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
# X_test = vectorizer.transform(test_data)

y_train = np.array(train_labels[:75262]) #remove slice with full dataset 
y_val = np.array(val_labels[:10775])
# y_test = np.array(test_labels)

vocab = vectorizer.vocab_list

In [49]:
X_train

array([[ 6.,  4.,  2., ...,  2.,  2.,  2.],
       [ 9.,  7.,  9., ...,  5.,  5.,  5.],
       [ 4.,  3.,  3., ...,  1.,  1.,  1.],
       ...,
       [11., 10.,  8., ...,  6.,  6.,  6.],
       [16.,  8.,  8., ...,  8.,  8.,  8.],
       [11.,  9.,  6., ...,  6.,  6.,  6.]])

In [50]:
vocab

array(['.', 'the', ',', ..., '[', 'frozen', 'Being'], dtype='<U227')

## Model

We train logistic regression model and save prediction for train, val and test.

In [51]:
from sklearn.linear_model import LogisticRegression

# Define Logistic Regression model
model = LogisticRegression(random_state=0, solver='liblinear')

# Fit the model to training data
model.fit(X_train, y_train)

# Make prediction using the trained model
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
# y_test_pred = model.predict(X_test)

In [52]:
y_train_pred

array([0, 0, 0, ..., 0, 0, 0])

## Accuracy ## 

In [53]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [55]:
print(f"Training accuracy: {accuracy_score(y_train, y_train_pred):.3f}, "
      f"F1 score: {f1_score(y_train, y_train_pred):.3f}")
print(f"Validation accuracy: {accuracy_score(y_val, y_val_pred):.3f}, "
      f"F1 score: {f1_score(y_val, y_val_pred):.3f}")
# print(f"Test accuracy: {accuracy_score(y_test, y_test_pred):.3f}, "
#       f"F1 score: {f1_score(y_test, y_test_pred):.3f}")

Training accuracy: 0.901, F1 score: 0.051
Validation accuracy: 0.899, F1 score: 0.036
