In [1]:
import sys
import time
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score
from __future__ import division
from collections import Counter
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the saved KeyedVectors from 'embs_train.kv'. `wv` is a notebook-level
# global that sentence_embedding() reads for vector lookups and `vector_size`.
wv = KeyedVectors.load('embs_train.kv')

In [3]:
def sentence_embedding(sentence, word_counts, vectors=None, max_rare_count=1):
    """Average the word vectors of the usable tokens in a sentence.

    A token is usable when it has an entry in ``vectors`` and its count in
    ``word_counts`` is strictly greater than ``max_rare_count``.

    Args:
        sentence: Text to embed; tokenized with ``str.split()`` (whitespace).
        word_counts: Mapping from token to its count (``Counter`` or plain
            ``dict``). Tokens missing from the mapping are treated as count 0.
        vectors: Embedding lookup supporting ``token in vectors``,
            ``vectors[token]`` and ``vectors.vector_size``. When ``None``
            (the default) the notebook-level ``wv`` is used, which preserves
            the original two-argument behavior.
        max_rare_count: Tokens whose count is at or below this value are
            skipped. The default of 1 keeps the original ``count > 1`` rule.

    Returns:
        The element-wise mean of the selected vectors, or a zero vector of
        length ``vectors.vector_size`` when no token qualifies.
    """
    if vectors is None:
        # Fall back to the module-level embeddings so existing callers that
        # pass only (sentence, word_counts) keep working unchanged.
        vectors = wv

    kept = [
        vectors[token]
        for token in sentence.split()
        # .get() avoids a KeyError when word_counts is a plain dict.
        if token in vectors and word_counts.get(token, 0) > max_rare_count
    ]

    if not kept:
        return np.zeros(vectors.vector_size)
    return np.mean(kept, axis=0)

In [4]:
def read_from(dataframe):
    """Yield ``(label, embedding)`` pairs from a labelled review frame.

    Args:
        dataframe: DataFrame with a ``'Label'`` column and a
            ``'Review_Embedding'`` column.

    Yields:
        Tuples ``(label, embedding)`` in row order, where ``label`` is ``1``
        when the row's ``'Label'`` equals ``'+'`` and ``-1`` for any other
        value, and ``embedding`` is the row's ``'Review_Embedding'`` value.
    """
    # Walk the two columns in lockstep instead of DataFrame.iterrows(), which
    # constructs a full Series for every row just to read two fields.
    for raw_label, embedding in zip(dataframe['Label'], dataframe['Review_Embedding']):
        yield (1 if raw_label == '+' else -1, embedding)

In [5]:
def test(dev_data, model):
    """Return the error rate of a linear model on ``dev_data``.

    An example counts as an error when ``label * dot(model, embedding)`` is
    less than or equal to zero, so a score of exactly zero is a mistake.

    Args:
        dev_data: Frame accepted by ``read_from`` (it is also passed to
            ``len`` to obtain the number of examples).
        model: Weight vector combined with each embedding via ``np.dot``.

    Returns:
        Number of errors divided by ``len(dev_data)``.
    """
    mistakes = sum(
        label * np.dot(model, emb) <= 0
        for label, emb in read_from(dev_data)
    )
    return mistakes / len(dev_data)

In [6]:
def train_with_svm(train_data, dev_data, c_values=(0.1,)):
    """Fit linear-kernel SVMs on standardized embeddings and keep the best one.

    For every ``C`` in ``c_values`` an ``SVC(kernel='linear')`` is trained on
    the training split and scored on the dev split; the model with the lowest
    dev error is returned. Progress and the best error are printed.

    Args:
        train_data: DataFrame with ``'Label'`` (``'+'`` / ``'-'``) and
            ``'Review_Embedding'`` columns used for fitting.
        dev_data: DataFrame with the same columns used for model selection.
        c_values: Iterable of SVM regularization strengths to try. Defaults
            to ``(0.1,)``, the single value the notebook originally used.

    Returns:
        The fitted ``SVC`` with the lowest dev error rate.

    Raises:
        ValueError: If ``c_values`` is empty, or if either split contains a
            label other than ``'+'`` or ``'-'``.
    """
    c_values = list(c_values)
    if not c_values:
        raise ValueError("c_values must contain at least one value of C")

    label_to_target = {'+': 1, '-': -1}

    X_train = np.array(train_data['Review_Embedding'].tolist())
    X_dev = np.array(dev_data['Review_Embedding'].tolist())

    y_train = train_data['Label'].map(label_to_target)
    y_dev = dev_data['Label'].map(label_to_target)

    # Series.map leaves NaN for values missing from the mapping; report that
    # here with a clear message instead of failing later inside scikit-learn.
    for split_name, targets in (('train', y_train), ('dev', y_dev)):
        if targets.isna().any():
            raise ValueError(f"{split_name} data contains labels other than '+' or '-'")

    # Fit the scaler on the training split only and reuse its statistics for
    # the dev split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # Start from +inf so that even a model with 100% dev error is retained;
    # starting from 1.0 with a strict '<' would return None in that case.
    best_dev_err = float('inf')
    best_svm_model = None

    for c in c_values:
        # perf_counter is monotonic, so the duration is unaffected by system
        # clock adjustments.
        start_time = time.perf_counter()
        svm = SVC(kernel='linear', C=c, random_state=42)
        svm.fit(X_train, y_train)
        elapsed = time.perf_counter() - start_time

        dev_err = 1 - svm.score(X_dev, y_dev)

        print(f"C = {c}, dev_err = {dev_err * 100:.2f}%, Time: {elapsed:.2f} seconds")

        if dev_err < best_dev_err:
            best_dev_err = dev_err
            best_svm_model = svm

    print(f"Best dev error rate: {best_dev_err * 100:.2f}%")

    return best_svm_model

In [7]:
if __name__ == "__main__":
    import os

    # Data directory: defaults to the original location, but can be redirected
    # by setting the HW4_DATA_DIR environment variable so the notebook is not
    # tied to one machine's absolute path.
    data_dir = os.environ.get('HW4_DATA_DIR', 'C:/3rd term/ML/HW-4/hw4-data')

    # Both splits are tab-separated, header-less files with two columns.
    read_options = dict(sep='\t', header=None, names=['Label', 'Review'])
    train_data = pd.read_csv(f"{data_dir}/train.txt", **read_options)
    dev_data = pd.read_csv(f"{data_dir}/dev.txt", **read_options)

    # Token frequencies are counted on the training split only and then used
    # to filter tokens when embedding both splits.
    word_counts = Counter()
    for review in train_data['Review']:
        word_counts.update(review.split())

    train_data['Review_Embedding'] = train_data['Review'].apply(
        lambda review: sentence_embedding(review, word_counts)
    )
    dev_data['Review_Embedding'] = dev_data['Review'].apply(
        lambda review: sentence_embedding(review, word_counts)
    )

    # Train with SVM
    best_svm_model = train_with_svm(train_data, dev_data)

C = 0.1, dev_err = 23.40%, Time: 44.16 seconds
Best dev error rate: 23.40%
