In [1]:
import sys
import time
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the saved gensim KeyedVectors from 'embs_train.kv' (path is relative to
# the working directory). `wv` is read as a module-level global by
# sentence_embedding() for membership tests, vector lookup and `vector_size`.
wv = KeyedVectors.load('embs_train.kv')

In [3]:
def sentence_embedding(sentence, word_counts):
    """Return the mean word vector of a whitespace-tokenised sentence.

    A token contributes only when it is present in the module-level ``wv``
    vectors and its entry in ``word_counts`` is greater than 1.

    Args:
        sentence: Text to embed; it is split on whitespace.
        word_counts: Token-to-count mapping, indexed with ``[]`` for every
            token that is found in ``wv``.

    Returns:
        The element-wise mean of the kept vectors, or a zero vector of
        length ``wv.vector_size`` when no token qualifies.
    """
    kept_vectors = [
        wv[word]
        for word in sentence.split()
        if word in wv and word_counts[word] > 1
    ]

    # Nothing usable in this sentence: fall back to an all-zero embedding.
    if not kept_vectors:
        return np.zeros(wv.vector_size)

    return np.mean(kept_vectors, axis=0)

In [4]:
def replace_question_mark(predictions, testfile, outfile="test.txt.predicted", encoding="utf-8"):
    """Write a copy of ``testfile`` with each leading '?' replaced by a label.

    Every line that starts with '?' gets that character replaced by '+' when
    ``predictions[i] == 1`` and by '-' otherwise, where ``i`` is the 0-based
    index of the line in ``testfile``. All other lines are copied unchanged.

    Args:
        predictions: Indexable sequence of predicted labels, looked up by
            line index.
        testfile: Path of the unlabelled input file.
        outfile: Path of the file to write. Defaults to
            ``"test.txt.predicted"`` in the current working directory.
        encoding: Text encoding used for both reading and writing. Defaults
            to UTF-8 so the result does not depend on the platform locale.

    Raises:
        IndexError: If ``predictions`` has no entry for a '?' line. The
            output file is not created or truncated in that case.
    """
    with open(testfile, 'r', encoding=encoding) as src:
        lines = src.readlines()

    # Build the complete output before opening the destination so that a
    # failure while indexing `predictions` cannot leave a half-written file.
    labelled_lines = []
    for i, line in enumerate(lines):
        if line.startswith('?'):
            sign = '+' if predictions[i] == 1 else '-'
            line = f"{sign}{line[1:]}"
        labelled_lines.append(line)

    with open(outfile, 'w', encoding=encoding) as dst:
        dst.writelines(labelled_lines)

In [5]:
def train_with_svm(train_data, test_data, c_value=0.1,
                   test_file='C:/3rd term/ML/HW-4/hw4-data/test.txt'):
    """Train a linear SVM on averaged word embeddings and label the test file.

    Steps: count token frequencies over the training reviews, embed every
    train and test review with ``sentence_embedding``, standardise the
    features, fit a linear ``SVC``, predict the test rows, and pass the
    predictions to ``replace_question_mark`` to write the labelled file.

    Args:
        train_data: DataFrame with a ``'Label'`` column ('+' or '-') and a
            ``'Review'`` column of text.
        test_data: DataFrame with a ``'Review'`` column of text.
        c_value: Regularisation parameter ``C`` passed to ``SVC``.
        test_file: Path of the raw test file handed to
            ``replace_question_mark``.

    Returns:
        The fitted ``SVC`` model.

    Raises:
        ValueError: If ``train_data['Label']`` contains a value other than
            '+' or '-'.

    Side effects:
        Adds a ``'Review_Embedding'`` column to both ``train_data`` and
        ``test_data``, prints the training time, and writes the prediction
        file via ``replace_question_mark``.
    """
    # Token frequencies come from the training reviews only; they are also
    # used to filter tokens when embedding the test reviews.
    word_counts = Counter()
    for review in train_data['Review']:
        word_counts.update(review.split())

    train_data['Review_Embedding'] = train_data['Review'].apply(lambda x: sentence_embedding(x, word_counts))
    test_data['Review_Embedding'] = test_data['Review'].apply(lambda x: sentence_embedding(x, word_counts))

    X_train = np.array(train_data['Review_Embedding'].tolist())
    X_test = np.array(test_data['Review_Embedding'].tolist())

    # Fit the scaler on the training features only and reuse its statistics
    # for the test features. Fitting a second scaler on the test set would
    # put train and test in different feature spaces.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    y_train = train_data['Label'].map({'+': 1, '-': -1})
    # .map() turns any unmapped label into NaN; report it explicitly.
    unknown = y_train.isna()
    if unknown.any():
        bad_labels = sorted(map(str, train_data.loc[unknown, 'Label'].unique()))
        raise ValueError(f"Unexpected training labels (expected '+' or '-'): {bad_labels}")

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    svm = SVC(kernel='linear', C=c_value, random_state=42)
    svm.fit(X_train, y_train)
    end_time = time.perf_counter()

    print(f"Training Time: {end_time - start_time:.2f} seconds")

    predictions = svm.predict(X_test)
    replace_question_mark(predictions, test_file)

    return svm

In [6]:
if __name__ == "__main__":
    # Both inputs are read as tab-separated, header-less two-column tables.
    column_names = ['Label', 'Review']
    train_path = 'C:/3rd term/ML/HW-4/hw4-data/train.txt'
    test_path = 'C:/3rd term/ML/HW-4/hw4-data/test.txt'

    train_data = pd.read_csv(train_path, sep='\t', header=None, names=column_names)
    test_data = pd.read_csv(test_path, sep='\t', header=None, names=column_names)

    # Fit the SVM and write the predicted labels for the test file.
    best_svm_model = train_with_svm(train_data, test_data)

Training Time: 36.44 seconds
