# Class definitions

In [17]:
import pandas as pd

class Review:
    """A single review: its text paired with its label."""

    def __init__(self, text, label):
        # Both values are stored exactly as given; nothing is validated or
        # normalised here.
        self.text, self.label = text, label
        
class ReviewContainer:
    """Holds a collection of review objects and exposes their fields as lists."""

    def __init__(self, reviews):
        # The collection is kept by reference, not copied.
        self.reviews = reviews

    def get_text(self):
        """Return a new list with the ``text`` of every review, in stored order."""
        texts = []
        for review in self.reviews:
            texts.append(review.text)
        return texts

    def get_label(self):
        """Return a new list with the ``label`` of every review, in stored order."""
        labels = []
        for review in self.reviews:
            labels.append(review.label)
        return labels


# Opening the file and creating a collection of Review objects

In [18]:
# Load the labelled training reviews. The file is read as CSV and must provide
# a 'review' column and a 'label' column.
df = pd.read_csv("aclimdb_reviews_train.txt")

# Build one Review per row by walking the two columns in lockstep.
# This replaces the iterrows() loop, which built a Series for every row that
# was never used and then looked each value up again through df.loc[i, ...].
# Iterating the NumPy arrays keeps the labels as NumPy scalars, as before.
reviews = [
    Review(text, label)
    for text, label in zip(df['review'].to_numpy(), df['label'].to_numpy())
]

# Sanity check: show one sample review and its label.
print(reviews[5].text)
print(reviews[5].label)
                

Ouch! This one was a bit painful to sit through. It has a cute and amusing premise, but it all goes to hell from there. Matthew Modine is almost always pedestrian and annoying, and he does not disappoint in this one. Deborah Kara Unger and John Neville turned in surprisingly decent performances. Alan Bates and Jennifer Tilly, among others, played it way over the top. I know that's the way the parts were written, and it's hard to blame actors, when the script and director have them do such schlock. If you're going to have outrageous characters, that's OK, but you gotta have good material to make it work. It didn't here. Run away screaming from this movie if at all possible.
0


# Creating the training and test sets

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state makes the
# split reproducible across runs.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Wrap each split so texts and labels can be pulled out as parallel lists.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)



In [23]:
# Split each container into parallel lists: x = review texts, y = labels.
train_x, train_y = train_container.get_text(), train_container.get_label()
test_x, test_y = test_container.get_text(), test_container.get_label()

8374

# Bag-of-words vectorization (TF-IDF)

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Each term found by the analyzer during fit is assigned a unique integer
# index, which is that term's column in the resulting document-term matrix.

# TF-IDF vectorizer: fit_transform() learns the vocabulary from the training
# texts and returns their vectorized form in one step.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x) 
# train_x_vectors.toarray()  # uncomment to inspect the matrix as a dense array

# Alternative kept for reference: plain count vectorizer instead of TF-IDF.
# vectorizer = CountVectorizer()
# train_x_vectors = vectorizer.fit_transform(train_x)
# train_x_vectors.toarray()

# Only transform() is called here (no re-fit), so the test texts are encoded
# with the vectorizer that was fitted on the training texts.
test_x_vectors = vectorizer.transform(test_x)

# Classification: linear SVM

In [28]:
from sklearn import svm

# Support-vector classifier with a linear kernel; every other hyper-parameter
# is left at its scikit-learn default.
clf_svm = svm.SVC(kernel= 'linear')

# Train on the vectorized training texts and their labels. fit() is the last
# expression of the cell, so the notebook displays the fitted estimator.
clf_svm.fit(train_x_vectors, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# Accuracy

In [29]:
# Score the trained classifier on the held-out test vectors and labels.
svm_test_accuracy = clf_svm.score(test_x_vectors, test_y)
print(svm_test_accuracy)

0.8952727272727272


In [30]:
from sklearn.metrics import f1_score

# Predict once, then compute F1 separately for each class: average=None
# returns one score per entry of `labels`, in that order.
test_predictions = clf_svm.predict(test_x_vectors)
f1_score(test_y, test_predictions, average=None, labels=[0, 1])


array([0.89437653, 0.89615385])

# Testing

In [41]:
# A few hand-written snippets to spot-check the trained classifier.
test_set = [
    'very fun',
    "bad",
    'horrible waste of time',
    'This film is awesome',
    'I like this movie',
]

# Encode with the already-fitted vectorizer (transform only, no re-fit).
new_test = vectorizer.transform(test_set)

# Last expression of the cell: the predicted label for each snippet, in order.
clf_svm.predict(new_test)

array([1, 0, 0, 1, 0], dtype=int64)