# import libs

In [2]:
import numpy as np
import pandas as pd 
import os

# Function to load dataset

In [3]:
def load_data(data_directory):
    #input : data_directory --> path to data folder --> = '../aclimdb'
    #Returns : test and train data as pd dataframe
    data ={}
    for split in ['train','test']:
        data[split] = []
        for sentiment in ['neg','pos']:
            score = 1 if sentiment == 'pos' else 0
            path = os.path.join(data_directory, split, sentiment)
            file_names =  os.listdir(path)
            for f_name in file_names:
                with open(os.path.join(path,f_name),'r') as f:
                    review = f.read()
                    data[split].append([review,score])
        
    np.random.shuffle(data['train'])
    data['train'] = pd.DataFrame(data['train'], columns=['text','sentiment'])
        
    np.random.shuffle(data['test'])
    data['test'] = pd.DataFrame(data['test'], columns=['text','sentiment'])
        
    return data['train'],data['test']

# load data with load_data fuction

In [4]:
train_data,test_data = load_data(data_directory = './../../data/imdb-data')
train_data.head()

Unnamed: 0,text,sentiment
0,I have been reading the reviews for this movie...,0
1,"I was 16 when I first saw the movie, and it ha...",1
2,"Every once in a while , someone out of the blu...",0
3,"Nicely done, and along with ""New voyages"" it's...",1
4,The first mistake you make in titling a film i...,0


# Sklearn libs

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer


# Clean_text function

In [6]:
import re
def clean_text(text):
    """
    Applies preprocesing to a text
        steps:
            - removing HTML tags
            - removing punctuations
            - lowering text case
    """
    #remove HTML tags 
    text = re.sub(r'<.*?>', '',text)
    
    #removing characters [\\],['],[\"]
    text = re.sub(r"\\\\", "", text)   
    text = re.sub(r"\\'", "", text)   
    text = re.sub(r"\\\"", "", text)   
    
    text = text.strip().lower()
    # replace punctuation characters with spaces\n",
    filters='!"\\'#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n'
    translate_dict = dict((c,'\'') for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

In [7]:
vectorizer = CountVectorizer(stop_words='english',preprocessor = clean_text)
training_features = vectorizer.fit_transform(train_data['text'])
test_features = vectorizer.transform(test_data['text'])

In [8]:

model = LinearSVC()
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)

# Evaluation
acc = accuracy_score(test_data["sentiment"], y_pred)

print("Accuracy on the IMDB dataset: {:.2f}".format(acc*100))

Accuracy on the IMDB dataset: 83.55


