In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv('Data/reviews.csv')
df = df.sample(frac=0.01)

In [6]:
df['sentiment'] = df['sentiment'].apply(lambda x: 0 if x == 'negative' else 1)
df

Unnamed: 0,review,sentiment
6922,Superb silent version of the story of Francois...,1
35230,"Well, they say nymphomania leaves you unsatisf...",0
8768,I wish they would just make a special section ...,0
10279,eXistenZ combines director David Cronenberg's ...,1
20370,I am huge movie enthusiast and also an active ...,1
...,...,...
29140,If this film is an accurate display of J. Smit...,0
35067,I thought maybe... maybe this could be good. A...,0
42012,I admire 'Kissing on the Mouth' for its frankn...,0
47130,I thought this was a beautiful movie- very bra...,1


In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_tweet = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_tweet)

In [10]:
import re
import nltk

def preprocess(text):
    text = ' '.join(text.split())
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = remove_stopwords(text)

    return text

In [11]:
df['review'] = df['review'].apply(preprocess)
df

Unnamed: 0,review,sentiment
6922,superb silent version story francois villon al...,1
35230,well say nymphomania leaves unsatisfied dont k...,0
8768,wish would make special section video rental s...,0
10279,existenz combines director david cronenbergs t...,1
20370,huge movie enthusiast also active rugby player...,1
...,...,...
29140,film accurate display j smits acting skills th...,0
35067,thought maybe maybe could good early appearanc...,0
42012,admire kissing mouth frankness pubic hair cutt...,0
47130,thought beautiful movie brave beautiful imager...,1


In [17]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'] , 
                                   random_state=104,  
                                   test_size=0.1,  
                                   shuffle=True)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

x = vectorizer.fit_transform(df['review'])     

In [26]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(x, df['sentiment'] , 
                                   random_state=104,  
                                   test_size=0.1,  
                                   shuffle=True)

In [29]:
X_test

<50x1975 sparse matrix of type '<class 'numpy.float64'>'
	with 3189 stored elements in Compressed Sparse Row format>

In [30]:
import time
from sklearn import svm
from sklearn.metrics import classification_report

classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(X_train, y_train)
prediction_linear = classifier_linear.predict(X_test)

In [31]:
report = classification_report(y_test, prediction_linear, output_dict=True)

In [32]:
report

{'0': {'precision': 0.7647058823529411,
  'recall': 0.8666666666666667,
  'f1-score': 0.8125,
  'support': 30.0},
 '1': {'precision': 0.75,
  'recall': 0.6,
  'f1-score': 0.6666666666666666,
  'support': 20.0},
 'accuracy': 0.76,
 'macro avg': {'precision': 0.7573529411764706,
  'recall': 0.7333333333333334,
  'f1-score': 0.7395833333333333,
  'support': 50.0},
 'weighted avg': {'precision': 0.7588235294117647,
  'recall': 0.76,
  'f1-score': 0.7541666666666665,
  'support': 50.0}}