# Dataset: https://www.kaggle.com/nltkdata/movie-review

### Import libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

### Import Data

In [None]:
# Load the movie-review CSV; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='movie_review.csv')

In [None]:
# Peek at the first five rows to check which columns were loaded.
data.head(n=5)

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [None]:
# Model input: the raw text of each row. Selecting the column by name instead
# of by position fails loudly (KeyError) if the CSV layout changes, rather than
# silently picking up whatever column happens to be second from the end.
X = data['text']

In [None]:
# Target labels: the sentiment tag of each row (e.g. 'pos' / 'neg'). Selected
# by column name so a reordered or extended CSV cannot silently change the target.
y = data['tag']

In [None]:
# Display the text Series that will be fed to the model (pandas elides the
# middle rows of a long Series in the rendered output).
X

0        films adapted from comic books have had plenty...
1        for starters , it was created by alan moore ( ...
2        to say moore and campbell thoroughly researche...
3        the book ( or " graphic novel , " if you will ...
4        in other words , don't dismiss this film becau...
                               ...                        
64715    that lack of inspiration can be traced back to...
64716    like too many of the skits on the current inca...
64717    after watching one of the " roxbury " skits on...
64718     bump unsuspecting women , and . . . that's all .
64719    after watching _a_night_at_the_roxbury_ , you'...
Name: text, Length: 64720, dtype: object

## CountVectorizer

In [None]:
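# Reference example (kept commented out): how CountVectorizer turns raw text
# into a matrix of token counts.
# from sklearn.feature_extraction.text import CountVectorizer
# corpus = ['This is the first document.',
#             'This document is the second document.',
#             'And this is the third one.',
#             'Is this the first document?']
# vectorizer = CountVectorizer()
# X_demo = vectorizer.fit_transform(corpus)  # separate name so the dataset's X is not overwritten
# print(vectorizer.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
# print(X_demo.toarray())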

## Train test split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): rows are individual sentences and several share one html_id,
# so a row-level random split can place sentences from the same review in both
# train and test -- consider a grouped split if that matters for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.20,
)

### TfidfVectorizer documentation
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

### Working With Text Data tutorial
https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [None]:
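# Reference example (kept commented out): how TfidfVectorizer turns raw text
# into a TF-IDF weighted feature matrix.
# from sklearn.feature_extraction.text import TfidfVectorizer
# corpus = ['This is the first document.',
#             'This document is the second document.',
#             'And this is the third one.',
#             'Is this the first document?']

# vectorizer = TfidfVectorizer()
# X_demo = vectorizer.fit_transform(corpus)  # separate name so the dataset's X is not overwritten
# print(vectorizer.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2

# print(X_demo.toarray())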

In [None]:
# Wrap vectorization and classification in a single scikit-learn Pipeline so
# that raw text can be passed straight to fit() / predict(): the 'tfidf' step
# converts the text to TF-IDF features and the 'clf' step trains a linear SVM
# on them. Because the vectorizer is fitted inside the pipeline, it only ever
# sees the data passed to fit().
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [None]:
# Fit the vectorizer and the classifier on the training split only; the fitted
# pipeline is the cell's displayed value.
text_clf.fit(X=X_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [None]:
# Evaluate on the held-out split. accuracy_score is declared as
# accuracy_score(y_true, y_pred); the arguments are passed by keyword so the
# ground truth and the predictions cannot be swapped. (Accuracy itself is
# symmetric, so the reported value is unchanged, but the same call pattern
# reused with precision/recall/confusion_matrix would give wrong results.)
y_pred = text_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6983158220024722

In [None]:
# Sanity-check the fitted pipeline on a hand-written review; predict() expects
# an iterable of documents, hence the one-element list.
sample_reviews = ['Avoid this movie at any cost, just not good']
y_pred = text_clf.predict(sample_reviews)

In [None]:
# Show the label predicted for the hand-written sample review.
y_pred

array(['neg'], dtype=object)