In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from bs4 import BeautifulSoup # text processing
import re
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Read labeled training and test data

train =pd.read_csv('./all/labeledTrainData.tsv', delimiter='\t', quoting=3)
test= pd.read_csv('./all/testData.tsv',delimiter='\t', quoting=3)
# Any results you write to the current directory are saved as output.

### 詞性的預處理

In [3]:
# lemmatize
lemma=WordNetLemmatizer()

In [4]:
# Defining a function to preprocess and clean data:
#'BeautifulSoup package' used to clean the data removing unwanted HTML.
#'Re package' used to remove unwanted punctuations. Few punctuations like '!', '?' and numeric numbers 
#are not removed as it may be helpful in predicting sementics. 
#'Tokenizer' used to convert paragraph into array instead of split(). This has improved performace as
#it can treat puctuations as separate word. Further steps include 'Stemming' and getting rid of 'stopwords'


def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review,"lxml").get_text()      # remove html
    letters = re.sub("[^a-zA-Z0-9!?'-]", " ", review_text)         # passing only alphabets, numbers and some few punctuations
    words_arr=[lemma.lemmatize(w) for w in word_tokenize(str(letters).lower())]   #Lammetize and tokenize
    stops = set(stopwords.words("english"))                                 
    meaningful_words = [w for w in words_arr if not w in stops]           #removing common english words
    return( " ".join( meaningful_words ))

In [5]:
## let's take one example and see the difference
train['review'][1]

'"\\"The Classic War of the Worlds\\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \\"critics\\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \\"critics\\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \\"critics\\" perceive to be its shortcomings."'

In [7]:
# cleaned paragraph ex.
clean_review = review_to_words( train["review"][1] )
print(clean_review)

classic war world timothy hines entertaining film obviously go great effort length faithfully recreate h g well ' classic book mr hines succeeds watched film appreciated fact wa standard predictable hollywood fare come every year e g spielberg version tom cruise slightest resemblance book obviously everyone look different thing movie envision amateur critic look criticize everything others rate movie important base like entertained people never agree critic enjoyed effort mr hines put faithful h g well ' classic novel found entertaining made easy overlook critic perceive shortcoming


In [8]:
# we can proceed to process full training and test data
num_reviews = train["review"].size
clean_train_reviews = []

print ("Cleaning and parsing the training set movie reviews...\n")
clean_train_reviews = []
for i in range( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print ("Review %d of %d" % ( i+1, num_reviews ))                                                                   
    clean_train_reviews.append( review_to_words( train['review'][i] ))

Cleaning and parsing the training set movie reviews...

Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 25000 of 25000


In [9]:
# Cleaning and Parsing Test Data

numOfRev=len(test)
clean_test_reviews=[]
print("Cleaning and parsing the test set movie reviews...\n")
for i in range(0,numOfRev):
    if( (i+1) % 1000 == 0 ):
        print("Review %d of %d\t" % (i+1, numOfRev))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append(clean_review)


Cleaning and parsing the test set movie reviews...

Review 1000 of 25000	
Review 2000 of 25000	
Review 3000 of 25000	
Review 4000 of 25000	
Review 5000 of 25000	
Review 6000 of 25000	
Review 7000 of 25000	
Review 8000 of 25000	
Review 9000 of 25000	
Review 10000 of 25000	
Review 11000 of 25000	
Review 12000 of 25000	
Review 13000 of 25000	
Review 14000 of 25000	
Review 15000 of 25000	
Review 16000 of 25000	
Review 17000 of 25000	
Review 18000 of 25000	
Review 19000 of 25000	
Review 20000 of 25000	
Review 21000 of 25000	
Review 22000 of 25000	
Review 23000 of 25000	
Review 24000 of 25000	
Review 25000 of 25000	


### CountVectorizer轉換為矩陣

In [10]:
#creating bag of words

from sklearn.feature_extraction.text import CountVectorizer              #Importing Vectorizer
vectorizer= CountVectorizer(analyzer='word',max_features=2500)

train_data_features = vectorizer.fit_transform(clean_train_reviews)     #Vectorizing training Data
train_data_features = train_data_features.toarray() 

test_data_features = vectorizer.transform(clean_test_reviews)            #Vectorize Test Data
test_data_features = test_data_features.toarray()

### 使用TF-IDF與SVM

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(train_data_features)       #  TFIDF
messages_tfidf = tfidf_transformer.transform(train_data_features)
test_tfidf=tfidf_transformer.transform(test_data_features)

from sklearn.svm import SVC, LinearSVC
linear_svc = LinearSVC()
linear_svc.fit(messages_tfidf, train['sentiment'])                  # SVM
pred = linear_svc.predict(test_tfidf)

In [12]:
# test the accuracy
acc_linear_svc = round(linear_svc.score(messages_tfidf, train['sentiment']) * 100, 2)
acc_linear_svc

91.59

In [13]:
final_result = pd.DataFrame( data={"id":test["id"], "sentiment":pred})
final_result.to_csv('output', index=False, quoting=3)