# Importing various libraries

In [8]:
%matplotlib inline  
import pandas as pd  
import numpy as np  
from textblob import TextBlob  
import matplotlib as mpl  
import matplotlib.pyplot as plt  
import csv  
import _pickle as cPickle  
from scipy.io import loadmat  
from sklearn.svm import SVC  
import seaborn as sns  
sns.set_context('notebook')  
sns.set_style('white')  

import nltk  
from nltk.corpus import stopwords  
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer  
from sklearn.model_selection import train_test_split  # Ganti cross_validation  
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_auc_score  
from sklearn.pipeline import Pipeline  
from sklearn.model_selection import GridSearchCV  # Ganti grid_search  
from sklearn.model_selection import StratifiedKFold, cross_val_score  # Ganti cross_validation  
from sklearn.tree import DecisionTreeClassifier   
from sklearn.model_selection import learning_curve  # Ganti learning_curve  
from sklearn.naive_bayes import MultinomialNB

In [9]:
df=pd.read_csv("Training.txt",sep="\t", names=['liked','text'],encoding="utf-8");
df.head(3)

Unnamed: 0,liked,text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.


This dataset is downloaded from https://www.kaggle.com/c/si650winter11/data
this is a TSV ("tab separated values") file, where the first column is a label saying whether the given review
is positive or negative. The second column is the review itself.
Data is tab separeted and therefore "\t" is passed as separator parameter to function.

In [10]:
print(len(df))

6918


Total no of reviews.

In [11]:
df.groupby('liked').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
liked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2975,559,I hate Harry Potter.,85
1,3943,732,I love Harry Potter.,167


# Data preprocessing

In [15]:
def tokens(review):
    return TextBlob(review).words

In [22]:
def advanced_tokens(text):  
    # Preprocessing lebih komprehensif  
    text = text.lower()  
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Hapus karakter non-alfabet  
    
    tokens = word_tokenize(text)  
    tokens = [token for token in tokens if len(token) > 2]  # Hapus token pendek  
    
    return tokens

Function tokens() is created to parse data/review into words.

In [29]:
TextBlob("ready was not a good movie").tags
#nltk.help.upenn_tagset('JJ')

MissingCorpusError: 
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.


In [25]:
import nltk  
import ssl  

try:  
    _create_unverified_https_context = ssl._create_unverified_context  
except AttributeError:  
    pass  
else:  
    ssl._create_default_https_context = _create_unverified_https_context  

# Download semua data yang diperlukan  
nltk.download('punkt')  
nltk.download('stopwords')  
nltk.download('averaged_perceptron_tagger')  
nltk.download('universal_tagset')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rafli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rafli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rafli\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\rafli\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


True

.tags is inbuilt function provided by TextBlob it is used to assign part of speech tags to the words in text.
It gives list of (word, POS) pairs.
To check meaning of particular tag nltk.help.upenn_tagset('tagname') can be used eg. nltk.help.upenn_tagset('JJ')

In [79]:
def to_lemmas(review):
    wordss = TextBlob(review.lower()).words
    # for each word, take its "base form" = lemma 
    return [word.lemma for word in wordss]

df.text.head().apply(to_lemmas)

0                     [india, is, developing, country]
1      [the, da, vinci, code, book, is, just, awesome]
2    [this, wa, the, first, clive, cussler, i, 've,...
3             [i, liked, the, da, vinci, code, a, lot]
4             [i, liked, the, da, vinci, code, a, lot]
Name: text, dtype: object

Lemmatization is one of the important satge of data preprocessing in this step words are converted to their lemma(base form). For example "octopi" is converted to "octopus". similar method is stemming.

NLTK also provide very powerful lemmatizer which make use of WORDNET eg.

In [80]:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('octopi')
#nltk

'octopus'

# Converting text data into vectors 

In [81]:
bow_transformer = CountVectorizer(analyzer=to_lemmas).fit(df['text'])
print(len(bow_transformer.vocabulary_))

2122


Sklearn offers Countvectorizer which counts the frequency of particular word in document. This assigns a unique number to every word in collection.

In [82]:
review1=df['text'][3]
print(review1)
#to check 3rd document/review in collection/database

i liked the Da Vinci Code a lot.


In [83]:
bow=bow_transformer.transform([review1])
print(bow)
bow.shape

  (0, 42)	1
  (0, 372)	1
  (0, 461)	1
  (0, 955)	1
  (0, 1127)	1
  (0, 1156)	1
  (0, 1844)	1
  (0, 1983)	1


(1, 2122)

Countvectorizer creates sparse matrix.

In [84]:
print(bow_transformer.get_feature_names()[372])
#to check 372nd word in collection

code


In [85]:
review_bow = bow_transformer.transform(df['text'])
print( 'sparse matrix shape:', review_bow.shape)
print('number of non-zeros:', review_bow.nnz) #learn this
print( 'sparsity: %.2f%%' % (100.0 * review_bow.nnz))

sparse matrix shape: (6931, 2122)
number of non-zeros: 71287
sparsity: 7128700.00%


shape of sparse matrix n*m where n are total documents and m are total unique words.

CountVectorizer just count the frequency of word in that document. But many time few words such as(the, or) occurs a lot of time in collection which really don't contribute in deciding the polarity of particular document so to nullify their effect special weighting method is to be used. 

# Tf-idf Vectorizer

Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.

The goal of using tf-idf instead of the just CountVectorizer in a given document is to scale down the impact of tokens that occur very frequently in a given corpus(which are less informative) than tokens which occur few times.

Tf-idf(d,t)=tf(d,t)*idf(d,t);

where tf(d,t) is term frequency which states how many times word/token t occur in that doucument devided by total no of words in that document.

and idf(d,t)=log[n/(df(d,t))] i.e. total no of documents divided by no of documents containing that word/token t.

In [89]:
tfidf_transformer =TfidfTransformer().fit(review_bow)
review_tfidf = tfidf_transformer.transform(review_bow)
review_tfidf.shape

(6931, 2122)

Count vectorizer gives output as frequency of diiferent words in our corpus this is then passed to transform method of tf-idf_transformer.

This Transform a count matrix to a normalized tf or tf-idf representation



In [90]:
text_train, text_test, liked_train, liked_test = train_test_split(df['text'], df['liked'], test_size=0.2)
print(len(text_train), len(text_test), len(text_train) , len(text_test))


5544 1387 5544 1387


The dataset downloaded is then divided into training data and test data with ratio 0.8 to 0.2

In [91]:
pipeline_svm = Pipeline([
    ('bow', CountVectorizer(analyzer=to_lemmas)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),
])

Pipeline sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods. The final estimator only needs to implement fit.

The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.

In [92]:
# pipeline parameters to automatically explore and tune
param_svm = [
  {'classifier__C': [1, 10, 100, 1000], 'classifier__kernel': ['linear']},
  {'classifier__C': [1, 10, 100, 1000], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
]


SVM

In [93]:
grid_svm = GridSearchCV(
    pipeline_svm, #object used to fit the data
    param_grid=param_svm, 
    refit=True,  # fit using all data, on the best detected classifier
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores" i.e. to run on all CPUs
    scoring='accuracy',#optimizing parameter
    cv=StratifiedKFold(liked_train, n_folds=5),
)

Exhaustive search over specified parameter values for an estimator.

CV stands for cross validations. Learning the parameters of a prediction function and testing it on the same data is a methodological mistake, it will always give 100% accuracy and therefore training and testing data must be different. Cross validation is idea of dividing training data into k folds i.e. k subset. The following procedure is followed for each of the k “folds”:

A model is trained using k-1 of the folds as training data;

The resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).

  

In [94]:
%time classifier = grid_svm.fit(text_train, liked_train) # find the best combination from param_svm
print(classifier.grid_scores_)

CPU times: user 2.11 s, sys: 46.9 ms, total: 2.16 s
Wall time: 1min 7s
[mean: 0.99170, std: 0.00367, params: {'classifier__C': 1, 'classifier__kernel': 'linear'}, mean: 0.99170, std: 0.00367, params: {'classifier__C': 10, 'classifier__kernel': 'linear'}, mean: 0.99170, std: 0.00367, params: {'classifier__C': 100, 'classifier__kernel': 'linear'}, mean: 0.99170, std: 0.00367, params: {'classifier__C': 1000, 'classifier__kernel': 'linear'}, mean: 0.56385, std: 0.00023, params: {'classifier__C': 1, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}, mean: 0.56385, std: 0.00023, params: {'classifier__C': 1, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}, mean: 0.97385, std: 0.00514, params: {'classifier__C': 10, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}, mean: 0.56385, std: 0.00023, params: {'classifier__C': 10, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}, mean: 0.98900, std: 0.00293, params: {'classifier__C': 100, 'classifier__gamma': 0.001, 'c

In [95]:
print(classification_report(liked_test, classifier.predict(text_test)))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99       557
          1       0.99      0.99      0.99       830

avg / total       0.99      0.99      0.99      1387



In [96]:
print(classifier.predict(["the vinci code is awesome"])[0])

1


In [97]:
print(classifier.predict(["the vinci code is bad"])[0])

0


In [98]:
def gaussKernel(x1, x2, sigma):
    ss=np.power(sigma,2)
    norm= (x1-x2).T.dot(x1-x2)
    return np.exp(-norm/(2*ss))
x1 = np.array([1, 2, 1])
x2 = np.array([0, 4, -1])
sigma = 2
gaussKernel(x1,x2,sigma)

0.32465246735834974