---

_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-text-mining/resources/d9pwm) course resource._

---

*Note: Some of the cells in this notebook are computationally expensive. To reduce runtime, this notebook is using a subset of the data.*

# Recall some knowledge what I have learned

In [None]:
from sklearn import naive_bayes # import naive bayes from sklearn
clfrNB = naive_bayes.MultinomialNB() # creare a bayes classifier (we use multinomianl)
clfrNB.fit(train_data, train_labels) # train the classifier by using training data
predicted_labels = clfrNB.predict(test_data) # predict the test data
# check how well of the trained classifier by comparing actual label and predicted lable
# you can also check it by using other method such as f1_mean and so on
metrics.f1_score(test_labels, predicted_labels, average='micro') 

In [None]:
from sklearn import svm
# SVC is support vector classifier
# the default kernel is RBF, radial basis function
# the default C value is one
clfrSVM = svm.SVC(kernel=‘linear’, C=0.1) 
clfrSVM.fit(train_data, train_labels)
predicted_labels = clfrSVM.predict(test_data)

In [None]:
# model selection
from sklearn import model_selection
# method one: split data into two parts, in this example, one third data is test data
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_data, train_labels, test_size = 0.333, random_state = 0)
# method two: use cross validation method, cv=5 means spilit data into five equal parts, train 5 timesfour 
# every time you trained, four parts in train data set, one part in test data set
# get average of five times
predicted_labels = model_selection.cross_val_predict(clfrSVM, train_data, train_labels, cv=5)

In [None]:
# Natural language training
from nltk.classify import NaiveBayesClassifier
# train the model
classifier = NaiveBayesClassifier.train(train_set)
# classify one unlabel instance
classifier.classify(unlabaled_instance)
# if you have many instance, use classify_many
classifier.classify_many(unlabeled_instances)
# get the accuracy of the performance
nltk.classify.util.accuracy(classifier, test_set)
# tells you all the labels in this classifier that has trained on
classifier.labels()
# give you top features,you can set 5 or 10 top features
classifier.show_most_informative_features()

In [None]:
# you can use sklearn from NLTK
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
clfrNB = SklearnClassifier(MultinomialNB()).train(train_set)
clfrSVM =
SklearnClassifier(SVC(),kernel=‘linear’).train(train_set)

# Case Study: Sentiment Analysis

### Data Prep

In [1]:
import pandas as pd
import numpy as np

# Read in the data
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')

# Sample the data to speed up computation
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [2]:
# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3 (3 is thought neutral rating)
df = df[df['Rating'] != 3]

# Encode rating >3 (4 and 5) as 1 (rated positively)
# Encode rating <3 (1 and 2) as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0,1
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0,1
279878,OtterBox 77-29864 Defender Series Hybrid Case ...,OtterBox,9.99,5,I've bought 3 no problems. Fast delivery.,0.0,1
406017,Verizon HTC Rezound 4G Android Smarphone - 8MP...,HTC,74.99,4,Great phone for the price...,0.0,1
302567,"RCA M1 Unlocked Cell Phone, Dual Sim, 5Mp Came...",RCA,159.99,5,My mom is not good with new technoloy but this...,4.0,1


In [3]:
# Most ratings are positive
df['Positively Rated'].mean()

0.74717766860786672

In [4]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets by using Reviews and Positively rated columns
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

In [5]:
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 Everything about it is awesome!


X_train shape:  (23052,)


# CountVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
# the fitting will find all characters of at least two letters or numbers and also convert everything to lowercase and build a vocabulary
vect = CountVectorizer().fit(X_train)

In [7]:
# get the vocabulary
vect.get_feature_names()[::2000]

['00',
 'arroja',
 'comapañias',
 'dvds',
 'golden',
 'lands',
 'oil',
 'razonable',
 'smallsliver',
 'tweak']

In [8]:
# check the length of vocabulary
len(vect.get_feature_names())

19601

In [9]:
# transform the documents in the training data to a document-term matrix, giving us the bag-of-word representations of X_train
# this document is stored in sparse matrix 
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<23052x19601 sparse matrix of type '<class 'numpy.int64'>'
	with 613289 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.linear_model import LogisticRegression

# Train the model by using X_train_vectorized
# LogisticRegression works well for high dimensional sparse data
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
# Note that any word in X_test that didn't in X_train will be just ingnored
predictions = model.predict(vect.transform(X_test))
# look the AUC score
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.897433277667


In [12]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worst' 'terrible' 'slow' 'junk' 'poor' 'sucks' 'horrible' 'useless'
 'waste' 'disappointed']

Largest Coefs: 
['excelent' 'excelente' 'excellent' 'perfectly' 'love' 'perfect' 'exactly'
 'great' 'best' 'awesome']


# Tfidf
#### Consider a corpus of documents and a dictionary of terms contain all the words that appear in the documents. The term-document matrix then is a two-dimensional matrix whose rows are the terms and columns are the documents, so each entry (i, j) represents the frequency of term i in document j.
#### One example of these matrices is term frequency—inverse document frequency (tf–idf), which is used in information retrieval. For each entry in the matrix, the term frequency measures the number of times that term i appears in document j, and the inverse document frequency measures the number of documents in the corpus which contain term i. The tf-idf score is the product of these two metrics (tf*idf). So an entry's tf-idf score increases when term i appears frequently in document j, but decreases as the term appears in other documents.
#### This tf-idf score is often used in rank documents in search results, ranking highly documents which contain with high frequency the terms in the search query, and especially those terms which are rare/unique words rather than common ones (e.g. "the").
##  tf–idf is zero if the word (for example 'the') appeared in all the documents, which implies that the word is not very informative as it appears in all documents. How to calculate td-idf can be seen in https://en.wikipedia.org/wiki/Tf–idf

In [13]:
# tfidf method allow us to rescale feature
# tfidf method allows us to weight terms based on how important they are to a documents
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
# any documents that less than 5 will be removed, which helps to reduce the number of features
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

5442

In [14]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

# we can see we get similar AUC but use much fewer features
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.889951006492


In [15]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()
# look at the smallest and largest tfidf
print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['61' 'printer' 'approach' 'adjustment' 'consequences' 'length' 'emailing'
 'degrees' 'handsfree' 'chipset']

Largest tfidf: 
['unlocked' 'handy' 'useless' 'cheat' 'up' 'original' 'exelent' 'exelente'
 'exellent' 'satisfied']


In [16]:
sorted_coef_index = model.coef_[0].argsort()
# look at the smallest and largest coefficient
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'slow' 'disappointed' 'worst' 'terrible' 'never' 'return' 'doesn'
 'horrible' 'waste']

Largest Coefs: 
['great' 'love' 'excellent' 'good' 'best' 'perfect' 'price' 'awesome' 'far'
 'perfectly']


In [17]:
# These reviews are treated the same (both are negative) by our current model because word order is disregarded
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]


# n-grams

In [18]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
# ngram can be poweful in capturing the meaning (such as it distinguish 'is issue' and 'not issue')
# but big ngram can cause explosion of the number of features
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

29072

In [19]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
# 2-gram method increase the AUC
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.91106617946


In [20]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()
# we can see 'not good', 'no problem' are captured correctly
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['no good' 'junk' 'poor' 'slow' 'worst' 'broken' 'not good' 'terrible'
 'defective' 'horrible']

Largest Coefs: 
['excellent' 'excelente' 'excelent' 'perfect' 'great' 'love' 'awesome'
 'no problems' 'good' 'best']


In [21]:
# These reviews are now correctly identified
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]
