In [1]:
# PRELIMINARIES

In [2]:
'''

link: https://www.kaggle.com/sid321axn/amazon-alexa-reviews/home

This dataset consists of a nearly 3000 Amazon customer reviews 
(input text), star ratings, date of review, variant and feedback
of various amazon Alexa products like Alexa Echo, Echo dots, 
Alexa Firesticks etc. for learning how to train Machine for 
sentiment analysis.

You can use this data to analyze Amazon’s Alexa product;
discover insights into consumer reviews and assist with machine learning
models.You can also train your machine models for sentiment analysis and
analyze customer reviews how many positive reviews ? 
and how many negative reviews ?

Extracted from Amazon's website
'''

"\n\nlink: https://www.kaggle.com/sid321axn/amazon-alexa-reviews/home\n\nThis dataset consists of a nearly 3000 Amazon customer reviews \n(input text), star ratings, date of review, variant and feedback\nof various amazon Alexa products like Alexa Echo, Echo dots, \nAlexa Firesticks etc. for learning how to train Machine for \nsentiment analysis.\n\nYou can use this data to analyze Amazon’s Alexa product;\ndiscover insights into consumer reviews and assist with machine learning\nmodels.You can also train your machine models for sentiment analysis and\nanalyze customer reviews how many positive reviews ? \nand how many negative reviews ?\n\nExtracted from Amazon's website\n"

In [3]:
import pandas as pd, numpy as np
PATH = "../input/amazon_alexa.tsv"
raw_data = pd.read_csv(PATH, sep = '\t')

In [4]:
pd.set_option('display.max_colwidth', -1)
raw_data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",1
3,5,31-Jul-18,Charcoal Fabric,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [5]:
raw_data.rating.value_counts()

5    2286
4    455 
1    161 
3    152 
2    96  
Name: rating, dtype: int64

In [6]:
# Remove Rating = 3 categories as they are likely to confuse the model. 
raw_data['response'] = 0
raw_data.loc[raw_data['rating'].isin([4,5]), 'response'] = 1
raw_data.loc[raw_data['rating'] == 3, 'response'] = 2
bad_reviews = raw_data[raw_data['response'] == 0]
good_reviews = raw_data[raw_data['response'] == 1]
all_reviews = good_reviews.append(bad_reviews)

In [7]:
data = all_reviews[['response', 'verified_reviews']]
data.columns = ['response', 'text']

In [8]:
# EXPLORATORY DATA ANALYSIS

In [9]:
pd.set_option('display.max_colwidth', -1)
data.head()

Unnamed: 0,response,text
0,1,Love my Echo!
1,1,Loved it!
2,1,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home."
3,1,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well."
4,1,Music


In [10]:
data.shape

(2998, 2)

In [11]:
# Event Rate
data.response.value_counts()

1    2741
0    257 
Name: response, dtype: int64

In [12]:
# Check for Nulls
data.isnull().sum()

response    0
text        0
dtype: int64

In [13]:
# Check Data Types
data['text'].astype('str')
data.dtypes

response    int64 
text        object
dtype: object

In [14]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
# SYNTACTICAL FEATURES (PHYSICAL DESCRIPTIONS)

In [16]:
# 1. Size Measurements

# no. of characters (also text size)
data['char_cnt'] = data['text'].str.len()

# no. of words
data['word_cnt'] = data['text'].apply(lambda x: len(str(x).split()))

# no. of sentences
data['sentence_cnt'] = data['text'].apply(lambda x: len(str(x).split(". ")))

data[['text', 'char_cnt', 'word_cnt', 'sentence_cnt']].head()

Unnamed: 0,text,char_cnt,word_cnt,sentence_cnt
0,Love my Echo!,13,3,1
1,Loved it!,9,2,1
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",195,38,2
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",172,34,3
4,Music,5,1,1


In [17]:
# 2. Derived Ratios

# avg. word size
data['avg_word_size'] = data['char_cnt']/data['word_cnt']

# avg. sentence size
data['avg_char_per_sent'] = data['char_cnt']/data['sentence_cnt']

# avg. words per sentence
data['avg_word_per_sent'] = data['word_cnt']/data['sentence_cnt']

data[['text','avg_word_size','avg_char_per_sent', 'avg_word_per_sent']].head()

Unnamed: 0,text,avg_word_size,avg_char_per_sent,avg_word_per_sent
0,Love my Echo!,4.333333,13.0,3.0
1,Loved it!,4.5,9.0,2.0
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",5.131579,97.5,19.0
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",5.058824,57.333333,11.333333
4,Music,5.0,5.0,1.0


In [18]:
# 3. Stopwords/filler Words

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# no. of stopwords
data['stop_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if x in stop]))

# no. of stopwords in every sentence
data['avg_stop_per_sent'] = data['stop_cnt']/data['sentence_cnt']

# no. of stopwords to total words
data['avg_stop_per_word'] = data['stop_cnt']/data['word_cnt']

data[['text','stop_cnt', 'avg_stop_per_sent', 'avg_stop_per_word']].head()

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


Unnamed: 0,text,stop_cnt,avg_stop_per_sent,avg_stop_per_word
0,Love my Echo!,1,1.0,0.333333
1,Loved it!,0,0.0,0.0
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",19,9.5,0.5
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",12,4.0,0.352941
4,Music,0,0.0,0.0


In [19]:
# 4. Counts of Key Characters/Words

# no. of hashtags
data['hash_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))

# no. of @tags
data['tag_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if x.startswith('@')]))

# no. of exclamations!
data['excl_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if x.endswith('!')]))

# no. of questions? 
data['ques_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if x.endswith('?')]))

# no. of numeric chars
data['num_cnt'] = data['text'].apply(lambda x: sum(i.isdigit() for i in x))

# no. of uppercase words (SHOUTING?)
data['upper_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))

data[['text','hash_cnt', 'tag_cnt', 'excl_cnt','ques_cnt', 'num_cnt', 'upper_cnt']].head()

Unnamed: 0,text,hash_cnt,tag_cnt,excl_cnt,ques_cnt,num_cnt,upper_cnt
0,Love my Echo!,0,0,1,0,0,0
1,Loved it!,0,0,1,0,0,0
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",0,0,0,0,0,1
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",0,0,0,0,1,1
4,Music,0,0,0,0,0,0


In [20]:
# 5. Counts of Parts of Speech (POS Counts)

# tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag
data['word_tokens'] = data['text'].apply(word_tokenize)

# number of nouns
data['noun_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("NN", "NNS", "NNP", "NNPS")]))
data['proper_noun_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("NNP", "NNPS")]))

# number of pronouns
data['pronoun_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("PRP", "PRP$")]))
data['wh_pronoun_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("WP", "WP$")]))
data['pronoun_tot_cnt'] = data['pronoun_cnt'] + data['wh_pronoun_cnt']

# number of adjectives
data['adj_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("JJ","JJR", "JJS")]))

# number of verbs
data['verb_past_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("VBP", "VBZ","VBG")]))
data['verb_present_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("VBD", "VBN")]))
data['verb_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("VB")]))
data['verb_tot_cnt'] =  data['verb_past_cnt'] + data['verb_present_cnt'] + data['verb_cnt']

# number of adverbs
data['adverb_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("RB", "RBR", "RBS")]))

# number of modals
data['modal_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("MD")]))

# number of foreign words
data['foreign_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("FW")]))

# number of determiners
data['det_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("DET")]))

# number of conjunctions
data['cc_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("CC")]))
data['in_cnt'] = data['word_tokens'].apply(lambda x: len([i for i in pos_tag(x) if i[1] in ("IN")]))
data['ccin_cnt'] = data['cc_cnt'] + data['in_cnt']

data[['text','noun_cnt', 'proper_noun_cnt', 
                'pronoun_cnt','wh_pronoun_cnt','pronoun_tot_cnt',
                'adj_cnt', 'adverb_cnt','foreign_cnt','det_cnt', 'modal_cnt',
                'verb_cnt', 'verb_past_cnt', 'verb_present_cnt','verb_tot_cnt'
                ,'cc_cnt', 'in_cnt', 'ccin_cnt']].head()

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno -3] Temporary failure in name resolution>


Unnamed: 0,text,noun_cnt,proper_noun_cnt,pronoun_cnt,wh_pronoun_cnt,pronoun_tot_cnt,adj_cnt,adverb_cnt,foreign_cnt,det_cnt,modal_cnt,verb_cnt,verb_past_cnt,verb_present_cnt,verb_tot_cnt,cc_cnt,in_cnt,ccin_cnt
0,Love my Echo!,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0
1,Loved it!,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",5,1,5,0,5,3,3,0,0,1,2,4,1,7,3,7,10
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",12,1,2,0,2,2,3,0,0,0,1,3,1,5,1,4,5
4,Music,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# Prepare the Data

features = ['response','char_cnt', 'word_cnt', 'sentence_cnt', 'avg_word_size','avg_char_per_sent',
                  'avg_word_per_sent', 'stop_cnt', 'avg_stop_per_sent', 'avg_stop_per_word',
                  'hash_cnt', 'tag_cnt', 'excl_cnt','ques_cnt', 'num_cnt', 'upper_cnt',
           'noun_cnt', 'proper_noun_cnt', 
                'pronoun_cnt','wh_pronoun_cnt','pronoun_tot_cnt',
                'adj_cnt', 'adverb_cnt','foreign_cnt','det_cnt', 'modal_cnt',
                'verb_cnt', 'verb_past_cnt', 'verb_present_cnt','verb_tot_cnt'
                ,'cc_cnt', 'in_cnt', 'ccin_cnt']
data_temp = data[features]
data_temp = data_temp.dropna()
data_temp.head()

Unnamed: 0,response,char_cnt,word_cnt,sentence_cnt,avg_word_size,avg_char_per_sent,avg_word_per_sent,stop_cnt,avg_stop_per_sent,avg_stop_per_word,hash_cnt,tag_cnt,excl_cnt,ques_cnt,num_cnt,upper_cnt,noun_cnt,proper_noun_cnt,pronoun_cnt,wh_pronoun_cnt,pronoun_tot_cnt,adj_cnt,adverb_cnt,foreign_cnt,det_cnt,modal_cnt,verb_cnt,verb_past_cnt,verb_present_cnt,verb_tot_cnt,cc_cnt,in_cnt,ccin_cnt
0,1,13,3,1,4.333333,13.0,3.0,1,1.0,0.333333,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0
1,1,9,2,1,4.5,9.0,2.0,0,0.0,0.0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0
2,1,195,38,2,5.131579,97.5,19.0,19,9.5,0.5,0,0,0,0,0,1,5,1,5,0,5,3,3,0,0,1,2,4,1,7,3,7,10
3,1,172,34,3,5.058824,57.333333,11.333333,12,4.0,0.352941,0,0,0,0,1,1,12,1,2,0,2,2,3,0,0,0,1,3,1,5,1,4,5
4,1,5,1,1,5.0,5.0,1.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [22]:
data_temp.shape

(2931, 33)

In [23]:
# Train Test Split

X = data_temp.drop('response', axis = 1)
y = data_temp['response'].astype('int')

from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state=19)

In [24]:
!pip install catboost



In [25]:
from catboost import CatBoostClassifier, Pool

train_pool = Pool(X_train, y_train, cat_features = np.where(X.dtypes == 'object')[0])
test_pool = Pool(X_test, y_test, cat_features = np.where(X.dtypes == 'object')[0])
model = CatBoostClassifier(random_state = 1, eval_metric='AUC', use_best_model = True, verbose = 200)
model.fit(train_pool, eval_set = test_pool)

Learning rate set to 0.084141
0:	test: 0.5352481	best: 0.5352481 (0)	total: 62.1ms	remaining: 1m 2s
200:	test: 0.7567008	best: 0.7581606 (198)	total: 2.47s	remaining: 9.84s
400:	test: 0.7483322	best: 0.7615725 (297)	total: 5.1s	remaining: 7.62s
600:	test: 0.7474495	best: 0.7615725 (297)	total: 7.75s	remaining: 5.15s
800:	test: 0.7406936	best: 0.7615725 (297)	total: 10.4s	remaining: 2.58s
999:	test: 0.7442583	best: 0.7615725 (297)	total: 13s	remaining: 0us

bestTest = 0.7615725416
bestIteration = 297

Shrink model to first 298 iterations.


<catboost.core.CatBoostClassifier at 0x7f2439018be0>

In [26]:
#CAT FEATURE IMPORTANCE

feature_importance = model.get_feature_importance(train_pool)
feature_names = X_train.columns
feature_imp = pd.DataFrame([feature_names, feature_importance])
final = feature_imp.transpose()
final.sort_values(by = 1, ascending = False, inplace = True)
pd.set_option('display.max_colwidth', -1)
final.head(10)

Unnamed: 0,0,1
11,excl_cnt,13.1978
0,char_cnt,6.38445
3,avg_word_size,6.23426
28,verb_tot_cnt,6.12428
21,adverb_cnt,5.95411
8,avg_stop_per_word,5.25004
4,avg_char_per_sent,5.22336
25,verb_cnt,4.63917
7,avg_stop_per_sent,4.25478
27,verb_present_cnt,4.0184


In [27]:
# RESULTS
probs = model.predict_proba(test_pool)
pred = np.where(probs[:,1] > 0.9, 1, 0)

print('Predicted Class and Probabilities: \n')
print(pred[:5]) # predicted class
print(probs[:5]) # probability scores

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
print('\nAccuracy: ', str(accuracy_score(y_test, pred)))
print('Precision: ', str(precision_score(y_test, pred)))
print('Recall: ', str(recall_score(y_test, pred)))
print('F1: ', str(f1_score(y_test, pred)))
print('Area under ROC Curve: ', str(roc_auc_score(y_test, probs[:,1])))
print('GINI: ', str(-1 + 2*roc_auc_score(y_test, probs[:,1])))

tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

print('\nTrue Negatives: ', str(tn))
print('True Positives: ', str(tp))
print('False Negatives: ', str(fn))
print('False Positives: ', str(fp))

print('\nTotal Reviews: ', str(tn+fp+fn+tp))
print('Reviews Predicted as Negative: ', str(fn+tn))
print('Total Negative Reviews in Actuality: ', str(fp+tn))
print('Negative Reviews that were Correctly Predicted: ', str(tn))

print("NOT GOOD ENOUGH! HIGH TRADEOFF INVOLVED IN CATCHING NEGATIVE REVIEWS")

Predicted Class and Probabilities: 

[1 1 1 1 1]
[[0.00231324 0.99768676]
 [0.03048511 0.96951489]
 [0.04260398 0.95739602]
 [0.07536381 0.92463619]
 [0.06293744 0.93706256]]

Accuracy:  0.8102272727272727
Precision:  0.9507042253521126
Recall:  0.8364312267657993
F1:  0.8899143045484509
Area under ROC Curve:  0.7615725416305952
GINI:  0.5231450832611904

True Negatives:  38
True Positives:  675
False Negatives:  132
False Positives:  35

Total Reviews:  880
Reviews Predicted as Negative:  170
Total Negative Reviews in Actuality:  73
Negative Reviews that were Correctly Predicted:  38
NOT GOOD ENOUGH! HIGH TRADEOFF INVOLVED IN CATCHING NEGATIVE REVIEWS


In [28]:
# PRE PROCESSING FOR ADVANCED FEATURES

In [29]:
# lowercase all
data['text_clean_v1'] = data['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data[['text', 'text_clean_v1']].head()

Unnamed: 0,text,text_clean_v1
0,Love my Echo!,love my echo!
1,Loved it!,loved it!
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.","sometimes while playing a game, you can answer a question correctly but alexa says you got it wrong and answers the same as you. i like being able to turn lights on and off while away from home."
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.","i have had a lot of fun with this thing. my 4 yr old learns about dinosaurs, i control the lights and play games like categories. has nice sound when playing music as well."
4,Music,music


In [30]:
# remove punctuation
data['text_clean_v2'] = data['text_clean_v1'].str.replace('[^\w\s]','')
data[['text', 'text_clean_v1', 'text_clean_v2']].head()

Unnamed: 0,text,text_clean_v1,text_clean_v2
0,Love my Echo!,love my echo!,love my echo
1,Loved it!,loved it!,loved it
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.","sometimes while playing a game, you can answer a question correctly but alexa says you got it wrong and answers the same as you. i like being able to turn lights on and off while away from home.",sometimes while playing a game you can answer a question correctly but alexa says you got it wrong and answers the same as you i like being able to turn lights on and off while away from home
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.","i have had a lot of fun with this thing. my 4 yr old learns about dinosaurs, i control the lights and play games like categories. has nice sound when playing music as well.",i have had a lot of fun with this thing my 4 yr old learns about dinosaurs i control the lights and play games like categories has nice sound when playing music as well
4,Music,music,music


In [31]:
# remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

data['text_clean_v3'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data[['text','text_clean_v3']].head()

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


Unnamed: 0,text,text_clean_v3
0,Love my Echo!,Love Echo!
1,Loved it!,Loved it!
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.","Sometimes playing game, answer question correctly Alexa says got wrong answers you. I like able turn lights away home."
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.","I lot fun thing. My 4 yr old learns dinosaurs, control lights play games like categories. Has nice sound playing music well."
4,Music,Music


In [32]:
'''
# correct spelling (takes time)
from textblob import TextBlob
data['text_clean_v4'] = data['text_clean_v3'].apply(lambda x: str(TextBlob(x).correct()))
data[['text_clean_v3', 'text_clean_v4']].head()
'''

"\n# correct spelling (takes time)\nfrom textblob import TextBlob\ndata['text_clean_v4'] = data['text_clean_v3'].apply(lambda x: str(TextBlob(x).correct()))\ndata[['text_clean_v3', 'text_clean_v4']].head()\n"

In [33]:
# remove rare words
rare_words = pd.Series(' '.join(data['text_clean_v3']).split()).value_counts()[-2500:]
rare_words.head() 
rare_words_list = list(rare_words.index)
data['text_clean_v5'] = data['text_clean_v3'].apply(lambda x: " ".join(x for x in x.split() if x not in rare_words_list))

In [34]:
# stemming
#import nltk
nltk.download('stem')
from nltk.stem import PorterStemmer
from textblob import Word
data['text_clean_v6'] = data['text_clean_v5'].apply(lambda x: " ".join([PorterStemmer().stem(word) for word in x.split()]))
data[['text', 'text_clean_v6']].head()

[nltk_data] Error loading stem: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


Unnamed: 0,text,text_clean_v6
0,Love my Echo!,love echo!
1,Loved it!,love it!
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",sometim play answer question correctli alexa say got wrong answer you. I like abl turn light away home.
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.","I lot fun thing. My 4 yr old learn dinosaurs, control light play game like categories. ha nice sound play music well."
4,Music,music


In [35]:
# lemmatization
import nltk
nltk.download('wordnet')
from textblob import Word
data['text_clean_v6'] = data['text_clean_v3'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data[['text', 'text_clean_v6']].head()

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


Unnamed: 0,text,text_clean_v6
0,Love my Echo!,Love Echo!
1,Loved it!,Loved it!
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.","Sometimes playing game, answer question correctly Alexa say got wrong answer you. I like able turn light away home."
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.","I lot fun thing. My 4 yr old learns dinosaurs, control light play game like categories. Has nice sound playing music well."
4,Music,Music


In [36]:
# to check the effects of text processing just undo it and proceed, then compare result
# TLDR: it doesn't help much!
data['text_clean_v6'] = data['text']

In [37]:
# ADVANCED FEATURE ENGINEERING

In [38]:
# Physical Features

# no. of characters (also text size)
data['char_cnt'] = data['text_clean_v6'].str.len()

# no. of words
data['word_cnt'] = data['text_clean_v6'].apply(lambda x: len(str(x).split()))

# no. of sentences
data['sentence_cnt'] = data['text_clean_v6'].apply(lambda x: len(str(x).split(". ")))

# avg. word size
data['avg_word_size'] = data['char_cnt']/data['word_cnt']

# avg. sentence size
data['avg_char_per_sent'] = data['char_cnt']/data['sentence_cnt']

# avg. words per sentence
data['avg_word_per_sent'] = data['word_cnt']/data['sentence_cnt']

In [39]:
# TERM FREQUENCY/WORD COUNTS

In [40]:
# find common words
common_words = pd.Series(' '.join(data['text_clean_v6']).split()).value_counts()[0:100]
common_words.head(10)

I       2637
the     2629
to      2554
and     2069
it      1548
a       1323
my      1199
is      1078
for     937 
with    717 
dtype: int64

In [41]:
# Counts of Negative/Positive Words

negative_words = ['bad', 'horrible', 'sad','wrong','no','worst','worse',
                  'not', 'terrible', 'okay', 'sorrow', 'kill', 'negative', 'empty',
                 'hollow', 'poor', 'upset', 'why', 'unfair', 'eliminate','not',
                  'never', 'hate', 'dislike'] 

positive_words = ['good', 'great', 'awesome', 'happy', 'joy', 'enjoy', 'use', 'useful',
                  'wonder', 'wonderful', 'love', 'like', 'amazing',
                  'thanks', 'thank','hey', 'haha','nice', 'cool', 'lol',
                 'right', 'yeah', 'fun', 'well', 'enjoyable', 'crazy', 'super', 'kickass']

data['neg_word_cnt'] = data['text_clean_v6'].apply(lambda x: len([x for x in x.split() if x.lower() in negative_words]))
data['pos_word_cnt'] = data['text_clean_v6'].apply(lambda x: len([x for x in x.split() if x.lower() in positive_words]))

# derived ratios
data['neg_word_cnt_ratio1'] = data['neg_word_cnt']/data['word_cnt']
data['neg_word_cnt_ratio2'] = data['neg_word_cnt']/data['sentence_cnt']
data['pos_word_cnt_ratio1'] = data['pos_word_cnt']/data['word_cnt']
data['pos_word_cnt_ratio2'] = data['pos_word_cnt']/data['sentence_cnt']

new_features_2 = ['neg_word_cnt', 'pos_word_cnt','neg_word_cnt_ratio1','neg_word_cnt_ratio2','pos_word_cnt_ratio1','pos_word_cnt_ratio2']

data[['text_clean_v6'] + new_features_2].head()

Unnamed: 0,text_clean_v6,neg_word_cnt,pos_word_cnt,neg_word_cnt_ratio1,neg_word_cnt_ratio2,pos_word_cnt_ratio1,pos_word_cnt_ratio2
0,Love my Echo!,0,1,0.0,0.0,0.333333,1.0
1,Loved it!,0,0,0.0,0.0,0.0,0.0
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",1,1,0.026316,0.5,0.026316,0.5
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",0,3,0.0,0.0,0.088235,1.0
4,Music,0,0,0.0,0.0,0.0,0.0


In [42]:
# Count Vectors (Single Word)

from sklearn.feature_extraction.text import CountVectorizer
num_features = 50
vectorizer = CountVectorizer(ngram_range=(1,1), 
                            max_features = num_features,
                            max_df=1.0, min_df=0.0)
count_vectors = vectorizer.fit_transform(list(data['text_clean_v6']))


# reshape to pandas
from scipy import sparse
count_vectors_pd = pd.DataFrame(count_vectors.todense())
count_vectors_pd.columns = vectorizer.get_feature_names()
count_vector_features = vectorizer.get_feature_names()
data = pd.concat([data.reset_index(drop=True),count_vectors_pd.reset_index(drop=True)], axis=1)
  

In [43]:
# Count Vectors (2-Gram)

from sklearn.feature_extraction.text import CountVectorizer
num_features = 25
vectorizer = CountVectorizer(ngram_range=(2,2), 
                            max_features = num_features,
                            max_df=1.0, min_df=0.0)
count_vectors = vectorizer.fit_transform(list(data['text_clean_v6']))

# reshape to pandas
from scipy import sparse
count_vectors_pd = pd.DataFrame(count_vectors.todense())
count_vectors_pd.columns = vectorizer.get_feature_names()
count_vector_2gram_features = vectorizer.get_feature_names()
data = pd.concat([data.reset_index(drop=True),count_vectors_pd.reset_index(drop=True)], axis=1)
  

In [44]:
# TFIDF Vectors

from sklearn.feature_extraction.text import TfidfVectorizer
num_features = 50
vectorizer = TfidfVectorizer(ngram_range=(1,1), 
                            max_features = num_features,
                            max_df=1.0, min_df=0.0)
count_vectors = vectorizer.fit_transform(list(data['text_clean_v6']))


# reshape to pandas
from scipy import sparse
count_vectors_pd = pd.DataFrame(count_vectors.todense())
count_vectors_pd.columns = vectorizer.get_feature_names()
tfidf_features = vectorizer.get_feature_names()
data = pd.concat([data.reset_index(drop=True),count_vectors_pd.reset_index(drop=True)], axis=1)
  

In [45]:
#!pip install afinn

In [46]:
'''
from afinn import Afinn

def Afinn_apply(var):
    afinn = Afinn(emoticons=True)
    return afinn.score(var)

data['Afinn'] = data['text_clean_v6'].apply(Afinn_apply)
data[['text_clean_v6', 'Afinn']].head()
'''

"\nfrom afinn import Afinn\n\ndef Afinn_apply(var):\n    afinn = Afinn(emoticons=True)\n    return afinn.score(var)\n\ndata['Afinn'] = data['text_clean_v6'].apply(Afinn_apply)\ndata[['text_clean_v6', 'Afinn']].head()\n"

In [47]:
 from textblob import TextBlob

def TextBlobPolarity(var):
  testimonial = TextBlob(var)
  return testimonial.sentiment.polarity

def TextBlobSubjectivity(var):
  testimonial = TextBlob(var)
  return testimonial.sentiment.subjectivity

data['TextBlobSubjectivity'] = data['text_clean_v6'].apply(TextBlobSubjectivity)
data['TextBlobPolarity'] = data['text_clean_v6'].apply(TextBlobPolarity)
data['TextBlobSubPol_Interaction'] = data['TextBlobSubjectivity']*data['TextBlobPolarity']
data[['text_clean_v6', 'TextBlobPolarity', 'TextBlobSubjectivity']].head()

Unnamed: 0,text_clean_v6,TextBlobPolarity,TextBlobSubjectivity
0,Love my Echo!,0.625,0.6
1,Loved it!,0.875,0.8
2,"Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you. I like being able to turn lights on and off while away from home.",-0.1,0.5125
3,"I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.",0.35,0.45
4,Music,0.0,0.0


In [48]:
# Prepare the Data

data_temp = data[['response','char_cnt', 'word_cnt', 'sentence_cnt', 'avg_word_size','avg_char_per_sent',
                  'avg_word_per_sent','TextBlobSubPol_Interaction',
                 'TextBlobPolarity', 'TextBlobSubjectivity', 
                  'noun_cnt', 'proper_noun_cnt', 
                'pronoun_cnt','wh_pronoun_cnt','pronoun_tot_cnt',
                'adj_cnt', 'adverb_cnt','foreign_cnt','det_cnt', 'modal_cnt',
                'verb_cnt', 'verb_past_cnt', 'verb_present_cnt','verb_tot_cnt'
                ,'cc_cnt', 'in_cnt', 'ccin_cnt'] 
                 + count_vector_features
                 + tfidf_features
                 + count_vector_2gram_features
                 + new_features_2]
data_temp = data_temp.dropna()
data_temp.drop_duplicates(keep = 'first',inplace = True)
data_temp.head()

Unnamed: 0,response,char_cnt,word_cnt,sentence_cnt,avg_word_size,avg_char_per_sent,avg_word_per_sent,TextBlobSubPol_Interaction,TextBlobPolarity,TextBlobSubjectivity,noun_cnt,proper_noun_cnt,pronoun_cnt,wh_pronoun_cnt,pronoun_tot_cnt,adj_cnt,adverb_cnt,foreign_cnt,det_cnt,modal_cnt,verb_cnt,verb_past_cnt,verb_present_cnt,verb_tot_cnt,cc_cnt,in_cnt,ccin_cnt,alexa,alexa.1,all,all.1,amazon,amazon.1,an,an.1,and,and.1,are,are.1,as,...,we,when,when.1,with,with.1,works,works.1,you,you.1,all the,and it,easy to,echo dot,for my,for the,in the,it is,it to,it was,love it,love the,my echo,of the,set up,so far,the echo,this is,to my,to set,to use,use it,with the,works great,you can,neg_word_cnt,pos_word_cnt,neg_word_cnt_ratio1,neg_word_cnt_ratio2,pos_word_cnt_ratio1,pos_word_cnt_ratio2
0,1,13,3,1,4.333333,13.0,3.0,0.375,0.625,0.6,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.333333,1.0
1,1,9,2,1,4.5,9.0,2.0,0.7,0.875,0.8,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0
2,1,195,38,2,5.131579,97.5,19.0,-0.05125,-0.1,0.5125,5,1,5,0,5,3,3,0,0,1,2,4,1,7,3,7,10,1,0.217605,0,0.0,0,0.0,0,0.0,2,0.287917,0,0.0,1,...,0.0,0,0.0,0,0.0,0,0.0,3,0.746085,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0.026316,0.5,0.026316,0.5
3,1,172,34,3,5.058824,57.333333,11.333333,0.1575,0.35,0.45,12,1,2,0,2,2,3,0,0,0,1,3,1,5,1,4,5,0,0.0,0,0.0,0,0.0,0,0.0,1,0.197581,0,0.0,1,...,0.0,1,0.373482,1,0.278209,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0.0,0.0,0.088235,1.0
4,1,5,1,1,5.0,5.0,1.0,0.0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [49]:
# Train Test Split

X = data_temp.drop('response', axis = 1)
y = data_temp['response'].astype('int')

from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state=20)

In [50]:
from catboost import CatBoostClassifier, Pool

train_pool = Pool(X_train, y_train, cat_features = np.where(X.dtypes == 'object')[0])
test_pool = Pool(X_test, y_test, cat_features = np.where(X.dtypes == 'object')[0])
model = CatBoostClassifier(random_state = 1, eval_metric='AUC', use_best_model = True, verbose = 200, class_weights = [1,2])
model.fit(train_pool, eval_set = test_pool)

Learning rate set to 0.081569
0:	test: 0.6613855	best: 0.6613855 (0)	total: 78ms	remaining: 1m 17s
200:	test: 0.8912582	best: 0.8936245 (71)	total: 10s	remaining: 39.8s
400:	test: 0.8940597	best: 0.8962901 (269)	total: 19.9s	remaining: 29.8s
600:	test: 0.8928902	best: 0.8962901 (269)	total: 29.9s	remaining: 19.9s
800:	test: 0.8920198	best: 0.8962901 (269)	total: 39.9s	remaining: 9.92s
999:	test: 0.8932710	best: 0.8962901 (269)	total: 49.8s	remaining: 0us

bestTest = 0.8962900506
bestIteration = 269

Shrink model to first 270 iterations.


<catboost.core.CatBoostClassifier at 0x7f2471f31fd0>

In [51]:
#CAT FEATURE IMPORTANCE

feature_importance = model.get_feature_importance(train_pool)
feature_names = X_train.columns
feature_imp = pd.DataFrame([feature_names, feature_importance])
final = feature_imp.transpose()
final.sort_values(by = 1, ascending = False, inplace = True)
pd.set_option('display.max_colwidth', -1)
final.head(10)

Unnamed: 0,0,1
6,TextBlobSubPol_Interaction,15.0143
7,TextBlobPolarity,7.78906
255,pos_word_cnt_ratio1,5.04876
254,neg_word_cnt_ratio2,3.30832
252,pos_word_cnt,3.2515
253,neg_word_cnt_ratio1,3.06163
8,TextBlobSubjectivity,2.25825
172,love,2.17374
251,neg_word_cnt,1.92211
73,love,1.7192


In [52]:
# RESULTS
probs = model.predict_proba(test_pool)
pred = np.where(probs[:,1] > 0.90, 1, 0)

print('Predicted Class and Probabilities: \n')
print(pred[:5]) # predicted class
print(probs[:5]) # probability scores

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
print('\nAccuracy: ', str(accuracy_score(y_test, pred)))
print('Precision: ', str(precision_score(y_test, pred)))
print('Recall: ', str(recall_score(y_test, pred)))
print('F1: ', str(f1_score(y_test, pred)))
print('Area under ROC Curve: ', str(roc_auc_score(y_test, probs[:,1])))
print('GINI: ', str(-1 + 2*roc_auc_score(y_test, probs[:,1])))

tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

print('\nTrue Negatives: ', str(tn))
print('True Positives: ', str(tp))
print('False Negatives: ', str(fn))
print('False Positives: ', str(fp))

print('\nTotal Reviews: ', str(tn+fp+fn+tp))
print('Reviews Predicted as Negative: ', str(fn+tn))
print('Total Negative Reviews in Actuality: ', str(fp+tn))
print('Negative Reviews that were Correctly Predicted: ', str(tn))

print('''
THIS IS DECENT! 
WE ARE ABLE TO CATCH NEGATIVE REVIEWS WITHOUT LOSING TOO MANY POSITIVE ONES
SENTIMENT ANALYSIS HAS WORKED (SORT OF!)
''')

Predicted Class and Probabilities: 

[1 1 1 1 1]
[[1.92601108e-03 9.98073989e-01]
 [1.02738052e-03 9.98972619e-01]
 [2.74187510e-04 9.99725812e-01]
 [1.52557353e-02 9.84744265e-01]
 [5.55967682e-03 9.94440323e-01]]

Accuracy:  0.8931297709923665
Precision:  0.957968476357268
Recall:  0.9224283305227656
F1:  0.9398625429553266
Area under ROC Curve:  0.8962900505902192
GINI:  0.7925801011804383

True Negatives:  38
True Positives:  547
False Negatives:  46
False Positives:  24

Total Reviews:  655
Reviews Predicted as Negative:  84
Total Negative Reviews in Actuality:  62
Negative Reviews that were Correctly Predicted:  38

THIS IS DECENT! 
WE ARE ABLE TO CATCH NEGATIVE REVIEWS WITHOUT LOSING TOO MANY POSITIVE ONES
SENTIMENT ANALYSIS HAS WORKED (SORT OF!)

