<a href="https://www.kaggle.com/renanfioramonte/alexa-nlp?scriptVersionId=88591747" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
review_df = pd.read_csv('../input/amazon-alexa-reviews/amazon_alexa.tsv', sep='\t')

### SUMMARY

In [None]:
review_df

In [None]:
review_df.info()

In [None]:
review_df.describe()

## EDA

In [None]:
sns.heatmap(review_df.isnull(), cbar=False)

In [None]:
review_df.isnull().sum()

In [None]:
review_df.hist(bins=30, figsize=(13,5), color='r')

In [None]:
review_df['length'] = review_df['verified_reviews'].apply(len)
review_df.head()

In [None]:
review_df['length'].plot(bins=100, kind='hist')

In [None]:
review_df.length.describe()

In [None]:
review_df[review_df['length'] == 2851]['verified_reviews'].iloc[0]

In [None]:
review_df[review_df['length'] == 1]['verified_reviews'].iloc[0]

#### Creating a df with the POSITIVE reviews

In [None]:
positive = review_df[review_df['feedback'] == 1]

In [None]:
positive.describe()

#### Creating a df with the NEGATIVE reviews

In [None]:
negative = review_df[review_df['feedback'] == 0]

In [None]:
negative.describe()

In [None]:
# Pass the column by keyword: newer seaborn releases no longer accept the
# plotted variable as the first positional argument.
sns.countplot(x = 'feedback', data = review_df)

In [None]:
sns.countplot(x = 'rating', data = review_df)

In [None]:
plt.figure(figsize=(30,15))
sns.barplot(x = 'variation', y = 'rating', data = review_df, palette = 'deep')

In [None]:
sentences = review_df['verified_reviews'].tolist()

In [None]:
## NOTE: to create a word cloud, all the reviews must first be joined into one single string

In [None]:
sentences_as_one_string = ' '.join(sentences)

In [None]:
from wordcloud import WordCloud

In [None]:
plt.figure(figsize=(20,20))
plt.imshow(WordCloud().generate(sentences_as_one_string))

In [None]:
negative_list = negative['verified_reviews'].tolist()
negative_sentences_as_one_string = ' '.join(negative_list)

In [None]:
plt.figure(figsize=(20,20))
plt.imshow(WordCloud().generate(negative_sentences_as_one_string))

### DATA CLEANING

In [None]:
review_df.head()

In [None]:
review_df = review_df.drop(['date', 'rating', 'length'], axis = 1)
review_df.head()

### Dummy variables

In [None]:
review_df['variation'].unique()

In [None]:
X_cat = review_df[['variation']]

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
onehotencoder = OneHotEncoder()

In [None]:
X_cat = onehotencoder.fit_transform(X_cat).toarray()

In [None]:
type(X_cat)

In [None]:
X_cat = pd.DataFrame(X_cat)

In [None]:
review_df.drop(['variation'], axis = 1, inplace=True)

In [None]:
review_df.head()

In [None]:
review_df = pd.concat([review_df, X_cat], axis = 1)
review_df

### REMOVING PUNCTUATION FROM TEXT

In [None]:
import string
string.punctuation

In [None]:
test = "You will feel like Tony Stark on this device. I added quite a few Alexa always loads them up quickly. Adding songs that you hear to specific playlists on Amazon Music is also a great feature.I can go on and on and this is only my second day of ownership.I was lucky to buy this for $100 on Prime Day, but I think for $150 is "

In [None]:
test_punct_removed = [char for char in test if char not in string.punctuation]

In [None]:
print(test_punct_removed)

In [None]:
test_punct_removed = ''.join(test_punct_removed)
test_punct_removed

### REMOVING STOP WORDS

In [None]:
import nltk

In [None]:
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

#### REMOVING STOPWORDS FROM THE DATASET

In [None]:
# Load the stop-word list once and keep it as a set: the original re-read the
# list for every word and tested membership against a list.
english_stop_words = set(stopwords.words('english'))
test_punct_stop_removed = [word for word in test_punct_removed.split() if word.lower() not in english_stop_words]

In [None]:
test_punct_stop_removed

### COUNT VECTORIZER

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
sample_data = ['This is the first document.', 'This document is the second document.', 'And this is the third one.', 'Is this the first document?']

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample_data)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0).
print(vectorizer.get_feature_names_out())

In [None]:
print(X.toarray())

### Cleaning text pipeline

In [None]:
from functools import lru_cache

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def msg_cleaning(message):
    """Tokenize one review, dropping punctuation and English stop words.

    Parameters
    ----------
    message : str
        Raw review text.

    Returns
    -------
    list of str
        The whitespace-separated words of ``message`` (original casing kept)
        after removing punctuation characters and any word whose lower-cased
        form is an English stop word.
    """
    # Single pass over the text instead of a per-character Python loop.
    text = message.translate(_PUNCTUATION_TABLE)
    # The stop-word list is cached: the original reloaded it for every word,
    # which dominates the runtime when this is used as a vectorizer analyzer.
    stop_words = _english_stop_words()
    return [word for word in text.split() if word.lower() not in stop_words]

In [None]:
#review_df_clean = review_df['verified_reviews'].apply(msg_cleaning)

In [None]:
#print(review_df_clean[3])

In [None]:
#print(review_df['verified_reviews'][3])

#### VECTORIZING IN THE NEW DF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=msg_cleaning)
review_count_vect = vectorizer.fit_transform(review_df['verified_reviews'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0).
len(vectorizer.get_feature_names_out())

### APPLYING TO THE ORIGINAL DF

In [None]:
review_df.drop(['verified_reviews'], axis = 1, inplace = True)

In [None]:
review = pd.DataFrame(review_count_vect.toarray())
type(review)

In [None]:
review_df = pd.concat([review_df, review], axis = 1)
review_df

#### CREATING X AND Y

In [None]:
X = review_df.drop(['feedback'], axis = 1)
Y = review_df['feedback']

### TRAINING AND ASSESSING METHOD NAIVE BAYES

In [None]:
from sklearn.model_selection import train_test_split
# Fix the seed so the split, and therefore every reported metric, is
# reproducible between runs; stratify on the label so the train and test
# sets keep the same feedback ratio.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state = 42, stratify = Y)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, Y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
Y_pred = naive_bayes.predict(X_test)

In [None]:
accuracy_score(Y_test, Y_pred)

In [None]:
cm = confusion_matrix(Y_test, Y_pred)
cm

In [None]:
# fmt='d' prints the confusion-matrix counts as plain integers; the default
# annotation format switches to scientific notation for larger counts.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
print(classification_report(Y_test, Y_pred))

### TRAINING AND ASSESSING LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logistic = LogisticRegression()
logistic.fit(X_train, Y_train)

In [None]:
Y_pred = logistic.predict(X_test)

In [None]:
accuracy_score(Y_test, Y_pred)

In [None]:
print(classification_report(Y_test, Y_pred))

### TESTING THE MODEL IN ONE STRING

In [None]:
import pickle

In [None]:
with open('text_classifier_one.pkl', 'wb') as f:
    pickle.dump([logistic, onehotencoder, vectorizer], f)

In [None]:
with open('text_classifier_one.pkl', 'rb') as f:
    logistic, onehot, vect = pickle.load(f)

In [None]:
negative.head()

In [None]:
# Select the two model inputs by name rather than by column position, so the
# lookup does not depend on the column order of the source file.
negative_text = negative.iloc[0:1][['variation', 'verified_reviews']]
negative_text

In [None]:
X_cat = negative_text[['variation']]
X_cat

In [None]:
X_cat = onehot.transform(X_cat).toarray()
X_cat = pd.DataFrame(X_cat)
X_cat

In [None]:
X_cat.index = negative_text.index

In [None]:
X_cat.index, negative_text.index

In [None]:
negative_df = pd.concat([negative_text, X_cat], axis = 1)

In [None]:
negative_df

In [None]:
negative_df.drop(['variation'], axis = 1, inplace=True)

In [None]:
negative_count_vect = vect.transform(negative_df['verified_reviews'])

In [None]:
review = pd.DataFrame(negative_count_vect.toarray())

In [None]:
negative_df.drop(['verified_reviews'], axis = 1, inplace=True)
negative_df

In [None]:
negative_df.index = review.index

In [None]:
negative_final = pd.concat([negative_df, review], axis = 1)

In [None]:
negative_final

In [None]:
logistic.predict(negative_final)

In [None]:
logistic.predict_proba(negative_final)