In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the Amazon unlocked-mobile reviews CSV (path is relative to the notebook).
df = pd.read_csv("../input/Amazon_Unlocked_Mobile.csv")

In [None]:
# Preview the first few rows of the raw data.
df.head()
# The review-votes column indicates how many people found the review helpful.

In [None]:
# Column dtypes and non-null counts; reveals which columns have missing values.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# The five most frequent brands, by number of rows.
df['Brand Name'].value_counts().head()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
# Ratings of exactly 3 are treated as neutral, so those rows are excluded;
# only rows whose rating differs from 3 are kept.

df = df.loc[df['Rating'] != 3]

In [None]:
# Ratings above 3 count as positive.
# 'Positively Rated' is 1 when Rating > 3 and 0 otherwise.

df['Positively Rated'] = (df['Rating'] > 3).astype(int)

In [None]:
# Preview the frame again, now including the 'Positively Rated' column.
df.head()

In [None]:
# Mean of the 0/1 label = fraction of the remaining reviews that are positive.
df['Positively Rated'].mean()

In [None]:
# Split into train and test sets: the review text is the input and the
# 0/1 'Positively Rated' flag is the target. random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df['Reviews'],
    df['Positively Rated'],
    random_state=0,
)

In [None]:
# Peek at the first training example as (label, review text).
# Positional .iloc is used on purpose: plain [0] looks up the index *label* 0,
# which may have been removed by dropna / the rating filter or may have landed
# in the test split, in which case it raises KeyError.
y_train.iloc[0], X_train.iloc[0]

In [None]:
# Number of training reviews.
X_train.shape

In [None]:
# We need to convert the text into a numeric representation that scikit-learn can use.
# The bag-of-words approach ignores structure and only counts how often each word occurs.
# CountVectorizer implements bag-of-words by converting text into a matrix of token counts.

In [None]:
# First, we instantiate the CountVectorizer and fit it to our training data.

# Fitting the CountVectorizer consists of the 
#     tokenization of the training data and 
#     building of the vocabulary

# Fitting the CountVectorizer 
#     tokenizes each document by finding 
#         all sequences of 
#             at least two letters or digits, 
#             separated by word boundaries. 
# It also converts everything to 
#     lowercase and 
#     builds a vocabulary using these tokens.

In [None]:
# Build the bag-of-words vocabulary from the training reviews only.
vect = CountVectorizer()
vect.fit(X_train)
vect

In [None]:
# Size of the learned vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
len(vect.get_feature_names_out())

In [None]:
# First ten feature names of the vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
vect.get_feature_names_out()[0:10]

In [None]:
# Every 4000th feature name, to get a feel for the range of the vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
vect.get_feature_names_out()[::4000]

In [None]:
# We use the transform method to convert X_train into a document-term matrix,
# giving us the bag-of-words representation of X_train.

# This representation is stored in a SciPy sparse matrix where 
#     each row corresponds to a document and 
#     each column a word from our training vocabulary.

# The entries in this matrix are the number of times each word appears in each document.

# Because the number of words in the vocabulary is so much larger 
# than the number of words that might appear in a single review, 
# most entries of this matrix are zero.

# The shape of the matrix will be
#     (number of documents, i.e. reviews/rows in the training data) x
#     (number of words/tokens in the vocabulary)

In [None]:
# Here's a trivial example ... Let's suppose we have 3 documents:

#     Doc1: Hello, World, the sun is shining
#     Doc2: Hello world, the weather is nice
#     Doc3: Hello world, the wind is cold


# Then, our vocabulary would look like this (using 1-grams without stop word removal):

#     Vocabulary: [hello, world, the, wind, weather, sun, is, shining, nice, cold]


# The corresponding binary feature vectors (in vocabulary order) are:

#     Doc1: [1, 1, 1, 0, 0, 1, 1, 1, 0, 0]
#     Doc2: [1, 1, 1, 0, 1, 0, 1, 0, 1, 0]
#     Doc3: [1, 1, 1, 1, 0, 0, 1, 0, 0, 1]


# Which we use to construct the dense matrix / document term matrix:

#     [[1, 1, 1, 0, 0, 1, 1, 1, 0, 0]
#      [1, 1, 1, 0, 1, 0, 1, 0, 1, 0]
#      [1, 1, 1, 1, 0, 0, 1, 0, 0, 1]]

In [None]:
# Convert the training reviews into a document-term matrix using the fitted vocabulary.
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

In [None]:
# (number of training documents, vocabulary size)
X_train_vectorized.shape

In [None]:
# Fit a logistic-regression classifier on the bag-of-words features.
model = LogisticRegression().fit(X_train_vectorized, y_train)
model

In [None]:
# Score the held-out reviews, vectorised with the vocabulary learned on the training set.
pred = model.predict(vect.transform(X_test))
# NOTE(review): the score is computed from hard 0/1 predictions, not from
# probabilities / decision scores -- confirm this is the intended metric.
print('roc accuracy score ', roc_auc_score(y_test, pred))

# Feature names as a NumPy array so they can be indexed with an index array.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
feature_names = np.array(vect.get_feature_names_out())

# Indices that order the model coefficients from smallest to largest
sorted_coef_index = model.coef_[0].argsort()

# Show the 10 smallest and 10 largest coefficients.
# [:-11:-1] walks backwards from the end, so the largest come first.

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

In [None]:
# Both sentences contain exactly the same words, only in a different order.
# A unigram bag-of-words ignores word order, so they map to the same feature
# vector and therefore receive the same prediction.
print(model.predict(vect.transform(['not an issue, phone is working', 
                                    'an issue, phone is not working'])))

# TF IDF

In [None]:
# Tf–idf, or term frequency–inverse document frequency,
# allows us to weight terms based on how important they are to a document.
# A high weight is given to terms that appear often in a particular document 
# but don't appear often in the corpus. 

# Features with low tf–idf are either commonly used across all documents 
# or rarely used and only occur in long documents.

# Features with high tf–idf are frequently used within specific documents, 
# but rarely used across all documents.

In [None]:
# Similar to how we used CountVectorizer, 
# we'll instantiate the tf–idf vectorizer and fit it to our training data.

# We also pass min_df, which lets us specify the minimum number of documents 
# in which a token needs to appear to become part of the vocabulary.

In [None]:
# Fit a tf-idf vectorizer on the training reviews, keeping only tokens that
# appear in at least 5 documents, then report the vocabulary size.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names_out())

In [None]:
# Re-vectorise the training reviews with the tf-idf vocabulary.
X_train_vectorized = vect.transform(X_train)

# Train a fresh classifier on the tf-idf features.
model = LogisticRegression().fit(X_train_vectorized, y_train)

# Predict on the held-out reviews and score the predictions.
pred = model.predict(vect.transform(X_test))

roc_auc_score(y_test, pred)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
feature_names = np.array(vect.get_feature_names_out())

# For every term take its maximum tf-idf weight over the training documents,
# then get the indices that order those maxima from smallest to largest.
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

# These are tf-idf weights, not model coefficients, so label them accordingly.
print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

In [None]:
# Same two sentences as before: identical words, different order.
# With single-word tf-idf features they still map to the same vector,
# so both receive the same prediction.
print(model.predict(vect.transform(['not an issue, phone is working', 
                                    'an issue, phone is not working'])))

# n-gram

In [None]:
# One way we can add some context is by adding sequences of word features known as n-grams. 

# For example, bigrams, which count pairs of adjacent words, 
# could give us features such as is working versus not working. 
# And trigrams, which give us triplets of adjacent words, 
# could give us features such as not an issue.

# To create these n-gram features, 
# we'll pass in a tuple to the parameter ngram_range, 
# where the values correspond to the minimum length and maximum lengths of sequences.

# For example, if we pass in the tuple (1, 2), 
# the vectorizer will create features using the individual words 
# as well as the bigrams.

In [None]:
# Fit a tf-idf vectorizer over unigrams and bigrams (min. 5 documents per token),
# vectorise the training reviews and report the vocabulary size.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
vect = TfidfVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names_out())

In [None]:
# Train a fresh classifier on the unigram + bigram tf-idf features.
model = LogisticRegression().fit(X_train_vectorized, y_train)

# Predict on the held-out reviews and score the predictions.
pred = model.predict(vect.transform(X_test))

roc_auc_score(y_test, pred)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
feature_names = np.array(vect.get_feature_names_out())

# For every term take its maximum tf-idf weight over the training documents,
# then get the indices that order those maxima from smallest to largest.
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

# These are tf-idf weights, not model coefficients, so label them accordingly.
print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

In [None]:
# Same two sentences again. With bigram features the word order now matters
# (e.g. "is working" vs "not working"), so the two feature vectors -- and
# hence the predictions -- can differ.
print(model.predict(vect.transform(['not an issue, phone is working', 
                                    'an issue, phone is not working'])))