In [1]:
import pandas as pd
import numpy as np
import re
import bs4 as bs

# download NLTK classifiers - these are cached locally on your machine
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# import ml classifiers
from nltk.tokenize import sent_tokenize # tokenizes sentences
from nltk.stem import PorterStemmer     # parsing/stemmer
from nltk.tag import pos_tag            # parts-of-speech tagging
from nltk.corpus import wordnet         # sentiment scores
from nltk.stem import WordNetLemmatizer # stem and context
from nltk.corpus import stopwords       # stopwords
from nltk.util import ngrams            # ngram iterator
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk import tokenize as tok

# import word2vec
import gensim
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec
from gensim import corpora, models, similarities, downloader

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from textblob import TextBlob
import string

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sixumeng/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sixumeng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sixumeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sixumeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
def conditions(s):
    """Map a review row's star rating to a coarse sentiment label.

    Parameters
    ----------
    s : mapping-like
        Row exposing a ``"Customer_Rating"`` entry that supports numeric
        comparison (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        ``"positive"`` when the rating is 4 or more, ``"negative"`` when it is
        2 or less, and ``"neutral"`` for every other value.
    """
    rating = s["Customer_Rating"]
    if rating >= 4:
        return "positive"
    if rating <= 2:
        return "negative"
    return "neutral"


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [16]:
# Shared NLP helpers used by review_cleaner.
ps = PorterStemmer()
wnl = WordNetLemmatizer()

alphabet_string = string.ascii_lowercase

# NLTK's English stopwords, extended with a few car/brand terms and with
# every single lowercase letter.
eng_stopwords = set(stopwords.words("english"))
eng_stopwords.update(["volvo", "car", "xc", "vehicle", "bmw", "audi"])
eng_stopwords.update(alphabet_string)

def review_cleaner(review, lemmatize=True, stem=False):
    """Normalize one raw review into a space-joined string of cleaned words.

    The text is stripped of HTML, every non-letter character is replaced by a
    space, the result is lowercased and split on whitespace, words found in
    the module-level ``eng_stopwords`` set are dropped, and each remaining
    word is either lemmatized (``wnl``) or stemmed (``ps``).

    Parameters
    ----------
    review : str or None
        Raw review text, possibly containing HTML. ``None`` and float NaN
        (pandas' marker for a missing value) are treated as empty text.
    lemmatize : bool, default True
        Lemmatize each kept word with the WordNet lemmatizer.
    stem : bool, default False
        Stem each kept word with the Porter stemmer. Only used when
        ``lemmatize`` is falsy.

    Returns
    -------
    str
        The cleaned words joined by single spaces ("" when nothing is left).

    Raises
    ------
    RuntimeError
        If both ``lemmatize`` and ``stem`` are truthy.
    """
    # Flags are checked by truthiness everywhere, so values such as 1 or
    # numpy.bool_ behave the same way as plain True.
    if lemmatize and stem:
        raise RuntimeError("May not pass both lemmatize and stem flags")

    # Missing reviews have no text to parse.
    if review is None or (isinstance(review, float) and np.isnan(review)):
        return ""

    # 1. Remove HTML tags. The parser is named explicitly so bs4 does not
    #    warn about guessing one and does not pick a different parser
    #    depending on what is installed.
    review = bs.BeautifulSoup(review, "html.parser").text

    # 2. Replace everything that is not an ASCII letter with a space. This
    #    also removes emoticons; their letters (e.g. the "D" of ":D") survive
    #    only as single letters, which eng_stopwords filters out below.
    review = re.sub("[^a-zA-Z]", " ", review)

    # 3. Tokenize into lowercase words.
    words = review.lower().split()

    # 4. Remove stopwords, then lemmatize or stem.
    clean_review = []
    for word in words:
        if word in eng_stopwords:
            continue
        if lemmatize:
            word = wnl.lemmatize(word)
        elif stem:
            # 'oed' is dropped instead of stemmed — presumably a workaround
            # for a stemmer failure on that token; confirm before removing.
            if word == 'oed':
                continue
            word = ps.stem(word)
        clean_review.append(word)

    # 5. Join the words back into one string.
    return ' '.join(clean_review)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


invalid escape sequence \)


invalid escape sequence \)


invalid escape sequence \)



In [17]:
# Load the Edmunds Volvo reviews (dropping the first CSV column), then in one
# chain: parse review dates (unparseable values become NaT), label each row's
# sentiment from its rating, and replace the raw review text with its cleaned form.
df = (
    pd.read_csv("data/Volvo_edmunds_10yrs.csv", lineterminator='\n')
    .iloc[:, 1:]
    .assign(
        Review_Date=lambda frame: pd.to_datetime(frame['Review_Date'], errors='coerce'),
        sentiment=lambda frame: frame.apply(conditions, axis=1),
        Review=lambda frame: frame["Review"].apply(review_cleaner),
    )
)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [18]:
# Helper functions
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted vocabulary terms of each topic.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``, one row of term weights
        per topic.
    count_vectorizer : object
        Fitted vectorizer whose feature names are indexed like the columns of
        ``model.components_``.
    n_top_words : int
        Number of terms to print per topic.

    Returns
    -------
    None
        Output goes to stdout: a ``Topic #i:`` header followed by the terms,
        highest weight first, separated by spaces.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor, fall back to the old one.
    get_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = get_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk the last n_top_words entries backwards.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in top_indices]))

# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words=20):
    """Return the top keywords of every topic in a fitted model.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer whose feature names are indexed like the columns of
        ``lda_model.components_``.
    lda_model : object
        Fitted topic model exposing ``components_``, one row of term weights
        per topic.
    n_words : int, default 20
        Maximum number of keywords to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic holding its keywords, highest weight first.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor, fall back to the old one.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    keywords = np.array(get_names())

    # Negating the weights turns the ascending argsort into a descending one.
    return [
        keywords.take((-topic_weights).argsort()[:n_words])
        for topic_weights in lda_model.components_
    ]
     
        
def clean_tweet(row):
    """Delete every match of three module-level patterns from ``row``.

    The patterns are applied in the order ``isURL``, ``isRTusername``,
    ``isEntity``, each replacing its matches with the empty string.

    NOTE(review): none of these three names is defined in the visible cells
    (presumably compiled regexes for URLs, retweet usernames and entities from
    another notebook). Until they are defined, calling this raises NameError.
    """
    for pattern in (isURL, isRTusername, isEntity):
        row = pattern.sub("", row)
    return row

def tokenize_only(text):
    """Split text into lowercase word tokens that contain at least one letter.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Lowercased tokens in document order; tokens with no ASCII letter
        (e.g. numbers or bare punctuation) are left out.
    """
    # Tokenize by sentence first, then by word, so that punctuation is caught
    # as its own token.
    lowered = []
    for sentence in tok.sent_tokenize(text):
        for word in tok.word_tokenize(sentence):
            lowered.append(word.lower())

    # Keep only the tokens that contain a letter somewhere.
    return [token for token in lowered if re.search('[a-zA-Z]', token)]


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [19]:
def bigram(df):
    """Count the bigrams and trigrams in a collection of texts and score them.

    Parameters
    ----------
    df : iterable of str
        Documents to analyse, e.g. a pandas Series of review text.

    Returns
    -------
    pandas.DataFrame
        One row per n-gram, sorted by descending frequency, with columns
        ``frequency``, ``bigram/trigram``, ``polarity`` and ``subjective``
        (the last two are TextBlob's polarity and subjectivity of the n-gram).
    """
    # Imported locally under a private alias: a later cell rebinds the global
    # name `stopwords` to a plain list, after which `stopwords.words(...)`
    # through the global would raise AttributeError.
    from nltk.corpus import stopwords as nltk_stopwords
    from sklearn.feature_extraction.text import CountVectorizer

    stoplist = nltk_stopwords.words('english') + ['though']
    c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2, 3))

    # Sparse document x n-gram count matrix.
    ngram_matrix = c_vec.fit_transform(df)

    # Column sums taken on the sparse matrix, so the full dense matrix is
    # never materialised.
    count_values = np.asarray(ngram_matrix.sum(axis=0)).ravel()

    # vocabulary_ maps each n-gram to its column index.
    vocab = c_vec.vocabulary_
    df_ngram = pd.DataFrame(
        sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True),
        columns=['frequency', 'bigram/trigram'],
    )
    df_ngram['polarity'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).polarity)
    df_ngram['subjective'] = df_ngram['bigram/trigram'].apply(lambda x: TextBlob(x).subjectivity)
    return df_ngram


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [20]:
def lda(df, number_topics, number_words, random_state=None):
    """Fit an LDA topic model on a text column and summarise topics and documents.

    Parameters
    ----------
    df : pandas.Series
        Documents to model; converted with ``.tolist()``.
    number_topics : int
        Number of topics to fit.
    number_words : int
        Number of top keywords to report per topic.
    random_state : int or None, default None
        Passed to ``LatentDirichletAllocation``. Give an int to make the
        fitted topics reproducible between runs.

    Returns
    -------
    df_document_topic : pandas.DataFrame
        One row per document: ``index`` (``Doc<i>``), one ``Topic<i>`` column
        per topic holding the topic weight rounded to 2 decimals, and
        ``dominant_topic`` (position of the largest weight).
    df_topic_keywords : pandas.DataFrame
        One row per topic: ``index`` (``Topic <i>``), ``Word <j>`` columns
        with the top keywords, and the integer ``topic_index``.
    """
    corpus = df.tolist()

    # Use tf (raw term count) features for LDA.
    tf_vectorizer = CountVectorizer(
        max_df=0.9, min_df=0.00, stop_words="english", tokenizer=tokenize_only
    )
    tf = tf_vectorizer.fit_transform(corpus)

    # Create and fit the LDA model.
    model = LDA(n_components=number_topics, n_jobs=-1, random_state=random_state)
    model.fit(tf)

    # Topic - keywords data frame.
    topic_keywords = show_topics(vectorizer=tf_vectorizer, lda_model=model, n_words=number_words)
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]
    df_topic_keywords = df_topic_keywords.reset_index()
    # Rows are already in topic order, so the topic number is the row position.
    df_topic_keywords['topic_index'] = np.arange(len(df_topic_keywords))

    # Document - topic matrix.
    lda_output = model.transform(tf)
    topicnames = ["Topic" + str(i) for i in range(model.n_components)]
    docnames = ["Doc" + str(i) for i in range(len(corpus))]
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

    # Pick the dominant topic from the unrounded weights: after rounding to
    # 2 decimals, close weights can tie and argmax would return the first one.
    df_document_topic['dominant_topic'] = np.argmax(lda_output, axis=1)
    df_document_topic = df_document_topic.reset_index()

    return df_document_topic, df_topic_keywords


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [21]:
# List the distinct values of the Vehicle_model column.
df["Vehicle_model"].unique()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



array(['C30', 'C70', 'S40', 'S60 Cross Country', 'S80', 'V50', 'XC70',
       'S60', 'S90', 'V60', 'V60 Cross Country', 'V90',
       'V90 Cross Country', 'XC40', 'XC60', 'XC90'], dtype=object)

In [22]:
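# Disabled draft, kept for reference. `df["Review"]` is already cleaned when
# the data is loaded, so the `review_cleaner` pass below would run a second time.
# def filter_all(df,model):
#     filter_df = df[df["Vehicle_model"]==model]
#     filter_df["Review"] = filter_df["Review"].apply(review_cleaner)
#     df_ngram = bigram(filter_df["Review"])
#     df_document_topic ,df_topic_keywords= lda(filter_df["Review"],5,5)
#     return df_ngram,df_document_topic,df_topic_keywords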


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [23]:
# Disabled example call for the draft `filter_all(df, model)` helper (it takes two arguments).
# df_ngram,df_document_topic,df_topic_keywords = filter_all(df,"C30")


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [24]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nlplot
from plotly.subplots import make_subplots
import plotly.express as px

# Notebook-wide pandas display settings: up to 300 rows and 300 columns,
# up to 5000 characters per cell, and floats rendered with three decimals.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_colwidth', 5000)
pd.set_option('display.float_format', '{:.3f}'.format)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [25]:
# Bar chart of how many reviews fall under each sentiment label.
df_plot = (
    df.groupby('sentiment')
    .size()
    .reset_index(name='count')
)
fig = px.bar(df_plot, x='sentiment', y='count', text='count')
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.update_layout(title='sentiment counts', xaxis_title='sentiment', width=700, height=500)
fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [33]:
# initialize
# One NLPlot helper over all reviews, plus one per sentiment label; each reads
# the cleaned text from the 'Review' column.
npt = nlplot.NLPlot(df, target_col='Review')
npt_negative = nlplot.NLPlot(df[df['sentiment'] == 'negative'], target_col='Review')
npt_neutral = nlplot.NLPlot(df[df['sentiment'] == 'neutral'], target_col='Review')
npt_positive = nlplot.NLPlot(df[df['sentiment'] == 'positive'], target_col='Review')


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [34]:
# Ask nlplot for a stopword list (top_n=30, min_freq=0); the charts below pass it on.
# NOTE(review): this rebinds the global name `stopwords`, hiding the
# `nltk.corpus.stopwords` corpus imported in the first cell. Any code that calls
# `stopwords.words(...)` through the global name fails if it runs after this cell;
# a distinct name would avoid that, but every later cell reading `stopwords`
# would have to change with it.
stopwords = npt.get_stopword(top_n=30, min_freq=0)
print(stopwords)

['much', 'also', 'new', 'feature', 'love', 'well', 'year', 'would', 'mile', 'good', 'one', 'seat', 'like', 'road', 'great', 'get', 'system', 'issue', 'first', 'driving', 'back', 'time', 'really', 'safety', 'feel', 'comfortable', 'drive', 'still', 'engine', 'month']



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [35]:
# uni-gram
# Uni-gram frequency bar chart over all reviews, passing the nlplot stopword list.
npt.bar_ngram(
    ngram=1, top_n=50, stopwords=stopwords,
    title='uni-gram', xaxis_label='word_count', yaxis_label='word',
    width=800, height=1100,
)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

100%|██████████| 1079/1079 [00:00<00:00, 14302.95it/s]


In [36]:
# bi-gram
# Bi-gram frequency bar chart over all reviews, passing the nlplot stopword list.
npt.bar_ngram(
    ngram=2, top_n=50, stopwords=stopwords,
    title='bi-gram', xaxis_label='word_count', yaxis_label='word',
    width=800, height=1100,
)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

100%|██████████| 1079/1079 [00:00<00:00, 12346.69it/s]


In [37]:
# tri-gram
# Tri-gram frequency bar chart over all reviews, passing the nlplot stopword list.
npt.bar_ngram(
    ngram=3, top_n=50, stopwords=stopwords,
    title='tri-gram', xaxis_label='word_count', yaxis_label='word',
    width=800, height=1100,
)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

100%|██████████| 1079/1079 [00:00<00:00, 11855.21it/s]


## Comparison of each sentiment


In [39]:
# positive/neutral/negative
# The three uni-gram charts differ only in which NLPlot instance draws them,
# so the shared options are written once.
unigram_bar_options = dict(
    title='uni-gram',
    xaxis_label='word_count',
    yaxis_label='word',
    ngram=1,
    top_n=50,
    width=800,
    height=1100,
    stopwords=stopwords,
)

fig_unigram_positive = npt_positive.bar_ngram(**unigram_bar_options)
fig_unigram_neutral = npt_neutral.bar_ngram(**unigram_bar_options)
fig_unigram_negative = npt_negative.bar_ngram(**unigram_bar_options)



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

100%|██████████| 810/810 [00:00<00:00, 18659.27it/s]
100%|██████████| 114/114 [00:00<00:00, 8199.59it/s]
100%|██████████| 155/155 [00:00<00:00, 8401.62it/s]


In [41]:
# subplot
# Put the first trace of each per-sentiment uni-gram chart side by side.
trace1, trace2, trace3 = (
    figure['data'][0]
    for figure in (fig_unigram_positive, fig_unigram_neutral, fig_unigram_negative)
)

fig = make_subplots(rows=1, cols=3, subplot_titles=('positive', 'neutral', 'negative'), shared_xaxes=False)
fig.update_layout(height=1100, width=1000, title_text='unigram positive vs neutral vs negative')

# Column order matches subplot_titles: positive, neutral, negative.
for col, trace in enumerate((trace1, trace2, trace3), start=1):
    fig.update_xaxes(title_text='word count', row=1, col=col)
    fig.add_trace(trace, row=1, col=col)

fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [42]:
# positive/neutral/negative
# The three bi-gram charts differ only in which NLPlot instance draws them,
# so the shared options are written once.
bigram_bar_options = dict(
    title='bi-gram',
    xaxis_label='word_count',
    yaxis_label='word',
    ngram=2,
    top_n=50,
    width=800,
    height=1100,
    stopwords=stopwords,
)

fig_bigram_positive = npt_positive.bar_ngram(**bigram_bar_options)
fig_bigram_neutral = npt_neutral.bar_ngram(**bigram_bar_options)
fig_bigram_negative = npt_negative.bar_ngram(**bigram_bar_options)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

100%|██████████| 810/810 [00:00<00:00, 13649.66it/s]
100%|██████████| 114/114 [00:00<00:00, 6949.97it/s]
100%|██████████| 155/155 [00:00<00:00, 7522.15it/s]


In [43]:
# subplot
# Put the first trace of each per-sentiment bi-gram chart side by side.
trace1, trace2, trace3 = (
    figure['data'][0]
    for figure in (fig_bigram_positive, fig_bigram_neutral, fig_bigram_negative)
)

fig = make_subplots(rows=1, cols=3, subplot_titles=('positive', 'neutral', 'negative'), shared_xaxes=False)
fig.update_layout(height=1100, width=1000, title_text='bigram positive vs neutral vs negative')

# Column order matches subplot_titles: positive, neutral, negative.
for col, trace in enumerate((trace1, trace2, trace3), start=1):
    fig.update_xaxes(title_text='word count', row=1, col=col)
    fig.add_trace(trace, row=1, col=col)

fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [44]:
# Treemap of single-word frequencies over all reviews.
npt.treemap(ngram=1, stopwords=stopwords, title='All sentiment Tree of Most Common Words')


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

100%|██████████| 1079/1079 [00:00<00:00, 19064.30it/s]


In [45]:
# Treemap of single-word frequencies over the positive reviews.
npt_positive.treemap(ngram=1, stopwords=stopwords, title='Positive Tree of Most Common Words')


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

100%|██████████| 810/810 [00:00<00:00, 20332.79it/s]


In [46]:
# Treemap of single-word frequencies over the neutral reviews.
npt_neutral.treemap(ngram=1, stopwords=stopwords, title='Neutral Tree of Most Common Words')


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

100%|██████████| 114/114 [00:00<00:00, 12246.14it/s]


In [47]:
# Treemap of single-word frequencies over the negative reviews.
npt_negative.treemap(ngram=1, stopwords=stopwords, title='Negative Tree of Most Common Words')


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

100%|██████████| 155/155 [00:00<00:00, 12081.94it/s]


In [49]:
## Histogram of the word count



`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [48]:
# Distribution of the number of words per review, over all reviews.
npt.word_distribution(title='number of words distribution')


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# NOTE(review): stray, never-executed cell. `a` is not defined in any visible
# cell, so this would raise NameError under "Restart & Run All" — presumably a
# leftover keystroke; confirm and delete the cell.
a