In [87]:
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# Create a toy dataset
data = ['This movie is great',
        'I hated this movie',
        'The plot of the movie was good',
        'The movie was bad']

labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

# Create a pipeline
vectorizer = TfidfVectorizer()
data = vectorizer.fit_transform(data)
clf = LinearSVC()


# Fit the pipeline on the training data
clf.fit(data, labels)

# Visualize the contribution of each word to the prediction
doc = 'This movie was bad!'
print(doc)
eli5.show_prediction(clf, doc, target_names=['Negative', 'Positive'], vec=vectorizer)



This movie was bad!


Contribution?,Feature
0.411,x0
0.096,x10
0.06,x5
0.042,x9
-0.02,<BIAS>


In [84]:
feature_names = list(map(str, vectorizer.get_feature_names_out()))
weights = eli5.formatters.as_dataframe.explain_weights_df(clf, vec=vectorizer)
weights['Feature'] = ["0"]*len(weights)

feature_names = {i:v for v, i in vectorizer.vocabulary_.items()}

weights=weights.drop([6])
weights=weights.reset_index()
for i in range(11):
  weights['Feature'][i] = feature_names[i]
print(weights)


    index  target feature    weight Feature
0       0       1      x4  0.530685     bad
1       1       1      x2  0.530685    good
2       2       1      x7  0.407185   great
3       3       1      x6  0.407185   hated
4       4       1      x1  0.407185      is
5       5       1      x8  0.128279   movie
6       7       1      x9 -0.084386      of
7       8       1      x5 -0.183434    plot
8       9       1     x10 -0.192750     the
9      10       1      x3 -0.637718    this
10     11       1      x0 -0.651664     was


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weights['Feature'][i] = feature_names[i]


In [85]:
import spacy
from spacy import displacy
import re
nlp = spacy.load('en_core_web_sm')

sentence = "This movie was bad"

# Assume that you have a dictionary of weights assigned to each word in the sentence
word_weights = {}

for i in range(len(weights)):
  word_weights[weights['Feature'][i]] = weights['weight'][i]
# Create a list of tuples where each tuple contains a word and its corresponding weight
#word_weight_list = [(word, word_weights[word]) for word in sentence.split()]

word_weight_list = []
for word in sentence.split():
  if word in word_weights:
    word_weight_list.append((word, word_weights[word]))


def highlight_words(sentence, weights_dict):
    # Load the spacy language model
    nlp = spacy.load("en_core_web_sm")
    # Tokenize the sentence
    doc = nlp(sentence)
    # Initialize the HTML string
    html = ""
    # Loop through each token in the sentence
    for token in doc:
        # Get the weight of the token's corresponding feature
        weight = weights_dict.get(token.text, 0)
        # Set the color of the token based on its weight
        if weight > 0:
            color = "green"
        elif weight < 0:
            color = "red"
        else:
            color = "black"
        # Add the token to the HTML string with the appropriate color
        html += f'<span style="color:{color}">{token.text} </span>'
    return html

# Highlight the words in the sentence
highlighted_sentence = highlight_words(sentence, word_weights)

# Print the highlighted sentence

from IPython.display import HTML
HTML(highlighted_sentence)