<a target="_blank" href="../cluster" style="font-size:20px">All Applications (YARN)</a>

# Create SparkContext and SparkSession

https://spark.apache.org/docs/latest/rdd-programming-guide.html

http://spark.apache.org/docs/latest/sql-getting-started.html

In [None]:
import findspark
findspark.init()  # must run before `import pyspark` below

import pyspark
# getOrCreate() returns the already-running context when this cell is executed
# again; calling the SparkContext constructor a second time raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName('jupyter'))

from pyspark.sql import SparkSession, Row
# SQL / DataFrame entry point built on top of the same SparkContext
se = SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-04-19 17:01:21,014 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2023-04-19 17:01:21,015 WARN util.Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
2023-04-19 17:01:21,981 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


# Wikipedia dataset

In [2]:
! ls -lh wiki

total 205M
-rw-r--r-- 1 jovyan root  61M Oct 18 15:47 categories.jsonl
-rw-r--r-- 1 jovyan root  387 Sep 26 22:14 README.txt
-rw-r--r-- 1 jovyan root 144M Sep 26 23:25 wiki.jsonl


In [3]:
! head -n 1 wiki/wiki.jsonl

{"title": "April", "text": "April\n\nApril is the fourth month of the year, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril's flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\n\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other's last days are exactly 35 weeks (245 days) apart.\n\nIn common years, April starts on the same day of the week as October of the previous year, and in leap years, May 

In [4]:
! head -n 5 wiki/categories.jsonl

{"page_id":1,"category":"Months"}
{"page_id":2,"category":"Months"}
{"page_id":6,"category":"Art"}
{"page_id":6,"category":"Basic_English_850_words"}
{"page_id":6,"category":"CS1_Russian-language_sources_(ru)"}


# Copy files to HDFS

In [5]:
! hadoop fs -copyFromLocal wiki /

In [6]:
! hadoop fs -ls -h /wiki

Found 3 items
-rw-r--r--   1 jovyan supergroup        387 2020-10-24 18:46 /wiki/README.txt
-rw-r--r--   1 jovyan supergroup     60.9 M 2020-10-24 18:46 /wiki/categories.jsonl
-rw-r--r--   1 jovyan supergroup    143.4 M 2020-10-24 18:46 /wiki/wiki.jsonl


# Bag of words model for text vectorization

https://en.wikipedia.org/wiki/Bag-of-words_model

<img width=600px src='images/bow.png'>

# Logistic regression for text classification

https://en.wikipedia.org/wiki/Logistic_regression

<img width=600px src='images/lr.png'>

# Creating a dictionary of words (WordCount task)

In [2]:
import re
import string

def tokenize(text):
    """Split raw text into a list of lowercase word tokens.

    Every character outside ``string.printable`` and every character in
    ``string.punctuation`` is replaced by a space; the result is lowercased
    and split on whitespace.

    Args:
        text: the input string.

    Returns:
        list of str: the tokens in order of appearance (duplicates kept).
    """
    # Character classes are blanked out one after another, in this order:
    # first everything that is not printable, then punctuation.
    noise_classes = (
        f'[^{re.escape(string.printable)}]',
        f'[{re.escape(string.punctuation)}]',
    )
    cleaned = text
    for char_class in noise_classes:
        cleaned = re.sub(char_class, ' ', cleaned)
    return cleaned.lower().split()

In [3]:
import json

def mapper(line):
    """Emit one ``(word, 1)`` pair per distinct word of a JSON-encoded article.

    Args:
        line: one JSON document (a line of the .jsonl file) with a 'text' field.

    Returns:
        list of (str, int): each distinct token of the text paired with 1, so
        that summing by key counts the articles containing the word rather
        than the total number of occurrences.
    """
    article = json.loads(line)
    distinct_words = set(tokenize(article['text']))
    return [(token, 1) for token in distinct_words]

In [4]:
%%time
# Count, for every word, the number of articles that contain it.
# The result is collected to the driver as a list of (word, count) tuples.
word_counts = (
    # one RDD element per line of the file, i.e. one JSON document
    sc.textFile("hdfs:///wiki/wiki.jsonl")
    # mapper emits (word, 1) once per distinct word of an article
    .flatMap(mapper)
    # sum the ones per word
    .reduceByKey(lambda a, b: a + b)
    # bring all (word, count) pairs back to the driver
    .collect()
)

CPU times: user 106 ms, sys: 19.5 ms, total: 125 ms
Wall time: 22.4 s


In [5]:
word_counts[:5]

[('continued', 2222),
 ('marguerite', 73),
 ('family', 7907),
 ('earl', 474),
 ('charge', 1236)]

In [6]:
len(word_counts)

306564

In [7]:
# Keep the 50000 words with the highest counts, most frequent first.
# The sort is stable, so words with equal counts keep their original relative order.
top_word_counts = sorted(
    word_counts,
    key=lambda pair: pair[1],
    reverse=True,
)[:50000]

In [8]:
top_word_counts[:5]

[('the', 143585),
 ('in', 134975),
 ('a', 134315),
 ('of', 128899),
 ('is', 125063)]

In [9]:
top_word_counts[-5:]

[('pals', 7),
 ('subgenera', 7),
 ('horwitz', 7),
 ('crocodylomorphs', 7),
 ('khabarovsk', 7)]

In [10]:
len(top_word_counts)

50000

In [11]:
# indexes are needed for vectorization of texts:
# every word is mapped to its position in top_word_counts (0 = most frequent)
word_to_index = dict(
    (word, position)
    for position, (word, _count) in enumerate(top_word_counts)
)

In [12]:
list(word_to_index.items())[:5]

[('the', 0), ('in', 1), ('a', 2), ('of', 3), ('is', 4)]

# Vectorize texts (the "bag of words" model)

In [13]:
from collections import Counter
Counter(["a", "a", "b"])

Counter({'a': 2, 'b': 1})

In [19]:
# first option: the word_to_index dictionary will be serialized using pickle along with the function
import numpy as np

def mapper(line):
    """Vectorize one JSON-encoded article as a sparse bag of words.

    The module-level ``word_to_index`` dictionary is referenced directly from
    the function body.

    Args:
        line: one JSON document with a 'text' field.

    Returns:
        (np.ndarray, np.ndarray): dictionary indices of the known words and,
        aligned with them, their term frequencies (occurrences divided by the
        total number of tokens in the text, unknown words included).
    """
    words = tokenize(json.loads(line)['text'])
    n_words = float(len(words))
    # (index, tf) for every distinct word that is present in the dictionary
    known = [
        (word_to_index[word], count / n_words)
        for word, count in Counter(words).items()
        if word in word_to_index
    ]
    indices = [index for index, _ in known]
    values = [tf for _, tf in known]
    return np.array(indices), np.array(values)

In [20]:
%%time
(
    sc.textFile("hdfs:///wiki/wiki.jsonl")
    .map(mapper)
    .take(1)
)

CPU times: user 338 ms, sys: 4.54 ms, total: 343 ms
Wall time: 543 ms


[(array([  119,     4,     0,   673,  1165,     3,   107,     5,   460,
            82,   109,    57,     8,    27,   150,   654,     7,    37,
           240,   459,   630,  1806,     9,   129,   222,  1185,    11,
           115,  5341,    90,     1,  5614,    72,  1679,   131,    15,
          2308,    21,  2576, 13008,  9614,    43, 32830,  4556,   575,
         10878,   357,    20,    30,   117,    19,   120,   126,   132,
            93,   352,   166,    41,   230,  2350,  1351,  1198, 11111,
          1665,   315,  1266,   127,  1347,  7241,   136,  2294,    39,
           106,   308,     2,  1516,   374,  3895,    13,  4121,  1021,
           410,  7638,  3275,  6410,    89,   397,    78,  1174,    14,
           833,   418,   628,  5464,  1619,   198,   227,   487, 13721,
           718,  3263,   636,     6,   564,   108,   187,   611,  2482,
           615,    53,   454,    64,  1259,   341,  4367,   353,    24,
            44,  1184,  1720,  2828,  2845,   145,   354,  2572,

In [21]:
# second option: broadcast variable
word_to_index_broadcast = sc.broadcast(word_to_index)

Broadcast variables are useful when you want to broadcast the same data to all executors:
- dictionary in ML algorithm
- vector of weights in ML algorithm

Executors have **read-only** access to this data

The data is sent once and can then be used multiple times.

In [22]:
def mapper(line):
    """Vectorize one JSON-encoded article using the broadcast dictionary.

    Same output as the closure-based variant, but the word -> index mapping
    is read from ``word_to_index_broadcast.value``.

    Args:
        line: one JSON document with a 'text' field.

    Returns:
        (np.ndarray, np.ndarray): dictionary indices of the known words and
        their term frequencies (occurrences / total number of tokens).
    """
    vocabulary = word_to_index_broadcast.value
    words = tokenize(json.loads(line)['text'])
    n_words = float(len(words))
    indices, values = [], []
    for word, count in Counter(words).items():
        index = vocabulary.get(word)
        if index is None:
            # word is not in the dictionary
            continue
        indices.append(index)
        values.append(count / n_words)
    return np.array(indices), np.array(values)

In [23]:
%%time
(
    sc.textFile("hdfs:///wiki/wiki.jsonl")
    .map(mapper)
    .take(1)
)

CPU times: user 6.62 ms, sys: 0 ns, total: 6.62 ms
Wall time: 163 ms


[(array([  119,     4,     0,   673,  1165,     3,   107,     5,   460,
            82,   109,    57,     8,    27,   150,   654,     7,    37,
           240,   459,   630,  1806,     9,   129,   222,  1185,    11,
           115,  5341,    90,     1,  5614,    72,  1679,   131,    15,
          2308,    21,  2576, 13008,  9614,    43, 32830,  4556,   575,
         10878,   357,    20,    30,   117,    19,   120,   126,   132,
            93,   352,   166,    41,   230,  2350,  1351,  1198, 11111,
          1665,   315,  1266,   127,  1347,  7241,   136,  2294,    39,
           106,   308,     2,  1516,   374,  3895,    13,  4121,  1021,
           410,  7638,  3275,  6410,    89,   397,    78,  1174,    14,
           833,   418,   628,  5464,  1619,   198,   227,   487, 13721,
           718,  3263,   636,     6,   564,   108,   187,   611,  2482,
           615,    53,   454,    64,  1259,   341,  4367,   353,    24,
            44,  1184,  1720,  2828,  2845,   145,   354,  2572,

In [24]:
# third option: save to file yourself
import pickle

# The context manager closes (and therefore flushes) the file as soon as the
# dump is finished, so the complete file is on disk before it is handed to
# sc.addFile; a bare open(...) leaves the handle open until garbage collection.
with open("word_to_index.pickle", 'wb') as dictionary_file:
    pickle.dump(word_to_index, dictionary_file)

In [25]:
def mapper(lines):
    """Vectorize a whole partition of JSON-encoded articles.

    Meant for ``mapPartitions``: the dictionary is unpickled once per
    partition and then reused for every line of that partition.

    Args:
        lines: iterable of JSON documents, each with a 'text' field.

    Yields:
        (np.ndarray, np.ndarray): for each line, the dictionary indices of the
        known words and their term frequencies (occurrences / total tokens).
    """
    # Imported here so the function does not depend on another cell's imports.
    from pyspark import SparkFiles

    # SparkFiles.get resolves the location of a file distributed with
    # sc.addFile; the `with` block closes the handle right after loading.
    # The pickle is produced by this notebook itself, not by an external party.
    with open(SparkFiles.get("word_to_index.pickle"), 'rb') as dictionary_file:
        word_to_index_loaded = pickle.load(dictionary_file)

    for line in lines:
        j = json.loads(line)
        text = j['text']
        words = tokenize(text)
        indices = []
        values = []
        for word, count in Counter(words).items():
            if word in word_to_index_loaded:
                index = word_to_index_loaded[word]
                indices.append(index)
                # term frequency relative to all tokens of the text
                tf = count / float(len(words))
                values.append(tf)
        yield np.array(indices), np.array(values)

In [26]:
sc.addFile("word_to_index.pickle")

In [27]:
%%time
(
    sc.textFile("hdfs:///wiki/wiki.jsonl")
    .mapPartitions(mapper)
    .take(1)
)

CPU times: user 4.72 ms, sys: 1.65 ms, total: 6.37 ms
Wall time: 206 ms


[(array([  119,     4,     0,   673,  1165,     3,   107,     5,   460,
            82,   109,    57,     8,    27,   150,   654,     7,    37,
           240,   459,   630,  1806,     9,   129,   222,  1185,    11,
           115,  5341,    90,     1,  5614,    72,  1679,   131,    15,
          2308,    21,  2576, 13008,  9614,    43, 32830,  4556,   575,
         10878,   357,    20,    30,   117,    19,   120,   126,   132,
            93,   352,   166,    41,   230,  2350,  1351,  1198, 11111,
          1665,   315,  1266,   127,  1347,  7241,   136,  2294,    39,
           106,   308,     2,  1516,   374,  3895,    13,  4121,  1021,
           410,  7638,  3275,  6410,    89,   397,    78,  1174,    14,
           833,   418,   628,  5464,  1619,   198,   227,   487, 13721,
           718,  3263,   636,     6,   564,   108,   187,   611,  2482,
           615,    53,   454,    64,  1259,   341,  4367,   353,    24,
            44,  1184,  1720,  2828,  2845,   145,   354,  2572,

# Measuring how well the dictionary covers the words

In [28]:
# accumulator variables
all_words = sc.accumulator(0)
covered_words = sc.accumulator(0)

Accumulator variables are useful when you need a counter (adder) of events on executors:
- number of unknown words
- number of corrupted records

Only the driver can read the totals.

For executors, access to the counter is **write-only**.

In [29]:
def mapper(line):
    """Vectorize one article and record dictionary coverage in accumulators.

    Side effects: adds the number of occurrences of every word to the
    ``all_words`` accumulator and, for words present in ``word_to_index``,
    also to the ``covered_words`` accumulator.

    Args:
        line: one JSON document with a 'text' field.

    Returns:
        (np.ndarray, np.ndarray): dictionary indices of the known words and
        their term frequencies (occurrences / total number of tokens).
    """
    words = tokenize(json.loads(line)['text'])
    n_words = float(len(words))
    indices, values = [], []
    for word, count in Counter(words).items():
        all_words.add(count)
        if word not in word_to_index:
            continue
        covered_words.add(count)
        indices.append(word_to_index[word])
        values.append(count / n_words)
    return np.array(indices), np.array(values)

In [30]:
%%time
(
    sc.textFile("hdfs:///wiki/wiki.jsonl")
    .map(mapper)
    .count()
)

CPU times: user 330 ms, sys: 0 ns, total: 330 ms
Wall time: 23.6 s


154259

In [31]:
print(covered_words.value)
print(all_words.value)

21603362
22321968


In [32]:
print("coverage:", covered_words.value / float(all_words.value))

coverage: 0.9678072291833767


# File with article categories

In [14]:
# Load the articles as a DataFrame and expose it to SQL under the name "wiki".
wiki = se.read.json("hdfs:///wiki/wiki.jsonl")
# createOrReplaceTempView replaces the deprecated registerTempTable and can be
# re-executed without error.
wiki.createOrReplaceTempView("wiki")
wiki.limit(2).toPandas()

Unnamed: 0,id,text,title,url
0,1,April\n\nApril is the fourth month of the year...,April,https://simple.wikipedia.org/wiki?curid=1
1,2,August\n\nAugust (Aug.) is the eighth month of...,August,https://simple.wikipedia.org/wiki?curid=2


In [15]:
# Load the (page_id, category) pairs and expose them to SQL as "categories".
categories = se.read.json("hdfs:///wiki/categories.jsonl")
# createOrReplaceTempView replaces the deprecated registerTempTable and can be
# re-executed without error.
categories.createOrReplaceTempView("categories")
categories.limit(2).toPandas()

Unnamed: 0,category,page_id
0,Months,1
1,Months,2


In [68]:
import pandas as pd
pd.set_option("display.max_rows", 1000)

se.sql("""
select 
    category,
    count(*) as cnt
from categories
group by category
order by cnt desc
""").limit(100).toPandas()

Unnamed: 0,category,cnt
0,Living_people,23886
1,Commons_category_link_is_on_Wikidata,18754
2,Pages_translated_from_English_Wikipedia,16860
3,Coordinates_on_Wikidata,16084
4,People_stubs,14828
5,Articles_with_hCards,13823
6,United_States_geography_stubs,12607
7,Commons_category_link_from_Wikidata,9825
8,American_people_stubs,9689
9,Sportspeople_stubs,8749


# Joining categories

In [36]:
# simple join
joined = se.sql("""
select
    *
from
    wiki join categories on wiki.id == categories.page_id
""")
joined.limit(5).toPandas()

Unnamed: 0,id,text,title,url,category,page_id
0,474,"Montreal\n\nMontreal (, spelled ""Montréal"" in ...",Montreal,https://simple.wikipedia.org/wiki?curid=474,Articles_containing_French-language_text,474
1,474,"Montreal\n\nMontreal (, spelled ""Montréal"" in ...",Montreal,https://simple.wikipedia.org/wiki?curid=474,CS1_French-language_sources_(fr),474
2,474,"Montreal\n\nMontreal (, spelled ""Montréal"" in ...",Montreal,https://simple.wikipedia.org/wiki?curid=474,Coordinates_on_Wikidata,474
3,474,"Montreal\n\nMontreal (, spelled ""Montréal"" in ...",Montreal,https://simple.wikipedia.org/wiki?curid=474,Montreal,474
4,474,"Montreal\n\nMontreal (, spelled ""Montréal"" in ...",Montreal,https://simple.wikipedia.org/wiki?curid=474,Olympic_cities,474


In [37]:
joined.explain()

== Physical Plan ==
*(5) SortMergeJoin [cast(id#7 as bigint)], [page_id#27L], Inner
:- *(2) Sort [cast(id#7 as bigint) ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(cast(id#7 as bigint), 200), true, [id=#127]
:     +- *(1) Project [id#7, text#8, title#9, url#10]
:        +- *(1) Filter isnotnull(id#7)
:           +- FileScan json [id#7,text#8,title#9,url#10] Batched: false, DataFilters: [isnotnull(id#7)], Format: JSON, Location: InMemoryFileIndex[hdfs://localhost:9000/wiki/wiki.jsonl], PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:string,text:string,title:string,url:string>
+- *(4) Sort [page_id#27L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(page_id#27L, 200), true, [id=#136]
      +- *(3) Project [category#26, page_id#27L]
         +- *(3) Filter isnotnull(page_id#27L)
            +- FileScan json [category#26,page_id#27L] Batched: false, DataFilters: [isnotnull(page_id#27L)], Format: JSON, Location: InMemoryFileIndex[hdfs://l

In [38]:
# broadcast join (https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-hints.html#join-hints)
joined = se.sql("""
select /*+ BROADCAST(categories) */
    *
from
    wiki join categories on wiki.id == categories.page_id
""")
joined.limit(5).toPandas()

Unnamed: 0,id,text,title,url,category,page_id
0,1,April\n\nApril is the fourth month of the year...,April,https://simple.wikipedia.org/wiki?curid=1,Months,1
1,2,August\n\nAugust (Aug.) is the eighth month of...,August,https://simple.wikipedia.org/wiki?curid=2,Months,2
2,6,Art\n\nArt is a creative activity that express...,Art,https://simple.wikipedia.org/wiki?curid=6,Pages_using_ISBN_magic_links,6
3,6,Art\n\nArt is a creative activity that express...,Art,https://simple.wikipedia.org/wiki?curid=6,Non-verbal_communication,6
4,6,Art\n\nArt is a creative activity that express...,Art,https://simple.wikipedia.org/wiki?curid=6,Commons_category_link_from_Wikidata,6


In [39]:
joined.explain()

== Physical Plan ==
*(2) BroadcastHashJoin [cast(id#7 as bigint)], [page_id#27L], Inner, BuildRight
:- *(2) Project [id#7, text#8, title#9, url#10]
:  +- *(2) Filter isnotnull(id#7)
:     +- FileScan json [id#7,text#8,title#9,url#10] Batched: false, DataFilters: [isnotnull(id#7)], Format: JSON, Location: InMemoryFileIndex[hdfs://localhost:9000/wiki/wiki.jsonl], PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct<id:string,text:string,title:string,url:string>
+- BroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true])), [id=#208]
   +- *(1) Project [category#26, page_id#27L]
      +- *(1) Filter isnotnull(page_id#27L)
         +- FileScan json [category#26,page_id#27L] Batched: false, DataFilters: [isnotnull(page_id#27L)], Format: JSON, Location: InMemoryFileIndex[hdfs://localhost:9000/wiki/categories.jsonl], PartitionFilters: [], PushedFilters: [IsNotNull(page_id)], ReadSchema: struct<category:string,page_id:bigint>




# Skewed Key problem by example

In [40]:
se.udf.register("tokenize", tokenize, "array<string>")

<function __main__.tokenize(text)>

In [41]:
joined = se.sql("""
select
    explode(tokenize(wiki.text)) as token,
    categories.category
from
    wiki join categories on wiki.id == categories.page_id
where categories.category in ('Living_people', 'Movies_based_on_books')
""")
joined.limit(2).toPandas()

Unnamed: 0,token,category
0,50,Living_people
1,cent,Living_people


In [42]:
# mode("overwrite") lets the cell be re-run: with the default save mode the
# write raises an error once the target path already exists.
joined.write.mode("overwrite").parquet("hdfs:///token_category.parquet")

In [43]:
df = se.read.parquet("hdfs:///token_category.parquet")
df.show(2)

+--------+-------------+
|   token|     category|
+--------+-------------+
|   sarah|Living_people|
|michelle|Living_people|
+--------+-------------+
only showing top 2 rows



In [44]:
# createOrReplaceTempView replaces the deprecated registerTempTable.
df.createOrReplaceTempView("token_category")

In [45]:
se.sql("""
select 
    category,
    count(*) as cnt
from token_category
group by category
""").show()

+--------------------+-------+
|            category|    cnt|
+--------------------+-------+
|Movies_based_on_b...|  89520|
|       Living_people|2468253|
+--------------------+-------+



In [46]:
%%time
import time
import random

def slow_count(tokens):
    """Count the items of an iterable, deliberately wasting a little time.

    For every item one random number is drawn and, with probability 0.1,
    ``time.sleep`` is called, so that the cost of a large group is visible.

    Args:
        tokens: any iterable.

    Returns:
        int: number of items in ``tokens`` (0 for an empty iterable).
    """
    count = 0
    for count, _ in enumerate(tokens, start=1):
        # slow down to show
        if 0.1 > random.random():
            time.sleep(0.00000001)
    return count

# Count tokens per category with groupByKey.
# The key is the category alone, so every token of a category ends up in one
# group that a single slow_count call has to iterate over.
(
    df.rdd
    # (category, token) pairs: the category is the grouping key
    .map(lambda x: (x.category, x.token))
    .groupByKey(numPartitions=10)
    # x[0] is the category, x[1] the iterable of its tokens
    .map(lambda x: (x[0], slow_count(x[1])))
).collect()

[('Movies_based_on_books', 89520), ('Living_people', 2468253)]

In [47]:
%%time
# Same count, but with a "salted" key: a random integer from 1 to 10 is added
# to the category, which splits every category into up to 10 smaller groups.
sorted((
    df.rdd
    .map(lambda x: ((x.category, random.randint(1, 10)), x.token))  # key = (category, random salt)
    .groupByKey(numPartitions=10)
    # x[0] is the (category, salt) key, x[1] the iterable of its tokens
    .map(lambda x: (x[0], slow_count(x[1])))
).collect())  # the partial counts still have to be summed per category

[(('Living_people', 1), 246179),
 (('Living_people', 2), 245763),
 (('Living_people', 3), 247088),
 (('Living_people', 4), 247824),
 (('Living_people', 5), 247187),
 (('Living_people', 6), 246583),
 (('Living_people', 7), 246985),
 (('Living_people', 8), 247068),
 (('Living_people', 9), 247010),
 (('Living_people', 10), 246566),
 (('Movies_based_on_books', 1), 9014),
 (('Movies_based_on_books', 2), 9019),
 (('Movies_based_on_books', 3), 8927),
 (('Movies_based_on_books', 4), 8873),
 (('Movies_based_on_books', 5), 8981),
 (('Movies_based_on_books', 6), 8899),
 (('Movies_based_on_books', 7), 8973),
 (('Movies_based_on_books', 8), 8814),
 (('Movies_based_on_books', 9), 9016),
 (('Movies_based_on_books', 10), 9004)]

# Set up logistic regression

In [16]:
import numpy as np

In [17]:
joined = se.sql("""
select
    wiki.text,
    cast(categories.category == 'American_movie_actors' as int) as target
from
    wiki join categories on wiki.id == categories.page_id
where categories.category in ('American_movie_actors', 'US_Democratic_Party_politicians')
""")
joined.limit(2).toPandas()

Unnamed: 0,text,target
0,"50 Cent\n\n50 Cent (also known as Fitty"" or ""F...",1
1,"Danny Jacobs (actor)\n\nDaniel Charles Jacobs,...",1


In [18]:
def mapper(row):
    """Turn a (text, target) row into a sparse training example.

    Args:
        row: object with a ``text`` attribute (article text) and a ``target``
            attribute (class label).

    Returns:
        (np.ndarray, np.ndarray, target): integer dictionary indices of the
        known words, their term frequencies (occurrences / total number of
        tokens) as floats, and the unchanged ``row.target``.
    """
    words = tokenize(row.text)
    indices = []
    values = []
    for word, count in Counter(words).items():
        if word in word_to_index:
            index = word_to_index[word]
            indices.append(index)
            tf = count / float(len(words))
            values.append(tf)
    # Explicit dtypes: np.array([]) defaults to float64, and a float array
    # cannot be used as an index (weights[indices] raises IndexError), which
    # would break training on a text without any dictionary word.
    return np.array(indices, dtype=int), np.array(values, dtype=float), row.target

In [19]:
dataset = joined.rdd.map(mapper)
dataset.cache()  # cache dataset in RAM
dataset.count()

5260

In [20]:
dataset.take(1)

[(array([  531,  7750,    20,    33,    11,    25,  3244,     4,    13,
            35,  4850,   733,  2384,  2618,  2669,   287,  4920,     5,
          1190,   737,    18,     6,    22,     1,  4114,    53,   225,
            62,     9,   115,   164,   778,    28,   388,    78,    42,
          1171,  6981,  1766,  1438,    75,   517,  2089,   338,   481,
          3066,    51,   313,   194,    72,   187,   802,   413,     2,
          3177,    45, 45674,  1739,   324,     7,   365,  1677,   208,
           397,  1272,   886,   226,   106,  2135,  9909,    30,    73,
           704,   236,   445,   648,    12,  1928,  4292,  5057,   201,
           303,     0,   129,   271,    48,   190,   954,  1321,   100,
            94,    23,   613,  2840,     3,   712,   381,   270,  1093,
          1275,  1468,   317,  2899,  3988,     8,   182,    27,   440,
           468,  1493,   793, 23173,   188,  1530,   150,   459,   133,
           134,   335,   963,  1805,    10,  1169,  9657,   102,

In [21]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)) for a scalar ``x``.

    The exponential is always taken of a non-positive number, so it cannot
    overflow for inputs of large magnitude.
    """
    if x < 0:
        # exp(x) / (1 + exp(x)) is the same value written without exp(-x)
        shrink = np.exp(x)
        return shrink / (1. + shrink)
    return 1. / (1. + np.exp(-x))

In [57]:
def compute_gradient(weights_broadcast, loss, examples):
    """Sum the logistic-regression gradient over one partition of examples.

    Args:
        weights_broadcast: object whose ``.value`` is the current weight vector.
        loss: object with an ``add`` method; receives the log-loss of every
            example.
        examples: iterable of ``(indices, values, target)`` triples.

    Yields:
        np.ndarray: a single array with the same length as the weight vector
        holding the summed gradient (all zeros if ``examples`` is empty).
    """
    current_weights = weights_broadcast.value
    # here we accumulate the contribution to the gradient
    gradient = np.zeros(len(current_weights))

    for indices, values, target in examples:
        # make a prediction with the current weights
        score = values.dot(current_weights[indices])
        p = sigmoid(score)

        # add to gradient accumulator
        error = p - target
        gradient[indices] += values * error

        # count losses; clipping keeps both logarithms finite
        p = np.clip(p, 1e-15, 1-1e-15)
        log_likelihood = target * np.log(p) + (1 - target) * np.log(1 - p)
        loss.add(-log_likelihood)

    yield gradient

In [23]:
# number of examples
N = dataset.count()

In [24]:
from functools import partial
import numpy as np

# Hyperparameters of the training run
N_EPOCHS = 10
LEARNING_RATE = 0.05
SEED = 42

# random initial weights, seeded so that repeated runs start from the same point
weights = np.random.RandomState(SEED).random_sample(len(word_to_index))

# coalesce does not depend on the epoch, so the RDD is defined once
training_data = dataset.coalesce(2)  # merge 200 cached partitions into 2

# Gradient Descent Epoch
for i in range(N_EPOCHS):
    # ship the current weights to the executors; fresh loss counter per epoch
    weights_broadcast = sc.broadcast(weights)
    loss = sc.accumulator(0.0)

    # calculate the gradient: one partial sum per partition, added up by reduce
    gradient = (
        training_data
        .mapPartitions(partial(compute_gradient, weights_broadcast, loss))
        .reduce(lambda a, b: a + b)
    )

    # update the weights (the gradient is the sum over all examples, not the mean)
    weights -= LEARNING_RATE * gradient

    # the broadcast weights are now stale; release them
    weights_broadcast.destroy()

    # average loss per example
    print("epoch:", i, "loss:", loss.value / N)

epoch: 0 loss: 0.58625134286
epoch: 1 loss: 0.527839043397
epoch: 2 loss: 0.495466974821
epoch: 3 loss: 0.47081708081
epoch: 4 loss: 0.449695793506
epoch: 5 loss: 0.430857922738
epoch: 6 loss: 0.413814406772
epoch: 7 loss: 0.398299721855
epoch: 8 loss: 0.384125692498
epoch: 9 loss: 0.371138697296


In [25]:
# important words for American_movie_actors class
sorted([(weights[index], word) for word, index in word_to_index.items()])[-1:-15:-1]

[(11.100239587965177, 'in'),
 (7.6522764351610197, 'and'),
 (5.7339620979988899, 'she'),
 (5.3439673788711248, 'actor'),
 (4.3887261601815677, 'known'),
 (4.2384335148585421, 'born'),
 (3.9450777265003576, 'actress'),
 (3.8833970934648274, 'movie'),
 (3.4859693374264453, 'television'),
 (3.3010514645270699, 'is'),
 (3.0167908274737556, 'for'),
 (2.9851047406444557, 'her'),
 (2.939186703679157, 'an'),
 (2.8649770593271122, 'movies')]

In [26]:
# important words for US_Democratic_Party_politicians class
sorted([(weights[index], word) for word, index in word_to_index.items()])[:15]

[(-7.2215606516983524, 'of'),
 (-4.4236326599835056, 'politician'),
 (-3.7006406410501556, 'from'),
 (-2.787617534816401, 'to'),
 (-2.3817626414285775, 'democratic'),
 (-2.3511756609059202, 'states'),
 (-1.9544975672977596, 'he'),
 (-1.9125664117701597, 'united'),
 (-1.8476226873040478, 'governor'),
 (-1.7735524217387413, 'member'),
 (-1.7617972029084346, 'served'),
 (-1.6199405046473707, 'party'),
 (-1.5605207418086355, 'house'),
 (-1.3142585335306527, 'the'),
 (-1.2714361492602533, 'u')]