<a href="https://colab.research.google.com/github/smalaboy/projet_ter/blob/main/twitter_topics_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !wget -O twitter_training2.zip https://drive.google.com/u/0/uc?id=1EPin6POZkj1S4xBhRv_EYHmlCF_fO9tk&export=download

In [2]:
!ls

drive	     spark-3.2.1-bin-hadoop2.7	    training.csv
sample_data  spark-3.2.1-bin-hadoop2.7.tgz


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz
!tar xf spark-3.2.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop2.7"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [5]:
!ls

drive	     spark-3.2.1-bin-hadoop2.7	    spark-3.2.1-bin-hadoop2.7.tgz.1
sample_data  spark-3.2.1-bin-hadoop2.7.tgz  training.csv


In [4]:
!unzip /content/drive/MyDrive/datasets/twitter_training.zip

Archive:  /content/drive/MyDrive/datasets/twitter_training.zip
  inflating: training.csv            


In [5]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType

In [6]:
schema = StructType()\
              .add("No", IntegerType(), True)\
              .add("id", StringType(), True)\
              .add("datetime", StringType(), True)\
              .add("query", StringType(), True)\
              .add("user", StringType(), True)\
              .add("text", StringType(), True)
dataset = spark.read.csv('training.csv', schema=schema)
dataset.printSchema()
dataset.head(5)

root
 |-- No: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- query: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)



[Row(No=0, id='1467810369', datetime='Mon Apr 06 22:19:45 PDT 2009', query='NO_QUERY', user='_TheSpecialOne_', text="@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"),
 Row(No=0, id='1467810672', datetime='Mon Apr 06 22:19:49 PDT 2009', query='NO_QUERY', user='scotthamilton', text="is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"),
 Row(No=0, id='1467810917', datetime='Mon Apr 06 22:19:53 PDT 2009', query='NO_QUERY', user='mattycus', text='@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'),
 Row(No=0, id='1467811184', datetime='Mon Apr 06 22:19:57 PDT 2009', query='NO_QUERY', user='ElleCTF', text='my whole body feels itchy and like its on fire '),
 Row(No=0, id='1467811193', datetime='Mon Apr 06 22:19:57 PDT 2009', query='NO_QUERY', user='Karoli', text="@nationwideclass no, it's not behaving at all. i'm mad. why am 

**Preprocessing tweets**

In [7]:
import re
from pyspark.sql.functions import udf
from pyspark.sql.functions import to_timestamp
import pyspark.sql.types as T
from pyspark.ml.feature import StopWordsRemover


In [8]:
# Removes twitter handles
def remove_users(tweet):
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) 
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) 
    return tweet

In [9]:
# Removes punctuation
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@â'
def remove_punctuation(tweet):
    tweet = re.sub(f'[{punctuation}]+', ' ', tweet) 
    return tweet

In [10]:
# Removes numbers
def remove_number(tweet):
    tweet = re.sub('([0-9]+)', '', tweet) 
    return tweet

In [11]:
# Removes hastags
def remove_hashtag(tweet):
    tweet = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) 
    return tweet

In [12]:
def remove_links(tweet):
    tweet = re.sub(r'http\S+', '', tweet) 
    tweet = re.sub(r'bit.ly/\S+', '', tweet) 
    tweet = tweet.strip('[link]') 
    return tweet

In [13]:
# User defined functions registration
remove_users=udf(remove_users)
remove_punctuation=udf(remove_punctuation)
remove_number=udf(remove_number)
remove_hashtag=udf(remove_hashtag)
remove_links=udf(remove_links)

In [14]:
processed_tweets_1 = dataset.withColumn('processed_text', remove_users(dataset.text))
processed_tweets_1 = processed_tweets_1.withColumn('processed_text', remove_punctuation(processed_tweets_1.processed_text))
processed_tweets_1 = processed_tweets_1.withColumn('processed_text', remove_number(processed_tweets_1.processed_text))
processed_tweets_1 = processed_tweets_1.withColumn('processed_text', remove_hashtag(processed_tweets_1.processed_text))
processed_tweets_1 = processed_tweets_1.withColumn('processed_text', remove_links(processed_tweets_1.processed_text))

In [27]:
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
# print(stopwords.words('english')[:20])
stopwords_list = list(set(StopWordsRemover().getStopWords() + stopwords.words('english') + ["http", "https"]))
print(len(stopwords_list), stopwords_list[:20])
stopWordsRemover = StopWordsRemover(stopWords=stopwords_list)
# print(StopWordsRemover().getStopWords())

213 ['was', "let's", 'from', 'how', 'all', 'same', 'aren', "it's", 'down', 'needn', 'after', 'an', "weren't", 'myself', 'through', 'own', 'him', 'whom', 're', "doesn't"]


In [20]:
processed_tweets_1.printSchema()
processed_tweets_1.head(5)

root
 |-- No: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- query: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)
 |-- processed_text: string (nullable = true)



[Row(No=0, id='1467810369', datetime='Mon Apr 06 22:19:45 PDT 2009', query='NO_QUERY', user='_TheSpecialOne_', text="@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D", processed_text=' http twitpic com yzl   Awww  that s a bummer   You shoulda got David Carr of Third Day to do it   D'),
 Row(No=0, id='1467810672', datetime='Mon Apr 06 22:19:49 PDT 2009', query='NO_QUERY', user='scotthamilton', text="is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", processed_text='s upset that he can t update his Facebook by texting it  and might cry as a result  School today also  Blah '),
 Row(No=0, id='1467810917', datetime='Mon Apr 06 22:19:53 PDT 2009', query='NO_QUERY', user='mattycus', text='@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds', processed_text=' I dived many times for the ball  Managed to save    The rest go out o

In [21]:
from pyspark.ml.feature import RegexTokenizer

In [24]:
# Tokenize and filter out words with len < 3
tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(4).setInputCol("processed_text").setOutputCol("tokens")

In [25]:
tokenized_tweets = tokenizer.transform(processed_tweets_1)

In [26]:
tokenized_tweets.printSchema()
tokenized_tweets.head(5)

root
 |-- No: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- query: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)
 |-- processed_text: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



[Row(No=0, id='1467810369', datetime='Mon Apr 06 22:19:45 PDT 2009', query='NO_QUERY', user='_TheSpecialOne_', text="@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D", processed_text=' http twitpic com yzl   Awww  that s a bummer   You shoulda got David Carr of Third Day to do it   D', tokens=['http', 'twitpic', 'awww', 'that', 'bummer', 'shoulda', 'david', 'carr', 'third']),
 Row(No=0, id='1467810672', datetime='Mon Apr 06 22:19:49 PDT 2009', query='NO_QUERY', user='scotthamilton', text="is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", processed_text='s upset that he can t update his Facebook by texting it  and might cry as a result  School today also  Blah ', tokens=['upset', 'that', 'update', 'facebook', 'texting', 'might', 'result', 'school', 'today', 'also', 'blah']),
 Row(No=0, id='1467810917', datetime='Mon Apr 06 22:19:53 PDT 2009', query='NO_QUERY

In [29]:
# Stopwords removal
stopWordsRemover.setInputCol("tokens")
stopWordsRemover.setOutputCol("final_tokens")
tokenized_tweets_2 = stopWordsRemover.transform(tokenized_tweets)

In [30]:
tokenized_tweets_2.printSchema()
tokenized_tweets_2.head(5)

root
 |-- No: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- query: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)
 |-- processed_text: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- final_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



[Row(No=0, id='1467810369', datetime='Mon Apr 06 22:19:45 PDT 2009', query='NO_QUERY', user='_TheSpecialOne_', text="@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D", processed_text=' http twitpic com yzl   Awww  that s a bummer   You shoulda got David Carr of Third Day to do it   D', tokens=['http', 'twitpic', 'awww', 'that', 'bummer', 'shoulda', 'david', 'carr', 'third'], final_tokens=['twitpic', 'awww', 'bummer', 'shoulda', 'david', 'carr', 'third']),
 Row(No=0, id='1467810672', datetime='Mon Apr 06 22:19:49 PDT 2009', query='NO_QUERY', user='scotthamilton', text="is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", processed_text='s upset that he can t update his Facebook by texting it  and might cry as a result  School today also  Blah ', tokens=['upset', 'that', 'update', 'facebook', 'texting', 'might', 'result', 'school', 'today', 'also', 'blah'], fina

In [31]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [32]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize(row):
    row = [lemmatizer.lemmatize(word,'v') for word in row]
    return row

lemmatization_udf = udf(lemmatize, T.ArrayType(T.StringType()))

In [33]:
tokenized_tweets=tokenized_tweets_2.withColumn('tokens_lemma', lemmatization_udf(tokenized_tweets_2['final_tokens']))

In [34]:
tokenized_tweets.printSchema()
tokenized_tweets.head(5)

root
 |-- No: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- query: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)
 |-- processed_text: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- final_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokens_lemma: array (nullable = true)
 |    |-- element: string (containsNull = true)



[Row(No=0, id='1467810369', datetime='Mon Apr 06 22:19:45 PDT 2009', query='NO_QUERY', user='_TheSpecialOne_', text="@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D", processed_text=' http twitpic com yzl   Awww  that s a bummer   You shoulda got David Carr of Third Day to do it   D', tokens=['http', 'twitpic', 'awww', 'that', 'bummer', 'shoulda', 'david', 'carr', 'third'], final_tokens=['twitpic', 'awww', 'bummer', 'shoulda', 'david', 'carr', 'third'], tokens_lemma=['twitpic', 'awww', 'bummer', 'shoulda', 'david', 'carr', 'third']),
 Row(No=0, id='1467810672', datetime='Mon Apr 06 22:19:49 PDT 2009', query='NO_QUERY', user='scotthamilton', text="is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", processed_text='s upset that he can t update his Facebook by texting it  and might cry as a result  School today also  Blah ', tokens=['upset', 'that', 'update', '

In [35]:
preprocessed_test_tweets = tokenized_tweets.sample(withReplacement=False, fraction=0.1)
preprocessed_test_tweets.write.json("preprocessed_test_tweets")

In [36]:
!tar cvf preprocessed_test_tweets.tar preprocessed_test_tweets

preprocessed_test_tweets/
preprocessed_test_tweets/.part-00000-6cea9b52-9afb-470f-8bd6-79ac7b024fdc-c000.json.crc
preprocessed_test_tweets/._SUCCESS.crc
preprocessed_test_tweets/part-00000-6cea9b52-9afb-470f-8bd6-79ac7b024fdc-c000.json
preprocessed_test_tweets/_SUCCESS
preprocessed_test_tweets/.part-00001-6cea9b52-9afb-470f-8bd6-79ac7b024fdc-c000.json.crc
preprocessed_test_tweets/part-00001-6cea9b52-9afb-470f-8bd6-79ac7b024fdc-c000.json


**Topic modelling with grid search for the number of topics**

In [37]:
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import CountVectorizer

In [38]:
countVectorizer = CountVectorizer()
countVectorizer.setInputCol("tokens_lemma")
countVectorizer.setOutputCol("features")
vectorizerModel = countVectorizer.fit(tokenized_tweets)
wordsVector = vectorizerModel.transform(tokenized_tweets)

In [39]:
wordsVector.show(5)

+---+----------+--------------------+--------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| No|        id|            datetime|   query|           user|                text|      processed_text|              tokens|        final_tokens|        tokens_lemma|            features|
+---+----------+--------------------+--------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...| http twitpic com...|[http, twitpic, a...|[twitpic, awww, b...|[twitpic, awww, b...|(262144,[56,277,5...|
|  0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|s upset that he c...|[upset, that, upd...|[upset, update, f...|[upset, update, f...|(262144,[6,55,141...|
|  0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       ma

In [29]:
# num_topics=range(3,8)
# models=[]
# log_likeli=[]
# log_perp=[]
# for num in num_topics:
#   lda = LDA(k=num, maxIter=50)
#   ldaModel = lda.fit(wordsVector)
#   models.append(ldaModel)
#   ll = ldaModel.logLikelihood(wordsVector)
#   lp = ldaModel.logPerplexity(wordsVector)
#   log_likeli.append(ll)
#   log_perp.append(lp)

In [30]:
# import pandas as pd
# import matplotlib.pyplot as plt
# plot_data=pd.DataFrame(list(zip(num_topics,log_likeli,log_perp)),
#             columns=['topics_num','logLikelihood','logPerplexity'])    

# plot_data.plot(x='topics_num',y='logLikelihood',kind = 'line')
# plt.show()

# plot_data.plot(x='topics_num',y='logPerplexity',kind = 'line')
# plt.show()


In [40]:
lda = LDA(k=6, maxIter=100)
lda_model = lda.fit(wordsVector)

In [41]:
# extracting vocabulary from CountVectorizer
vocabulary = vectorizerModel.vocabulary

# create topics based on LDA
lda_topics = lda_model.describeTopics()
lda_topics.show()


+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[15, 0, 3, 56, 14...|[0.02218071747784...|
|    1|[1, 0, 3, 2, 7, 8...|[0.01212319713823...|
|    2|[12, 2, 13, 88, 2...|[0.04607247712742...|
|    3|[4897, 6422, 751,...|[9.97018451255729...|
|    4|[5, 228, 938, 19,...|[0.17156044268459...|
|    5|[10, 61, 25, 570,...|[0.26999900621882...|
+-----+--------------------+--------------------+





In [44]:
topics_words = lda_topics.rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocabulary[idx] for idx in idx_list])\
       .collect()

for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word, ',', end='')
    print('')
    print("*"*25)
    
transformed = lda_model.transform(wordsVector)
transformed.show(10)

topic: 0
*************************
feel ,good ,like ,twitpic ,make ,hurt ,today ,want ,morning ,rain ,
*************************
topic: 1
*************************
go ,good ,like ,work ,time ,think ,love ,know ,really ,today ,
*************************
topic: 2
*************************
miss ,work ,back ,bore ,home ,love ,today ,come ,school ,phone ,
*************************
topic: 3
*************************
mobypicture ,yesh ,tumblr ,mein ,merge ,orkut ,dolly ,mich ,nahi ,naruto ,
*************************
topic: 4
*************************
quot ,tinyurl ,hannah ,watch ,diversity ,montana ,free ,goodmorning ,angels ,google ,
*************************
topic: 5
*************************
thank ,follow ,much ,appreciate ,shout ,blip ,mention ,friday ,follower ,retweet ,
*************************
+---+----------+--------------------+--------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------

In [45]:
transformed = lda_model.transform(wordsVector)
transformed.show(10)

+---+----------+--------------------+--------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| No|        id|            datetime|   query|           user|                text|      processed_text|              tokens|        final_tokens|        tokens_lemma|            features|   topicDistribution|
+---+----------+--------------------+--------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...| http twitpic com...|[http, twitpic, a...|[twitpic, awww, b...|[twitpic, awww, b...|(262144,[56,277,5...|[0.13566244276979...|
|  0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|s upset that he c...|[upset, that, upd...|[upset, update, f...|[upset, update

In [42]:
lda_model.save("lda_model2")

In [43]:
!tar cvf lda_model2.tar lda_model2

lda_model2/
lda_model2/data/
lda_model2/data/._SUCCESS.crc
lda_model2/data/part-00000-79228ce9-77b6-49a0-ac51-3488e36c120a-c000.snappy.parquet
lda_model2/data/_SUCCESS
lda_model2/data/.part-00000-79228ce9-77b6-49a0-ac51-3488e36c120a-c000.snappy.parquet.crc
lda_model2/metadata/
lda_model2/metadata/._SUCCESS.crc
lda_model2/metadata/part-00000
lda_model2/metadata/_SUCCESS
lda_model2/metadata/.part-00000.crc


In [40]:
wordsVector.write.json("processed_data")

In [41]:
!tar cvf processed_data.tar processed_data

processed_data/
processed_data/part-00000-b0ac3ea1-1b94-42e4-a463-cd009ba86bdc-c000.json
processed_data/part-00001-b0ac3ea1-1b94-42e4-a463-cd009ba86bdc-c000.json
processed_data/.part-00000-b0ac3ea1-1b94-42e4-a463-cd009ba86bdc-c000.json.crc
processed_data/.part-00001-b0ac3ea1-1b94-42e4-a463-cd009ba86bdc-c000.json.crc
processed_data/._SUCCESS.crc
processed_data/_SUCCESS


In [49]:
!ls

drive		preprocessed_test_tweets      spark-3.2.1-bin-hadoop2.7
lda_model2	preprocessed_test_tweets.tar  spark-3.2.1-bin-hadoop2.7.tgz
lda_model2.tar	sample_data		      training.csv


In [10]:
from pyspark.ml.clustering import LocalLDAModel, LDAModel, LDA

In [21]:
!tar xvf processed_data.tar

processed_data/
processed_data/part-00000-b0ac3ea1-1b94-42e4-a463-cd009ba86bdc-c000.json
processed_data/part-00001-b0ac3ea1-1b94-42e4-a463-cd009ba86bdc-c000.json
processed_data/.part-00000-b0ac3ea1-1b94-42e4-a463-cd009ba86bdc-c000.json.crc
processed_data/.part-00001-b0ac3ea1-1b94-42e4-a463-cd009ba86bdc-c000.json.crc
processed_data/._SUCCESS.crc
processed_data/_SUCCESS


In [11]:
loaded_model = LocalLDAModel.load("lda_model")

In [22]:
loaded_df = spark.read.json('processed_data')

In [23]:
loaded_df.show(5)

+---+--------------------+--------------------+----------+--------------------+--------+--------------------+--------------------+--------------------+---------------+
| No|            datetime|            features|        id|      processed_text|   query|                text|              tokens|        tokens_lemma|           user|
+---+--------------------+--------------------+----------+--------------------+--------+--------------------+--------------------+--------------------+---------------+
|  0|Mon Apr 06 22:19:...|{[2, 6, 7, 17, 24...|1467810369| http twitpic com...|NO_QUERY|@switchfoot http:...|[http, twitpic, c...|[http, twitpic, c...|_TheSpecialOne_|
|  0|Mon Apr 06 22:19:...|{[1, 7, 13, 25, 1...|1467810672|s upset that he c...|NO_QUERY|is upset that he ...|[upset, that, can...|[upset, that, can...|  scotthamilton|
|  0|Mon Apr 06 22:19:...|{[0, 5, 21, 27, 2...|1467810917| I dived many tim...|NO_QUERY|@Kenichan I dived...|[dived, many, tim...|[dive, many, time...|       ma

In [24]:
test_df = loaded_df.sample(withReplacement=False, fraction=0.1)

In [25]:
test_df.write.json('test_df')

In [26]:
!tar cvf test_df.tar test_df

test_df/
test_df/.part-00003-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json.crc
test_df/.part-00005-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json.crc
test_df/part-00005-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json
test_df/.part-00006-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json.crc
test_df/part-00001-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json
test_df/._SUCCESS.crc
test_df/part-00000-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json
test_df/part-00004-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json
test_df/.part-00000-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json.crc
test_df/part-00003-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json
test_df/.part-00001-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json.crc
test_df/part-00006-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json
test_df/.part-00002-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json.crc
test_df/part-00002-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json
test_df/.part-00004-b23be4ea-b61d-4f52-b20e-cf5aa86ed581-c000.json.crc
test_df/_S

In [34]:
test_df.select(['features']).head(5)

[Row(features=Row(indices=[2061, 205972], size=262144, type=0, values=[1.0, 1.0])),
 Row(features=Row(indices=[1, 9, 16, 54, 71, 128, 271, 281, 2779], size=262144, type=0, values=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])),
 Row(features=Row(indices=[0, 1, 5, 11, 15, 17, 20, 64, 147, 191, 243, 360, 593, 616, 1445], size=262144, type=0, values=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0])),
 Row(features=Row(indices=[7, 17, 18, 36, 73, 102, 110, 173, 314, 546, 578, 1446], size=262144, type=0, values=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0])),
 Row(features=Row(indices=[0, 4, 5, 40, 88, 132, 185, 351, 440, 506, 871, 13259], size=262144, type=0, values=[3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))]

In [29]:
test_prediction = loaded_model.transform(test_df)

IllegalArgumentException: ignored

In [27]:
test_prediction.select(['text', 'features', 'topicDistribution']).show(5)

NameError: ignored