# Twitter Sentiment Analysis for the word Euro

Connect Spark to Mongo DB

In [151]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars "/usr/local/spark/jars/mongo-spark-connector_2.12-3.0.2.jar,/usr/local/spark/jars/mongo-java-driver-3.12.9.jar" pyspark-shell'

Note the inferschema is set to false, this makes spark read the entire database and not infer the values of fields from the first set of fields

In [152]:
from pyspark.sql import SparkSession
# fix read bug, basically turn off sampling
spark = SparkSession.builder.appName("TwitterMongo") \
.config("spark.mongodb.input.database", "mongodb://localhost:27017/twitter") \
.config("spark.mongodb.input.uri", "mongodb://localhost:27017/twitter.tweets") \
.config("spark.mongodb.read.sql.inferSchema.mapTypes.enabled", "FALSE") \
.config("spark.mongodb.output.uri","mongodb://localhost:27017/twitter.tweets").getOrCreate()

### Create the Session

And load all of the Twitter data in MongoDB

Print out the twitter tweet schema

In [153]:
# create a spark session
spark = SparkSession \
.builder \
.master("local") \
.appName("ABC") \
.config("spark.driver.memory", "15g") \
.config("spark.mongodb.read.connection.uri", "mongodb://localhost:27017/twitter") \
.config("spark.mongodb.write.connection.uri", "mongodb://localhost:27017/twitter") \
.config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector:2.12-3.0.2') \
.getOrCreate()
# read data from mongodb collection "questions" into a dataframe "df"
df = spark.read \
.format("com.mongodb.spark.sql.DefaultSource") \
.option("uri", "mongodb://localhost:27017/twitter") \
.option("database", "twitter") \
.option("collection", "tweets") \
.load()
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- contributors: void (nullable = true)
 |-- coordinates: void (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: integer (containsNull = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |    |-- title: string (

In [154]:
df.show()

2023-05-14 20:39:32,700 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1325.3 KiB
[Stage 443:>                                                        (0 + 1) / 1]

+--------------------+------------+-----------+--------------------+------------------+--------------------+--------------------+--------------------+--------------+---------+------------+----+-------------------+-------------------+-----------------------+---------------------+-------------------------+-------------------+-----------------------+---------------+----+-----+------------------+-----------+--------------------+-------------------+--------------------+-----------------------+-----------+-------------+---------+--------------------+--------------------+-------------------------------+-------------+---------+--------------------+---------------------+
|                 _id|contributors|coordinates|          created_at|display_text_range|            entities|   extended_entities|      extended_tweet|favorite_count|favorited|filter_level| geo|                 id|             id_str|in_reply_to_screen_name|in_reply_to_status_id|in_reply_to_status_id_str|in_reply_to_user_id|in_r

                                                                                

Create a spark object of the tweets held in the mongo db 

It is easier to use SQL statements and Pyspark to clean the data rather than writing queries in MongoDB

In [155]:
df.createOrReplaceTempView("tweets")

How many tweets in the DB all together

In [156]:
df = spark.sql("SELECT DISTINCT id FROM tweets")
df.count()

                                                                                

825219

How many tweets by language 


In [157]:
#pip install plotly
import pyspark.pandas as ps
import plotly
dfLang = spark.sql("SELECT DISTINCT lang, CAST(count(id) AS INT) as TweetCount FROM tweets GROUP BY lang \
                   ORDER BY TweetCount DESC LIMIT 10")
dfLang.show()



+----+----------+
|lang|TweetCount|
+----+----------+
|  en|    256216|
|  es|    122069|
|  it|     75037|
|  tr|     72788|
|  fr|     69589|
|  de|     61316|
|  nl|     37689|
| und|     30173|
|  pt|     29393|
|  pl|     18116|
+----+----------+



                                                                                

In [158]:
tempdf = ps.DataFrame(dfLang)

tempdf.plot(kind='bar', x='lang', y='TweetCount')

2023-05-14 20:39:41,418 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:39:41,419 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:39:44,226 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:39:44,227 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

How many tweets by location

In [159]:
dfLoc = spark.sql("SELECT DISTINCT user.location AS Location, CAST(count(id) AS INT) as TweetCount FROM tweets GROUP BY user.location \
                   ORDER BY TweetCount DESC LIMIT 10")
dfLoc.show()



+-----------------+----------+
|         Location|TweetCount|
+-----------------+----------+
|             null|    344908|
|İstanbul, Türkiye|      4911|
|           France|      4338|
|           Italia|      3980|
|           España|      3908|
|           London|      3496|
|          Türkiye|      3374|
|      Deutschland|      2791|
|    Paris, France|      2710|
|  London, England|      2690|
+-----------------+----------+



                                                                                

In [160]:
tempdf = ps.DataFrame(dfLoc)

tempdf.plot(kind='bar', x='Location', y='TweetCount')

2023-05-14 20:40:04,140 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:40:04,141 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:40:19,257 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:40:19,258 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

Now limit the dataset to English texts and tweets with the Euro in the text

In [161]:
dfEnTwt = spark.sql("SELECT * FROM tweets WHERE lang = 'en' AND text LIKE '%euro%'")
dfEnTwt.createOrReplaceTempView("en_tweets")
dfEnTwt.show()

2023-05-14 20:40:22,155 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 1328.5 KiB
[Stage 464:>                                                        (0 + 1) / 1]

+--------------------+------------+-----------+--------------------+------------------+--------------------+--------------------+--------------------+--------------+---------+------------+----+-------------------+-------------------+-----------------------+---------------------+-------------------------+-------------------+-----------------------+---------------+----+-----+------------------+-----------+--------------------+-------------------+--------------------+-----------------------+-----------+-------------+---------+--------------------+--------------------+--------------------+-------------+---------+--------------------+---------------------+
|                 _id|contributors|coordinates|          created_at|display_text_range|            entities|   extended_entities|      extended_tweet|favorite_count|favorited|filter_level| geo|                 id|             id_str|in_reply_to_screen_name|in_reply_to_status_id|in_reply_to_status_id_str|in_reply_to_user_id|in_reply_to_use

                                                                                

Now how many tweets in the English language dataset

In [162]:
dfEnTwt.count()

                                                                                

25818

In [163]:
dfLoc = spark.sql("SELECT DISTINCT user.location AS Location, CAST(count(id) AS INT) as TweetCount FROM en_tweets GROUP BY user.location \
                   ORDER BY TweetCount DESC LIMIT 10")
dfLoc.show()



+--------------------+----------+
|            Location|TweetCount|
+--------------------+----------+
|                null|     10214|
|               Spain|       413|
|     London, England|       178|
|      United Kingdom|       152|
|             Ireland|       140|
|       Costa del Sol|       124|
|       United States|       121|
|             she/her|       114|
|              London|       107|
|England, United K...|       106|
+--------------------+----------+



                                                                                

In [164]:
# select tweet id, geo, lang, quoted_status,quoted_status.geo 
# Having a look at some the data
dfOne = spark.sql("SELECT DISTINCT id,  text, quote_count, reply_count, retweet_count, favorite_count, geo, place, lang, \
                  quoted_status,quoted_status.geo, quoted_status.text, user.name, user.location   \
                  FROM en_tweets")

In [166]:

dfDay= spark.sql("SELECT DISTINCT CAST(substring(created_at, 27, 4) AS INT) as Year, \
          CAST(from_unixtime(unix_timestamp(substring(created_at, 5, 3), 'MMM'), 'MM') As INT) as Month, \
          CAST(substring(created_at, 9, 2) AS INT) as Day, \
          CAST(count(id) AS INT) as TweetCount \
          FROM en_tweets \
          WHERE lang = 'en' AND text LIKE '%euro%' GROUP BY substring(created_at, 27, 4), \
          substring(created_at, 5, 3), \
          substring(created_at, 9, 2)")

dfDay.createOrReplaceTempView("tweetsByDay")

dfDay = spark.sql("SELECT CONCAT(Year, '_', Month, '_', DAY) AS Date, TweetCount  FROM tweetsByDay ORDER BY Year, Month, Day")

dfDay.show()




+---------+----------+
|     Date|TweetCount|
+---------+----------+
| 2021_1_1|        24|
| 2021_1_2|        21|
| 2021_1_3|        24|
| 2021_1_4|        53|
| 2021_1_5|        24|
| 2021_1_6|        14|
|2021_1_26|        31|
|2021_1_27|        38|
|2021_1_28|        33|
|2021_1_29|        38|
|2021_1_30|        16|
|2021_1_31|        38|
| 2021_2_1|        23|
| 2021_2_2|        38|
| 2021_2_3|        34|
| 2021_2_4|        35|
| 2021_2_5|        25|
| 2021_2_6|        19|
| 2021_2_7|        32|
| 2021_2_8|        26|
+---------+----------+
only showing top 20 rows



                                                                                

In [167]:
tempdf = ps.DataFrame(dfDay)

tempdf.plot(kind='bar', x='Date', y='TweetCount')

2023-05-14 20:41:14,858 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:14,859 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:18,067 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:18,135 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:18,266 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:18,280 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

In [168]:
dfWeek = spark.sql("SELECT Year, weekofyear(make_date(Year, Month, Day)) as wkofYr , SUM(TweetCount) as TweetCountbyWeek \
                   FROM tweetsByDay \
                   GROUP BY Year, weekofyear(make_date(Year, Month, Day))")

dfWeek.createOrReplaceTempView("tweetsByWeek")
dfWeek = spark.sql("SELECT Year, wkofYr, CONCAT(Year, '_', wkofYr) AS yr_wk, TweetCountbyWeek  FROM tweetsByWeek ORDER BY Year, wkofYr")
dfWeek.show()



+----+------+-------+----------------+
|Year|wkofYr|  yr_wk|TweetCountbyWeek|
+----+------+-------+----------------+
|2021|     1| 2021_1|              91|
|2021|     4| 2021_4|             194|
|2021|     5| 2021_5|             206|
|2021|     6| 2021_6|             194|
|2021|     7| 2021_7|             212|
|2021|     8| 2021_8|             192|
|2021|     9| 2021_9|             137|
|2021|    10|2021_10|             166|
|2021|    11|2021_11|             165|
|2021|    12|2021_12|             212|
|2021|    13|2021_13|             169|
|2021|    14|2021_14|             138|
|2021|    17|2021_17|             128|
|2021|    18|2021_18|             225|
|2021|    19|2021_19|             145|
|2021|    20|2021_20|             264|
|2021|    21|2021_21|             297|
|2021|    22|2021_22|             221|
|2021|    23|2021_23|             486|
|2021|    24|2021_24|             437|
+----+------+-------+----------------+
only showing top 20 rows



                                                                                

In [169]:
tempdf = ps.DataFrame(dfWeek)

tempdf.plot(kind='bar', x='yr_wk', y='TweetCountbyWeek')

2023-05-14 20:41:21,830 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:21,831 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:25,302 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:25,340 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:25,370 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:25,421 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

In [170]:
# get the tweet count by month
dfMonth = spark.sql("SELECT Year, Month, SUM(TweetCount) as TweetCountbyMonth \
                     FROM tweetsByDay GROUP BY Year, Month")
dfMonth.createOrReplaceTempView("tweetsByMonth")
dfMonth = spark.sql("SELECT Year, Month, CONCAT(Year, '_', Month) AS MonthYr, TweetCountbyMonth  FROM tweetsByMonth ORDER BY Year, Month ")
dfMonth.show()



+----+-----+-------+-----------------+
|Year|Month|MonthYr|TweetCountbyMonth|
+----+-----+-------+-----------------+
|2021|    1| 2021_1|              354|
|2021|    2| 2021_2|              804|
|2021|    3| 2021_3|              756|
|2021|    4| 2021_4|              324|
|2021|    5| 2021_5|              995|
|2021|    6| 2021_6|             2354|
|2021|    7| 2021_7|             4098|
|2021|    8| 2021_8|              767|
|2021|    9| 2021_9|              806|
|2021|   10|2021_10|              711|
|2021|   11|2021_11|              471|
|2021|   12|2021_12|              831|
|2022|    1| 2022_1|              437|
|2022|    2| 2022_2|              373|
|2022|    3| 2022_3|             1495|
|2022|    4| 2022_4|             1438|
|2022|    5| 2022_5|             1375|
|2022|    6| 2022_6|             1163|
|2022|    7| 2022_7|             2405|
|2022|    8| 2022_8|              937|
+----+-----+-------+-----------------+
only showing top 20 rows



                                                                                

In [171]:
tempdf = ps.DataFrame(dfMonth)

tempdf.plot(kind='bar', x='MonthYr', y='TweetCountbyMonth')

2023-05-14 20:41:28,924 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:28,925 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:32,029 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:32,073 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:32,099 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-05-14 20:41:32,139 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

### Next up is text clean up 

In [221]:
#%pip install wordcloud
#%pip install vadersentiment
## sentiment analysis ref: https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/?ref=lbp
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# function to print sentiments
# of the sentence.
def sentiment_scores(sentence):
 
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)
     
    print("Overall sentiment dictionary is : ", sentiment_dict)
    #print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
    #print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
    #print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
 
    #print("Sentence Overall Rated As", end = " ")
 
    # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >= 0.05 :
        print("Positive")
 
    elif sentiment_dict['compound'] <= - 0.05 :
       print("Negative")
 
    else :
        print("Neutral")
    
    return sentiment_dict['compound']

In [222]:
dfText = spark.sql("SELECT DISTINCT id, text AS text FROM en_tweets")
dfText.createOrReplaceTempView("text")
dfText.show()




+-------------------+--------------------+
|                 id|                text|
+-------------------+--------------------+
|1345422585002160149|I really like Wol...|
|1355814040275091467|RT @TheEconomist:...|
|1356840789104619521|Top story: Spotif...|
|1363083540854956034|RT @Letter_to_Jac...|
|1346785087887728640|It costs 0 $/rs/p...|
|1354474836723163139|RT @LauraHuhtasaa...|
|1354744404653858817|RT @MECenquiries:...|
|1359491081734389760|@nglinsman @amliv...|
|1359908297541885954|Daily Best &amp; ...|
|1361771357030252546|@gazzacritch86 We...|
|1362048634062307329|I actually feel b...|
|1362133958801039360|RT @ronniesfw: hi...|
|1355037259372883968|For Philly, euro ...|
|1356308640983089153|RT @PolaLem: Univ...|
|1362943363012509700|RT @SixersAdam: J...|
|1345412531263852545|A whooping 1,3 tr...|
|1355164539705290767|RT @stpaddyofassi...|
|1360073523746590723|RT @fab_moran: St...|
|1361644785501679619|RT @business: Air...|
|1362028937640042501|@euro_vieira You ...|
+----------

                                                                                

Count the number of words in the tweets 

In [223]:
from pyspark.sql.functions import * 
from pyspark.sql.types import StringType, ArrayType
# heavy reliance on SQL functions in the following code

dfWord = dfText.withColumn("Word", explode(split(col("text"), ' '))).groupBy("Word").count().orderBy(desc("count"))

dfWord.show()



+------------+-----+
|        Word|count|
+------------+-----+
|         the|15923|
|          RT|13707|
|        euro|13091|
|          to| 8480|
|           a| 7288|
|         and| 6068|
|          in| 6065|
|          of| 5711|
|         for| 5186|
|          is| 4801|
|          on| 3108|
|@vivo_europe| 2993|
|   @EURO2020| 2992|
|         you| 2651|
|           I| 2525|
|        that| 2101|
|         The| 2065|
|          at| 2047|
|          by| 1838|
|            | 1804|
+------------+-----+
only showing top 20 rows



                                                                                

Count the number of characters including spaces

In [224]:
dfChar = spark.sql("SELECT text, LENGTH(text) AS char FROM text ORDER BY char DESC")
dfChar.show(5)



+--------------------+----+
|                text|char|
+--------------------+----+
|RT @ChemSystemsCh...| 156|
|Internationals (w...| 155|
|RT @GA_Contest: $...| 152|
|RT @eurociu: Voca...| 152|
|RT @trade1311: pe...| 152|
+--------------------+----+
only showing top 5 rows



                                                                                

Check for special characters i.e. Hashtags

In [225]:
dfSpecChar = spark.sql("SELECT text, regexp_extract_all(text, '(#\\\\w+)', 1) AS Hashtags FROM text WHERE text like '%#%' ")
dfSpecChar.show()

[Stage 664:>                                                        (0 + 1) / 1]

+--------------------+--------------------+
|                text|            Hashtags|
+--------------------+--------------------+
|Daily Best &amp; ...|[#performers, #co...|
|Are you intereste...|[#euro, #microbio...|
|No reprieve for t...|  [#forex, #trading]|
|RT @KfW_int: Two-...|           [#Europe]|
|RT @eurovanya: Mo...|       [#Eurovision]|
|RT @scotlandcoerv...|         [#euro2020]|
|RT @nanon_diary: ...|[#MyPrecious, #my...|
|RT @EXOLAceTeamPH...|[#DONT_FIGHT_THE_...|
|RT @nexta_tv: 🇱?...|[#Lithuania, #Ukr...|
|@EURO2020 @vivo_e...|[#BTS, #Butter, #...|
|RT @EURO2020: For...|         [#EURO2020]|
|@WEURO2022 @euron...|        [#WEURO2022]|
|RT @euro_pinkploy...|  [#KAZZ2022xEUPINK]|
|RT @stasisnet: 💶...|       [#euro, #sta]|
|RT @stasisnet: 💶...|       [#euro, #sta]|
|RT @stasisnet: 💶...|       [#euro, #sta]|
|RT @stasisnet: 💶...|       [#euro, #sta]|
|RT @classic_jerse...|[#COYBIG, #ybig, ...|
|#UN had a vote on...|      [#UN, #racism]|
|RT @euroweeklynew...|        [#World

                                                                                

Check for upper case 

In [226]:
dfUpper = spark.sql("SELECT text FROM text WHERE translate(text, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = ''")
dfUpper.show()




+----+
|text|
+----+
+----+



                                                                                

Check for numbers 



In [227]:
dfNum = spark.sql("SELECT text FROM text WHERE translate(text, '0123456789', '') <> text")
dfNum.show()



+--------------------+
|                text|
+--------------------+
|A whooping 1,3 tr...|
|@vileblackouts im...|
|Why have we not b...|
|I spend 15 euro's...|
|Woman In Portugal...|
|RT @euroweeklynew...|
|RT @WorldWideWob:...|
|RT @AntonSpisak: ...|
|I'm beginning to ...|
|“the bulk of trad...|
|@DataSciBurgoon I...|
|RT @WorldWideWob:...|
|RT @WorldWideWob:...|
|RT @WorldWideWob:...|
|RT @uk_domain_nam...|
|Bagley euro-step ...|
|@UxbEconomist07 @...|
|RT @europug2: Hug...|
|@agriclaudia Did ...|
|@DAVIDFARRANT8 @S...|
+--------------------+
only showing top 20 rows



                                                                                

This check does not really advance our understanding of the data, a lot of twitter names have numbers in them

In [228]:
dfNum.count()

                                                                                

18879

Leave only text in the strings, all non alpha numeric characters are removed with the application of the regular expression 

In [229]:

pattern = r'[^a-zA-Z0-9\s]'

dfText = spark.sql("SELECT id, text AS orignialText, text FROM text")
dfText = dfText.withColumn("text", regexp_replace('text', pattern, ''))
dfText.take(5)

                                                                                

[Row(id=1345422585002160149, orignialText="I really like Wolves this season, hoping they can push for euro's again", text='I really like Wolves this season hoping they can push for euros again'),
 Row(id=1355814040275091467, orignialText="RT @TheEconomist: The EU's €750bn recovery fund is being disbursed too slowly, so there is still too little stimulus in the euro area https…", text='RT TheEconomist The EUs 750bn recovery fund is being disbursed too slowly so there is still too little stimulus in the euro area https'),
 Row(id=1356840789104619521, orignialText='Top story: Spotify sube un euro el precio del plan familiar | Androidsis https://t.co/MvWqtmPC6W, see more https://t.co/DtoYNFGG07', text='Top story Spotify sube un euro el precio del plan familiar  Androidsis httpstcoMvWqtmPC6W see more httpstcoDtoYNFGG07'),
 Row(id=1346785087887728640, orignialText="It costs 0 $/rs/peso/euro/pound to admit that you're biased for a player. No one will send d you jail. Oh sorry my… https://t.co

In [230]:
dfText.createOrReplaceTempView("text") # this is cleaned txt
dfText = spark.sql("SELECT id, orignialText, LOWER(TRIM(text)) AS text FROM text")

dfText.take(5)

                                                                                

[Row(id=1345422585002160149, orignialText="I really like Wolves this season, hoping they can push for euro's again", text='i really like wolves this season hoping they can push for euros again'),
 Row(id=1355814040275091467, orignialText="RT @TheEconomist: The EU's €750bn recovery fund is being disbursed too slowly, so there is still too little stimulus in the euro area https…", text='rt theeconomist the eus 750bn recovery fund is being disbursed too slowly so there is still too little stimulus in the euro area https'),
 Row(id=1356840789104619521, orignialText='Top story: Spotify sube un euro el precio del plan familiar | Androidsis https://t.co/MvWqtmPC6W, see more https://t.co/DtoYNFGG07', text='top story spotify sube un euro el precio del plan familiar  androidsis httpstcomvwqtmpc6w see more httpstcodtoynfgg07'),
 Row(id=1346785087887728640, orignialText="It costs 0 $/rs/peso/euro/pound to admit that you're biased for a player. No one will send d you jail. Oh sorry my… https://t.co

In [231]:
#now build a word cloud
from wordcloud import WordCloud, STOPWORDS
dfText.createOrReplaceTempView("text") # this is the trimmed txt
dfText = spark.sql("SELECT id, orignialText, text FROM text")
dfText.show(5)



+-------------------+--------------------+--------------------+
|                 id|        orignialText|                text|
+-------------------+--------------------+--------------------+
|1345422585002160149|I really like Wol...|i really like wol...|
|1346785087887728640|It costs 0 $/rs/p...|it costs 0 rspeso...|
|1354474836723163139|RT @LauraHuhtasaa...|rt laurahuhtasaar...|
|1354744404653858817|RT @MECenquiries:...|rt mecenquiries s...|
|1355037259372883968|For Philly, euro ...|for philly euro i...|
+-------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

Fix the spelling

In [232]:
#%pip install textblob
from textblob import TextBlob

dfText = spark.sql("SELECT id, orignialText, text FROM text")
#dfText = dfText.withColumn("newtext", col(TextBlob("text").correct()))

dfText.show(5)
dfText.createOrReplaceTempView("text") # this is the preprocessed SQL Style txt



+-------------------+--------------------+--------------------+
|                 id|        orignialText|                text|
+-------------------+--------------------+--------------------+
|1345422585002160149|I really like Wol...|i really like wol...|
|1355814040275091467|RT @TheEconomist:...|rt theeconomist t...|
|1356840789104619521|Top story: Spotif...|top story spotify...|
|1363083540854956034|RT @Letter_to_Jac...|rt lettertojack p...|
|1477800154367291392|RT @gemhostoffici...|rt gemhostofficia...|
+-------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [233]:



dfText = spark.sql("SELECT * FROM text")

dfText.show(5)



+-------------------+--------------------+--------------------+
|                 id|        orignialText|                text|
+-------------------+--------------------+--------------------+
|1345422585002160149|I really like Wol...|i really like wol...|
|1346785087887728640|It costs 0 $/rs/p...|it costs 0 rspeso...|
|1354474836723163139|RT @LauraHuhtasaa...|rt laurahuhtasaar...|
|1345412531263852545|A whooping 1,3 tr...|a whooping 13 tri...|
|1345022138022064129|@engrare Don't fo...|engrare dont forg...|
+-------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

### Tokenize and Stem the tweets

In [234]:
#%pip install nltk

In [235]:
# stackoverflow ref: https://stackoverflow.com/questions/53579444

from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from nltk.stem.snowball import SnowballStemmer

In [236]:
# Tokenize text
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
dfTextTok = tokenizer.transform(dfText).select("text","tokens")
dfText = dfText.join(dfTextTok, on=['text'], how='left_outer')
dfText.show(5)



+--------------------+-------------------+--------------------+--------------------+
|                text|                 id|        orignialText|              tokens|
+--------------------+-------------------+--------------------+--------------------+
|it costs 0 rspeso...|1346785087887728640|It costs 0 $/rs/p...|[it, costs, 0, rs...|
|rt laurahuhtasaar...|1354474836723163139|RT @LauraHuhtasaa...|[rt, laurahuhtasa...|
|engrare dont forg...|1345022138022064129|@engrare Don't fo...|[engrare, dont, f...|
|a whooping 13 tri...|1345412531263852545|A whooping 1,3 tr...|[a, whooping, 13,...|
|i really like wol...|1345422585002160149|I really like Wol...|[i, really, like,...|
+--------------------+-------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [237]:
# Remove stopwords
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
dfText = remover.transform(dfText).select("text","tokens","filtered")
dfText.show(5)



+--------------------+--------------------+--------------------+
|                text|              tokens|            filtered|
+--------------------+--------------------+--------------------+
|it costs 0 rspeso...|[it, costs, 0, rs...|[costs, 0, rspeso...|
|rt theeconomist t...|[rt, theeconomist...|[rt, theeconomist...|
|rt theeconomist t...|[rt, theeconomist...|[rt, theeconomist...|
|rt laurahuhtasaar...|[rt, laurahuhtasa...|[rt, laurahuhtasa...|
|rt lettertojack p...|[rt, lettertojack...|[rt, lettertojack...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

Now recheck for the most common words and decide if they need to be removed 

In [238]:
#dfWord = dfText.withColumn("Word", explode(split(col("filtered"), ' '))).groupBy("Word").count().orderBy(desc("count").limit(10))

#dfWord.show()

AnalysisException: cannot resolve 'split(filtered, ' ', -1)' due to data type mismatch: argument 1 requires string type, however, 'filtered' is of array<string> type.;
'Project [text#6752, tokens#6819, filtered#6901, explode(split(filtered#6901,  , -1)) AS Word#6932]
+- Project [text#6752, tokens#6819, filtered#6901]
   +- Project [text#6752, id#5061L, orignialText#6741, tokens#6819, UDF(tokens#6819) AS filtered#6901]
      +- Project [text#6752, id#5061L, orignialText#6741, tokens#6819]
         +- Join LeftOuter, (text#6752 = text#6867)
            :- Project [id#5061L, orignialText#6741, text#6752]
            :  +- SubqueryAlias text
            :     +- View (`text`, [id#5061L,orignialText#6741,text#6752])
            :        +- Project [id#5061L, orignialText#6741, text#6752]
            :           +- SubqueryAlias text
            :              +- View (`text`, [id#5061L,orignialText#6741,text#6752])
            :                 +- Project [id#5061L, orignialText#6741, lower(trim(text#6745, None)) AS text#6752]
            :                    +- SubqueryAlias text
            :                       +- View (`text`, [id#5061L,orignialText#6741,text#6745])
            :                          +- Project [id#5061L, orignialText#6741, regexp_replace(text#6643, [^a-zA-Z0-9\s], , 1) AS text#6745]
            :                             +- Project [id#5061L, text#6643 AS orignialText#6741, text#6643]
            :                                +- SubqueryAlias text
            :                                   +- View (`text`, [id#5061L,text#6643])
            :                                      +- Distinct
            :                                         +- Project [id#5061L, text#5082 AS text#6643]
            :                                            +- SubqueryAlias en_tweets
            :                                               +- View (`en_tweets`, [_id#5049,contributors#5050,coordinates#5051,created_at#5052,display_text_range#5053,entities#5054,extended_entities#5055,extended_tweet#5056,favorite_count#5057,favorited#5058,filter_level#5059,geo#5060,id#5061L,id_str#5062,in_reply_to_screen_name#5063,in_reply_to_status_id#5064L,in_reply_to_status_id_str#5065,in_reply_to_user_id#5066L,in_reply_to_user_id_str#5067,is_quote_status#5068,lang#5069,place#5070,possibly_sensitive#5071,quote_count#5072,quoted_status#5073,quoted_status_id#5074L,quoted_status_id_str#5075,quoted_status_permalink#5076,reply_count#5077,retweet_count#5078,retweeted#5079,retweeted_status#5080,source#5081,text#5082,timestamp_ms#5083,truncated#5084,user#5085,withheld_in_countries#5086])
            :                                                  +- Project [_id#5049, contributors#5050, coordinates#5051, created_at#5052, display_text_range#5053, entities#5054, extended_entities#5055, extended_tweet#5056, favorite_count#5057, favorited#5058, filter_level#5059, geo#5060, id#5061L, id_str#5062, in_reply_to_screen_name#5063, in_reply_to_status_id#5064L, in_reply_to_status_id_str#5065, in_reply_to_user_id#5066L, in_reply_to_user_id_str#5067, is_quote_status#5068, lang#5069, place#5070, possibly_sensitive#5071, quote_count#5072, ... 14 more fields]
            :                                                     +- Filter ((lang#5069 = en) AND text#5082 LIKE %euro%)
            :                                                        +- SubqueryAlias tweets
            :                                                           +- View (`tweets`, [_id#5049,contributors#5050,coordinates#5051,created_at#5052,display_text_range#5053,entities#5054,extended_entities#5055,extended_tweet#5056,favorite_count#5057,favorited#5058,filter_level#5059,geo#5060,id#5061L,id_str#5062,in_reply_to_screen_name#5063,in_reply_to_status_id#5064L,in_reply_to_status_id_str#5065,in_reply_to_user_id#5066L,in_reply_to_user_id_str#5067,is_quote_status#5068,lang#5069,place#5070,possibly_sensitive#5071,quote_count#5072,quoted_status#5073,quoted_status_id#5074L,quoted_status_id_str#5075,quoted_status_permalink#5076,reply_count#5077,retweet_count#5078,retweeted#5079,retweeted_status#5080,source#5081,text#5082,timestamp_ms#5083,truncated#5084,user#5085,withheld_in_countries#5086])
            :                                                              +- Relation [_id#5049,contributors#5050,coordinates#5051,created_at#5052,display_text_range#5053,entities#5054,extended_entities#5055,extended_tweet#5056,favorite_count#5057,favorited#5058,filter_level#5059,geo#5060,id#5061L,id_str#5062,in_reply_to_screen_name#5063,in_reply_to_status_id#5064L,in_reply_to_status_id_str#5065,in_reply_to_user_id#5066L,in_reply_to_user_id_str#5067,is_quote_status#5068,lang#5069,place#5070,possibly_sensitive#5071,quote_count#5072,... 14 more fields] MongoRelation(MongoRDD[1118] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(contributors,NullType,true), StructField(coordinates,NullType,true), StructField(created_at,StringType,true), StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(extended_tweet,StructType(StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(full_text,StringType,true)),true), StructField(favorite_count,IntegerType,true), StructField(favorited,BooleanType,true), StructField(filter_level,StringType,true), StructField(geo,NullType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(in_reply_to_screen_name,StringType,true), StructField(in_reply_to_status_id,LongType,true), StructField(in_reply_to_status_id_str,StringType,true), StructField(in_reply_to_user_id,LongType,true), StructField(in_reply_to_user_id_str,StringType,true), StructField(is_quote_status,BooleanType,true), StructField(lang,StringType,true), StructField(place,StructType(StructField(id,StringType,true), StructField(url,StringType,true), StructField(place_type,StringType,true), StructField(name,StringType,true), StructField(full_name,StringType,true), StructField(country_code,StringType,true), StructField(country,StringType,true), StructField(bounding_box,StructType(StructField(type,StringType,true), StructField(coordinates,ArrayType(ArrayType(ArrayType(DoubleType,true),true),true),true)),true)),true), StructField(possibly_sensitive,BooleanType,true), StructField(quote_count,IntegerType,true), StructField(quoted_status,StructType(StructField(contributors,NullType,true), StructField(coordinates,NullType,true), StructField(created_at,StringType,true), StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(extended_tweet,StructType(StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(full_text,StringType,true)),true), StructField(favorite_count,IntegerType,true), StructField(favorited,BooleanType,true), StructField(filter_level,StringType,true), StructField(geo,NullType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(in_reply_to_screen_name,StringType,true), StructField(in_reply_to_status_id,LongType,true), StructField(in_reply_to_status_id_str,StringType,true), StructField(in_reply_to_user_id,LongType,true), StructField(in_reply_to_user_id_str,StringType,true), StructField(is_quote_status,BooleanType,true), StructField(lang,StringType,true), StructField(place,StructType(StructField(id,StringType,true), StructField(url,StringType,true), StructField(place_type,StringType,true), StructField(name,StringType,true), StructField(full_name,StringType,true), StructField(country_code,StringType,true), StructField(country,StringType,true), StructField(bounding_box,StructType(StructField(type,StringType,true), StructField(coordinates,ArrayType(ArrayType(ArrayType(DoubleType,true),true),true),true)),true)),true), StructField(possibly_sensitive,BooleanType,true), StructField(quote_count,IntegerType,true), StructField(quoted_status_id,LongType,true), StructField(quoted_status_id_str,StringType,true), StructField(reply_count,IntegerType,true), StructField(retweet_count,IntegerType,true), StructField(retweeted,BooleanType,true), StructField(source,StringType,true), StructField(text,StringType,true), StructField(truncated,BooleanType,true), StructField(user,StructType(StructField(contributors_enabled,BooleanType,true), StructField(created_at,StringType,true), StructField(default_profile,BooleanType,true), StructField(default_profile_image,BooleanType,true), StructField(description,StringType,true), StructField(favourites_count,IntegerType,true), StructField(follow_request_sent,NullType,true), StructField(followers_count,IntegerType,true), StructField(following,NullType,true), StructField(friends_count,IntegerType,true), StructField(geo_enabled,BooleanType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(is_translator,BooleanType,true), StructField(lang,NullType,true), StructField(listed_count,IntegerType,true), StructField(location,StringType,true), StructField(name,StringType,true), StructField(notifications,NullType,true), StructField(profile_background_color,StringType,true), StructField(profile_background_image_url,StringType,true), StructField(profile_background_image_url_https,StringType,true), StructField(profile_background_tile,BooleanType,true), StructField(profile_banner_url,StringType,true), StructField(profile_image_url,StringType,true), StructField(profile_image_url_https,StringType,true), StructField(profile_link_color,StringType,true), StructField(profile_sidebar_border_color,StringType,true), StructField(profile_sidebar_fill_color,StringType,true), StructField(profile_text_color,StringType,true), StructField(profile_use_background_image,BooleanType,true), StructField(protected,BooleanType,true), StructField(screen_name,StringType,true), StructField(statuses_count,IntegerType,true), StructField(time_zone,NullType,true), StructField(translator_type,StringType,true), StructField(url,StringType,true), StructField(utc_offset,NullType,true), StructField(verified,BooleanType,true)),true)),true), StructField(quoted_status_id,LongType,true), StructField(quoted_status_id_str,StringType,true), StructField(quoted_status_permalink,StructType(StructField(url,StringType,true), StructField(expanded,StringType,true), StructField(display,StringType,true)),true), StructField(reply_count,IntegerType,true), StructField(retweet_count,IntegerType,true), StructField(retweeted,BooleanType,true), StructField(retweeted_status,StructType(StructField(contributors,NullType,true), StructField(coordinates,NullType,true), StructField(created_at,StringType,true), StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(extended_tweet,StructType(StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(full_text,StringType,true)),true), StructField(favorite_count,IntegerType,true), StructField(favorited,BooleanType,true), StructField(filter_level,StringType,true), StructField(geo,NullType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(in_reply_to_screen_name,StringType,true), StructField(in_reply_to_status_id,LongType,true), StructField(in_reply_to_status_id_str,StringType,true), StructField(in_reply_to_user_id,LongType,true), StructField(in_reply_to_user_id_str,StringType,true), StructField(is_quote_status,BooleanType,true), StructField(lang,StringType,true), StructField(place,StructType(StructField(id,StringType,true), StructField(url,StringType,true), StructField(place_type,StringType,true), StructField(name,StringType,true), StructField(full_name,StringType,true), StructField(country_code,StringType,true), StructField(country,StringType,true), StructField(bounding_box,StructType(StructField(type,StringType,true), StructField(coordinates,ArrayType(ArrayType(ArrayType(DoubleType,true),true),true),true)),true)),true), StructField(possibly_sensitive,BooleanType,true), StructField(quote_count,IntegerType,true), StructField(quoted_status,StructType(StructField(contributors,NullType,true), StructField(coordinates,NullType,true), StructField(created_at,StringType,true), StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(extended_tweet,StructType(StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(full_text,StringType,true)),true), StructField(favorite_count,IntegerType,true), StructField(favorited,BooleanType,true), StructField(filter_level,StringType,true), StructField(geo,NullType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(in_reply_to_screen_name,StringType,true), StructField(in_reply_to_status_id,LongType,true), StructField(in_reply_to_status_id_str,StringType,true), StructField(in_reply_to_user_id,LongType,true), StructField(in_reply_to_user_id_str,StringType,true), StructField(is_quote_status,BooleanType,true), StructField(lang,StringType,true), StructField(place,StructType(StructField(id,StringType,true), StructField(url,StringType,true), StructField(place_type,StringType,true), StructField(name,StringType,true), StructField(full_name,StringType,true), StructField(country_code,StringType,true), StructField(country,StringType,true), StructField(bounding_box,StructType(StructField(type,StringType,true), StructField(coordinates,ArrayType(ArrayType(ArrayType(DoubleType,true),true),true),true)),true)),true), StructField(possibly_sensitive,BooleanType,true), StructField(quote_count,IntegerType,true), StructField(quoted_status_id,LongType,true), StructField(quoted_status_id_str,StringType,true), StructField(reply_count,IntegerType,true), StructField(retweet_count,IntegerType,true), StructField(retweeted,BooleanType,true), StructField(source,StringType,true), StructField(text,StringType,true), StructField(truncated,BooleanType,true), StructField(user,StructType(StructField(contributors_enabled,BooleanType,true), StructField(created_at,StringType,true), StructField(default_profile,BooleanType,true), StructField(default_profile_image,BooleanType,true), StructField(description,StringType,true), StructField(favourites_count,IntegerType,true), StructField(follow_request_sent,NullType,true), StructField(followers_count,IntegerType,true), StructField(following,NullType,true), StructField(friends_count,IntegerType,true), StructField(geo_enabled,BooleanType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(is_translator,BooleanType,true), StructField(lang,NullType,true), StructField(listed_count,IntegerType,true), StructField(location,StringType,true), StructField(name,StringType,true), StructField(notifications,NullType,true), StructField(profile_background_color,StringType,true), StructField(profile_background_image_url,StringType,true), StructField(profile_background_image_url_https,StringType,true), StructField(profile_background_tile,BooleanType,true), StructField(profile_banner_url,StringType,true), StructField(profile_image_url,StringType,true), StructField(profile_image_url_https,StringType,true), StructField(profile_link_color,StringType,true), StructField(profile_sidebar_border_color,StringType,true), StructField(profile_sidebar_fill_color,StringType,true), StructField(profile_text_color,StringType,true), StructField(profile_use_background_image,BooleanType,true), StructField(protected,BooleanType,true), StructField(screen_name,StringType,true), StructField(statuses_count,IntegerType,true), StructField(time_zone,NullType,true), StructField(translator_type,StringType,true), StructField(url,StringType,true), StructField(utc_offset,NullType,true), StructField(verified,BooleanType,true)),true)),true), StructField(quoted_status_id,LongType,true), StructField(quoted_status_id_str,StringType,true), StructField(quoted_status_permalink,StructType(StructField(url,StringType,true), StructField(expanded,StringType,true), StructField(display,StringType,true)),true), StructField(reply_count,IntegerType,true), StructField(retweet_count,IntegerType,true), StructField(retweeted,BooleanType,true), StructField(source,StringType,true), StructField(text,StringType,true), StructField(truncated,BooleanType,true), StructField(user,StructType(StructField(contributors_enabled,BooleanType,true), StructField(created_at,StringType,true), StructField(default_profile,BooleanType,true), StructField(default_profile_image,BooleanType,true), StructField(description,StringType,true), StructField(favourites_count,IntegerType,true), StructField(follow_request_sent,NullType,true), StructField(followers_count,IntegerType,true), StructField(following,NullType,true), StructField(friends_count,IntegerType,true), StructField(geo_enabled,BooleanType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(is_translator,BooleanType,true), StructField(lang,NullType,true), StructField(listed_count,IntegerType,true), StructField(location,StringType,true), StructField(name,StringType,true), StructField(notifications,NullType,true), StructField(profile_background_color,StringType,true), StructField(profile_background_image_url,StringType,true), StructField(profile_background_image_url_https,StringType,true), StructField(profile_background_tile,BooleanType,true), StructField(profile_banner_url,StringType,true), StructField(profile_image_url,StringType,true), StructField(profile_image_url_https,StringType,true), StructField(profile_link_color,StringType,true), StructField(profile_sidebar_border_color,StringType,true), StructField(profile_sidebar_fill_color,StringType,true), StructField(profile_text_color,StringType,true), StructField(profile_use_background_image,BooleanType,true), StructField(protected,BooleanType,true), StructField(screen_name,StringType,true), StructField(statuses_count,IntegerType,true), StructField(time_zone,NullType,true), StructField(translator_type,StringType,true), StructField(url,StringType,true), StructField(utc_offset,NullType,true), StructField(verified,BooleanType,true)),true), StructField(withheld_in_countries,ArrayType(StringType,true),true)),true), StructField(source,StringType,true), StructField(text,StringType,true), StructField(timestamp_ms,StringType,true), StructField(truncated,BooleanType,true), StructField(user,StructType(StructField(contributors_enabled,BooleanType,true), StructField(created_at,StringType,true), StructField(default_profile,BooleanType,true), StructField(default_profile_image,BooleanType,true), StructField(description,StringType,true), StructField(favourites_count,IntegerType,true), StructField(follow_request_sent,NullType,true), StructField(followers_count,IntegerType,true), StructField(following,NullType,true), StructField(friends_count,IntegerType,true), StructField(geo_enabled,BooleanType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(is_translator,BooleanType,true), StructField(lang,NullType,true), StructField(listed_count,IntegerType,true), StructField(location,StringType,true), StructField(name,StringType,true), StructField(notifications,NullType,true), StructField(profile_background_color,StringType,true), StructField(profile_background_image_url,StringType,true), StructField(profile_background_image_url_https,StringType,true), StructField(profile_background_tile,BooleanType,true), StructField(profile_banner_url,StringType,true), StructField(profile_image_url,StringType,true), StructField(profile_image_url_https,StringType,true), StructField(profile_link_color,StringType,true), StructField(profile_sidebar_border_color,StringType,true), StructField(profile_sidebar_fill_color,StringType,true), StructField(profile_text_color,StringType,true), StructField(profile_use_background_image,BooleanType,true), StructField(protected,BooleanType,true), StructField(screen_name,StringType,true), StructField(statuses_count,IntegerType,true), StructField(time_zone,NullType,true), StructField(translator_type,StringType,true), StructField(url,StringType,true), StructField(utc_offset,NullType,true), StructField(verified,BooleanType,true)),true), StructField(withheld_in_countries,ArrayType(StringType,true),true))))
            +- Project [text#6867, tokens#6819]
               +- Project [id#6841L, orignialText#6741, text#6867, UDF(text#6867) AS tokens#6819]
                  +- Project [id#6841L, orignialText#6741, text#6867]
                     +- SubqueryAlias text
                        +- View (`text`, [id#6841L,orignialText#6741,text#6867])
                           +- Project [id#6841L, orignialText#6741, text#6867]
                              +- SubqueryAlias text
                                 +- View (`text`, [id#6841L,orignialText#6741,text#6867])
                                    +- Project [id#6841L, orignialText#6741, lower(trim(text#6745, None)) AS text#6867]
                                       +- SubqueryAlias text
                                          +- View (`text`, [id#6841L,orignialText#6741,text#6745])
                                             +- Project [id#6841L, orignialText#6741, regexp_replace(text#6643, [^a-zA-Z0-9\s], , 1) AS text#6745]
                                                +- Project [id#6841L, text#6643 AS orignialText#6741, text#6643]
                                                   +- SubqueryAlias text
                                                      +- View (`text`, [id#6841L,text#6643])
                                                         +- Distinct
                                                            +- Project [id#6841L, text#6862 AS text#6643]
                                                               +- SubqueryAlias en_tweets
                                                                  +- View (`en_tweets`, [_id#6829,contributors#6830,coordinates#6831,created_at#6832,display_text_range#6833,entities#6834,extended_entities#6835,extended_tweet#6836,favorite_count#6837,favorited#6838,filter_level#6839,geo#6840,id#6841L,id_str#6842,in_reply_to_screen_name#6843,in_reply_to_status_id#6844L,in_reply_to_status_id_str#6845,in_reply_to_user_id#6846L,in_reply_to_user_id_str#6847,is_quote_status#6848,lang#6849,place#6850,possibly_sensitive#6851,quote_count#6852,quoted_status#6853,quoted_status_id#6854L,quoted_status_id_str#6855,quoted_status_permalink#6856,reply_count#6857,retweet_count#6858,retweeted#6859,retweeted_status#6860,source#6861,text#6862,timestamp_ms#6863,truncated#6864,user#6865,withheld_in_countries#6866])
                                                                     +- Project [_id#6829, contributors#6830, coordinates#6831, created_at#6832, display_text_range#6833, entities#6834, extended_entities#6835, extended_tweet#6836, favorite_count#6837, favorited#6838, filter_level#6839, geo#6840, id#6841L, id_str#6842, in_reply_to_screen_name#6843, in_reply_to_status_id#6844L, in_reply_to_status_id_str#6845, in_reply_to_user_id#6846L, in_reply_to_user_id_str#6847, is_quote_status#6848, lang#6849, place#6850, possibly_sensitive#6851, quote_count#6852, ... 14 more fields]
                                                                        +- Filter ((lang#6849 = en) AND text#6862 LIKE %euro%)
                                                                           +- SubqueryAlias tweets
                                                                              +- View (`tweets`, [_id#6829,contributors#6830,coordinates#6831,created_at#6832,display_text_range#6833,entities#6834,extended_entities#6835,extended_tweet#6836,favorite_count#6837,favorited#6838,filter_level#6839,geo#6840,id#6841L,id_str#6842,in_reply_to_screen_name#6843,in_reply_to_status_id#6844L,in_reply_to_status_id_str#6845,in_reply_to_user_id#6846L,in_reply_to_user_id_str#6847,is_quote_status#6848,lang#6849,place#6850,possibly_sensitive#6851,quote_count#6852,quoted_status#6853,quoted_status_id#6854L,quoted_status_id_str#6855,quoted_status_permalink#6856,reply_count#6857,retweet_count#6858,retweeted#6859,retweeted_status#6860,source#6861,text#6862,timestamp_ms#6863,truncated#6864,user#6865,withheld_in_countries#6866])
                                                                                 +- Relation [_id#6829,contributors#6830,coordinates#6831,created_at#6832,display_text_range#6833,entities#6834,extended_entities#6835,extended_tweet#6836,favorite_count#6837,favorited#6838,filter_level#6839,geo#6840,id#6841L,id_str#6842,in_reply_to_screen_name#6843,in_reply_to_status_id#6844L,in_reply_to_status_id_str#6845,in_reply_to_user_id#6846L,in_reply_to_user_id_str#6847,is_quote_status#6848,lang#6849,place#6850,possibly_sensitive#6851,quote_count#6852,... 14 more fields] MongoRelation(MongoRDD[1118] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(contributors,NullType,true), StructField(coordinates,NullType,true), StructField(created_at,StringType,true), StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(extended_tweet,StructType(StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(full_text,StringType,true)),true), StructField(favorite_count,IntegerType,true), StructField(favorited,BooleanType,true), StructField(filter_level,StringType,true), StructField(geo,NullType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(in_reply_to_screen_name,StringType,true), StructField(in_reply_to_status_id,LongType,true), StructField(in_reply_to_status_id_str,StringType,true), StructField(in_reply_to_user_id,LongType,true), StructField(in_reply_to_user_id_str,StringType,true), StructField(is_quote_status,BooleanType,true), StructField(lang,StringType,true), StructField(place,StructType(StructField(id,StringType,true), StructField(url,StringType,true), StructField(place_type,StringType,true), StructField(name,StringType,true), StructField(full_name,StringType,true), StructField(country_code,StringType,true), StructField(country,StringType,true), StructField(bounding_box,StructType(StructField(type,StringType,true), StructField(coordinates,ArrayType(ArrayType(ArrayType(DoubleType,true),true),true),true)),true)),true), StructField(possibly_sensitive,BooleanType,true), StructField(quote_count,IntegerType,true), StructField(quoted_status,StructType(StructField(contributors,NullType,true), StructField(coordinates,NullType,true), StructField(created_at,StringType,true), StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(extended_tweet,StructType(StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(full_text,StringType,true)),true), StructField(favorite_count,IntegerType,true), StructField(favorited,BooleanType,true), StructField(filter_level,StringType,true), StructField(geo,NullType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(in_reply_to_screen_name,StringType,true), StructField(in_reply_to_status_id,LongType,true), StructField(in_reply_to_status_id_str,StringType,true), StructField(in_reply_to_user_id,LongType,true), StructField(in_reply_to_user_id_str,StringType,true), StructField(is_quote_status,BooleanType,true), StructField(lang,StringType,true), StructField(place,StructType(StructField(id,StringType,true), StructField(url,StringType,true), StructField(place_type,StringType,true), StructField(name,StringType,true), StructField(full_name,StringType,true), StructField(country_code,StringType,true), StructField(country,StringType,true), StructField(bounding_box,StructType(StructField(type,StringType,true), StructField(coordinates,ArrayType(ArrayType(ArrayType(DoubleType,true),true),true),true)),true)),true), StructField(possibly_sensitive,BooleanType,true), StructField(quote_count,IntegerType,true), StructField(quoted_status_id,LongType,true), StructField(quoted_status_id_str,StringType,true), StructField(reply_count,IntegerType,true), StructField(retweet_count,IntegerType,true), StructField(retweeted,BooleanType,true), StructField(source,StringType,true), StructField(text,StringType,true), StructField(truncated,BooleanType,true), StructField(user,StructType(StructField(contributors_enabled,BooleanType,true), StructField(created_at,StringType,true), StructField(default_profile,BooleanType,true), StructField(default_profile_image,BooleanType,true), StructField(description,StringType,true), StructField(favourites_count,IntegerType,true), StructField(follow_request_sent,NullType,true), StructField(followers_count,IntegerType,true), StructField(following,NullType,true), StructField(friends_count,IntegerType,true), StructField(geo_enabled,BooleanType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(is_translator,BooleanType,true), StructField(lang,NullType,true), StructField(listed_count,IntegerType,true), StructField(location,StringType,true), StructField(name,StringType,true), StructField(notifications,NullType,true), StructField(profile_background_color,StringType,true), StructField(profile_background_image_url,StringType,true), StructField(profile_background_image_url_https,StringType,true), StructField(profile_background_tile,BooleanType,true), StructField(profile_banner_url,StringType,true), StructField(profile_image_url,StringType,true), StructField(profile_image_url_https,StringType,true), StructField(profile_link_color,StringType,true), StructField(profile_sidebar_border_color,StringType,true), StructField(profile_sidebar_fill_color,StringType,true), StructField(profile_text_color,StringType,true), StructField(profile_use_background_image,BooleanType,true), StructField(protected,BooleanType,true), StructField(screen_name,StringType,true), StructField(statuses_count,IntegerType,true), StructField(time_zone,NullType,true), StructField(translator_type,StringType,true), StructField(url,StringType,true), StructField(utc_offset,NullType,true), StructField(verified,BooleanType,true)),true)),true), StructField(quoted_status_id,LongType,true), StructField(quoted_status_id_str,StringType,true), StructField(quoted_status_permalink,StructType(StructField(url,StringType,true), StructField(expanded,StringType,true), StructField(display,StringType,true)),true), StructField(reply_count,IntegerType,true), StructField(retweet_count,IntegerType,true), StructField(retweeted,BooleanType,true), StructField(retweeted_status,StructType(StructField(contributors,NullType,true), StructField(coordinates,NullType,true), StructField(created_at,StringType,true), StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(extended_tweet,StructType(StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(description,StringType,true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(full_text,StringType,true)),true), StructField(favorite_count,IntegerType,true), StructField(favorited,BooleanType,true), StructField(filter_level,StringType,true), StructField(geo,NullType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(in_reply_to_screen_name,StringType,true), StructField(in_reply_to_status_id,LongType,true), StructField(in_reply_to_status_id_str,StringType,true), StructField(in_reply_to_user_id,LongType,true), StructField(in_reply_to_user_id_str,StringType,true), StructField(is_quote_status,BooleanType,true), StructField(lang,StringType,true), StructField(place,StructType(StructField(id,StringType,true), StructField(url,StringType,true), StructField(place_type,StringType,true), StructField(name,StringType,true), StructField(full_name,StringType,true), StructField(country_code,StringType,true), StructField(country,StringType,true), StructField(bounding_box,StructType(StructField(type,StringType,true), StructField(coordinates,ArrayType(ArrayType(ArrayType(DoubleType,true),true),true),true)),true)),true), StructField(possibly_sensitive,BooleanType,true), StructField(quote_count,IntegerType,true), StructField(quoted_status,StructType(StructField(contributors,NullType,true), StructField(coordinates,NullType,true), StructField(created_at,StringType,true), StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(extended_tweet,StructType(StructField(display_text_range,ArrayType(IntegerType,true),true), StructField(entities,StructType(StructField(hashtags,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true), StructField(symbols,ArrayType(StructType(StructField(text,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(urls,ArrayType(StructType(StructField(url,StringType,true), StructField(expanded_url,StringType,true), StructField(display_url,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true), StructField(user_mentions,ArrayType(StructType(StructField(screen_name,StringType,true), StructField(name,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true)),true),true)),true), StructField(extended_entities,StructType(StructField(media,ArrayType(StructType(StructField(additional_media_info,StructType(StructField(description,StringType,true), StructField(embeddable,BooleanType,true), StructField(monetizable,BooleanType,true), StructField(title,StringType,true)),true), StructField(display_url,StringType,true), StructField(expanded_url,StringType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(indices,ArrayType(IntegerType,true),true), StructField(media_url,StringType,true), StructField(media_url_https,StringType,true), StructField(sizes,StructType(StructField(large,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(medium,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(small,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true), StructField(thumb,StructType(StructField(w,IntegerType,true), StructField(h,IntegerType,true), StructField(resize,StringType,true)),true)),true), StructField(source_status_id,LongType,true), StructField(source_status_id_str,StringType,true), StructField(source_user_id,LongType,true), StructField(source_user_id_str,StringType,true), StructField(type,StringType,true), StructField(url,StringType,true), StructField(video_info,StructType(StructField(aspect_ratio,ArrayType(IntegerType,true),true), StructField(duration_millis,IntegerType,true), StructField(variants,ArrayType(StructType(StructField(bitrate,IntegerType,true), StructField(content_type,StringType,true), StructField(url,StringType,true)),true),true)),true)),true),true)),true), StructField(full_text,StringType,true)),true), StructField(favorite_count,IntegerType,true), StructField(favorited,BooleanType,true), StructField(filter_level,StringType,true), StructField(geo,NullType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(in_reply_to_screen_name,StringType,true), StructField(in_reply_to_status_id,LongType,true), StructField(in_reply_to_status_id_str,StringType,true), StructField(in_reply_to_user_id,LongType,true), StructField(in_reply_to_user_id_str,StringType,true), StructField(is_quote_status,BooleanType,true), StructField(lang,StringType,true), StructField(place,StructType(StructField(id,StringType,true), StructField(url,StringType,true), StructField(place_type,StringType,true), StructField(name,StringType,true), StructField(full_name,StringType,true), StructField(country_code,StringType,true), StructField(country,StringType,true), StructField(bounding_box,StructType(StructField(type,StringType,true), StructField(coordinates,ArrayType(ArrayType(ArrayType(DoubleType,true),true),true),true)),true)),true), StructField(possibly_sensitive,BooleanType,true), StructField(quote_count,IntegerType,true), StructField(quoted_status_id,LongType,true), StructField(quoted_status_id_str,StringType,true), StructField(reply_count,IntegerType,true), StructField(retweet_count,IntegerType,true), StructField(retweeted,BooleanType,true), StructField(source,StringType,true), StructField(text,StringType,true), StructField(truncated,BooleanType,true), StructField(user,StructType(StructField(contributors_enabled,BooleanType,true), StructField(created_at,StringType,true), StructField(default_profile,BooleanType,true), StructField(default_profile_image,BooleanType,true), StructField(description,StringType,true), StructField(favourites_count,IntegerType,true), StructField(follow_request_sent,NullType,true), StructField(followers_count,IntegerType,true), StructField(following,NullType,true), StructField(friends_count,IntegerType,true), StructField(geo_enabled,BooleanType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(is_translator,BooleanType,true), StructField(lang,NullType,true), StructField(listed_count,IntegerType,true), StructField(location,StringType,true), StructField(name,StringType,true), StructField(notifications,NullType,true), StructField(profile_background_color,StringType,true), StructField(profile_background_image_url,StringType,true), StructField(profile_background_image_url_https,StringType,true), StructField(profile_background_tile,BooleanType,true), StructField(profile_banner_url,StringType,true), StructField(profile_image_url,StringType,true), StructField(profile_image_url_https,StringType,true), StructField(profile_link_color,StringType,true), StructField(profile_sidebar_border_color,StringType,true), StructField(profile_sidebar_fill_color,StringType,true), StructField(profile_text_color,StringType,true), StructField(profile_use_background_image,BooleanType,true), StructField(protected,BooleanType,true), StructField(screen_name,StringType,true), StructField(statuses_count,IntegerType,true), StructField(time_zone,NullType,true), StructField(translator_type,StringType,true), StructField(url,StringType,true), StructField(utc_offset,NullType,true), StructField(verified,BooleanType,true)),true)),true), StructField(quoted_status_id,LongType,true), StructField(quoted_status_id_str,StringType,true), StructField(quoted_status_permalink,StructType(StructField(url,StringType,true), StructField(expanded,StringType,true), StructField(display,StringType,true)),true), StructField(reply_count,IntegerType,true), StructField(retweet_count,IntegerType,true), StructField(retweeted,BooleanType,true), StructField(source,StringType,true), StructField(text,StringType,true), StructField(truncated,BooleanType,true), StructField(user,StructType(StructField(contributors_enabled,BooleanType,true), StructField(created_at,StringType,true), StructField(default_profile,BooleanType,true), StructField(default_profile_image,BooleanType,true), StructField(description,StringType,true), StructField(favourites_count,IntegerType,true), StructField(follow_request_sent,NullType,true), StructField(followers_count,IntegerType,true), StructField(following,NullType,true), StructField(friends_count,IntegerType,true), StructField(geo_enabled,BooleanType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(is_translator,BooleanType,true), StructField(lang,NullType,true), StructField(listed_count,IntegerType,true), StructField(location,StringType,true), StructField(name,StringType,true), StructField(notifications,NullType,true), StructField(profile_background_color,StringType,true), StructField(profile_background_image_url,StringType,true), StructField(profile_background_image_url_https,StringType,true), StructField(profile_background_tile,BooleanType,true), StructField(profile_banner_url,StringType,true), StructField(profile_image_url,StringType,true), StructField(profile_image_url_https,StringType,true), StructField(profile_link_color,StringType,true), StructField(profile_sidebar_border_color,StringType,true), StructField(profile_sidebar_fill_color,StringType,true), StructField(profile_text_color,StringType,true), StructField(profile_use_background_image,BooleanType,true), StructField(protected,BooleanType,true), StructField(screen_name,StringType,true), StructField(statuses_count,IntegerType,true), StructField(time_zone,NullType,true), StructField(translator_type,StringType,true), StructField(url,StringType,true), StructField(utc_offset,NullType,true), StructField(verified,BooleanType,true)),true), StructField(withheld_in_countries,ArrayType(StringType,true),true)),true), StructField(source,StringType,true), StructField(text,StringType,true), StructField(timestamp_ms,StringType,true), StructField(truncated,BooleanType,true), StructField(user,StructType(StructField(contributors_enabled,BooleanType,true), StructField(created_at,StringType,true), StructField(default_profile,BooleanType,true), StructField(default_profile_image,BooleanType,true), StructField(description,StringType,true), StructField(favourites_count,IntegerType,true), StructField(follow_request_sent,NullType,true), StructField(followers_count,IntegerType,true), StructField(following,NullType,true), StructField(friends_count,IntegerType,true), StructField(geo_enabled,BooleanType,true), StructField(id,LongType,true), StructField(id_str,StringType,true), StructField(is_translator,BooleanType,true), StructField(lang,NullType,true), StructField(listed_count,IntegerType,true), StructField(location,StringType,true), StructField(name,StringType,true), StructField(notifications,NullType,true), StructField(profile_background_color,StringType,true), StructField(profile_background_image_url,StringType,true), StructField(profile_background_image_url_https,StringType,true), StructField(profile_background_tile,BooleanType,true), StructField(profile_banner_url,StringType,true), StructField(profile_image_url,StringType,true), StructField(profile_image_url_https,StringType,true), StructField(profile_link_color,StringType,true), StructField(profile_sidebar_border_color,StringType,true), StructField(profile_sidebar_fill_color,StringType,true), StructField(profile_text_color,StringType,true), StructField(profile_use_background_image,BooleanType,true), StructField(protected,BooleanType,true), StructField(screen_name,StringType,true), StructField(statuses_count,IntegerType,true), StructField(time_zone,NullType,true), StructField(translator_type,StringType,true), StructField(url,StringType,true), StructField(utc_offset,NullType,true), StructField(verified,BooleanType,true)),true), StructField(withheld_in_countries,ArrayType(StringType,true),true))))


Now look at rare words 

In [239]:
#dfWord = dfText.withColumn("Word", explode(split(col("text"), ' '))).groupBy("Word").count().orderBy(asc("count").limit(10))

#dfWord.show()

TypeError: 'Column' object is not callable

Decided to stem the words as per this page https://stackoverflow.com/questions/53579444

In [240]:
# stem the words
stemmer = SnowballStemmer(language='english')
stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
dfText = dfText.withColumn("filtered_stemmed", stemmer_udf("filtered"))
dfText.show(5)

                                                                                

+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|            filtered|    filtered_stemmed|
+--------------------+--------------------+--------------------+--------------------+
|it costs 0 rspeso...|[it, costs, 0, rs...|[costs, 0, rspeso...|[cost, 0, rspesoe...|
|rt theeconomist t...|[rt, theeconomist...|[rt, theeconomist...|[rt, theeconomist...|
|rt theeconomist t...|[rt, theeconomist...|[rt, theeconomist...|[rt, theeconomist...|
|rt laurahuhtasaar...|[rt, laurahuhtasa...|[rt, laurahuhtasa...|[rt, laurahuhtasa...|
|rt lettertojack p...|[rt, lettertojack...|[rt, lettertojack...|[rt, lettertojack...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [241]:
# Filter out short words
filterShortWords = udf(lambda row: [x for x in row if len(x) >= 4], ArrayType(StringType()))
dfText = dfText.withColumn("filtered_stemmed", filterShortWords("filtered_stemmed"))

dfText.show(5)


                                                                                

+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|            filtered|    filtered_stemmed|
+--------------------+--------------------+--------------------+--------------------+
|rt gemhostofficia...|[rt, gemhostoffic...|[rt, gemhostoffic...|[gemhostoffici, 3...|
|rt gemhostofficia...|[rt, gemhostoffic...|[rt, gemhostoffic...|[gemhostoffici, 3...|
|rt gemhostofficia...|[rt, gemhostoffic...|[rt, gemhostoffic...|[gemhostoffici, 3...|
|rt gemhostofficia...|[rt, gemhostoffic...|[rt, gemhostoffic...|[gemhostoffici, 3...|
|rt gemhostofficia...|[rt, gemhostoffic...|[rt, gemhostoffic...|[gemhostoffici, 3...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [242]:
dfText.count()


                                                                                

264598

### Advanced Text processing

N-grams

In [243]:
from pyspark.ml.feature import NGram, VectorAssembler

# Create 2-grams
ngram = NGram(n=2, inputCol="filtered_stemmed", outputCol="ngrams")
dfText2 = ngram.transform(dfText).select("text","filtered_stemmed","ngrams")
dfText2.show(5)
dfText4 = ngram.setParams(n=4).transform(dfText).head()



                                                                                

+--------------------+--------------------+--------------------+
|                text|    filtered_stemmed|              ngrams|
+--------------------+--------------------+--------------------+
|it costs 0 rspeso...|[cost, rspesoeuro...|[cost rspesoeurop...|
|rt theeconomist t...|[theeconomist, 75...|[theeconomist 750...|
|rt theeconomist t...|[theeconomist, 75...|[theeconomist 750...|
|rt laurahuhtasaar...|[laurahuhtasaari,...|[laurahuhtasaari ...|
|rt lettertojack p...|[lettertojack, po...|[lettertojack por...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

Term Frequency

In [245]:
from pyspark.ml.feature import HashingTF, IDF

hashtf = HashingTF(numFeatures=2**16, inputCol="text", outputCol='tf')
hashtf.setNumFeatures(2**16).transform(dfText4).head()

AttributeError: _jdf

Inverse Document Frequency

In [None]:
from pyspark.ml.linalg import DenseVector
idf = IDF(minDocFreq=3)
idf.setInputCol("tf")
idf.setOutputCol("idf")

model = idf.fit(dfText4)
model.setOutputCol("idf")

## Building the model 

In [None]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
(train_set, val_set, test_set) = dfText.randomSplit([0.98, 0.01, 0.01], seed = 22230)