In [10]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Normalizer, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, Word2Vec
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, NaiveBayes, RandomForestClassifier
from pyspark.ml.clustering import LDA
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner, NGramGenerator, PerceptronModel
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = spark.read.csv("gs://msca-bdp-student-gcs/Group7_Final_Project/airline_reviews/airline_reviews_preprocessed.csv", 
                                     header = True, inferSchema = True)

                                                                                

In [3]:
df.count()

121296

In [4]:
df2 = df.where('overall_score is not null')
df3 = df2.where('review is not null')

In [5]:
df3.count()

                                                                                

116145

In [6]:
df3 = df3.withColumn('sentiment', lit(None))
df3 = df3.withColumn('sentiment', when((col('overall_score') >= 1) & (col('overall_score') <= 5), "Negative").otherwise(col('sentiment')))
df3 = df3.withColumn('sentiment', when((col('overall_score') >= 6) & (col('overall_score') <= 10), "Positive").otherwise(col('sentiment')))
sentiment_dist = df3.groupBy('sentiment').count()
sentiment_dist = sentiment_dist.withColumn('pct', col('count')/116145)
sentiment_dist.toPandas()

                                                                                

Unnamed: 0,sentiment,count,pct
0,Positive,48942,0.421387
1,Negative,67203,0.578613


In [7]:
df3[['review', 'sentiment']].toPandas()

                                                                                

Unnamed: 0,review,sentiment
0,The customer service rep I spoke to was incred...,Negative
1,Last year my family and I took a trip to NY on...,Positive
2,JFK-RDU: First time flying this carrier. They ...,Positive
3,All round good airline and nice flight. The se...,Positive
4,JFK-FLL roundtrip. FLL bound on E190. Spacious...,Positive
...,...,...
116140,Treviso to Lviv. Seemed like a new plane. Very...,Positive
116141,Rome to Prague. Was very happy with the flight...,Positive
116142,We often fly with Wizzair to/from Charleroi/Bu...,Positive
116143,PRG-LTN and LTN-PRG were rather good flights. ...,Negative


In [8]:
add_stopwords = [
"0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz",]


In [11]:
regexTokenizer = RegexTokenizer(inputCol = "review", outputCol = "words", pattern = "\\W")
stopwordsRemover = StopWordsRemover(inputCol = "words", outputCol = "filtered").setStopWords(add_stopwords)

pipeline1 = Pipeline(stages = [regexTokenizer, stopwordsRemover])

pipelineFit1 = pipeline1.fit(df3)
dataset1 = pipelineFit1.transform(df3)

countVectors = CountVectorizer(inputCol = "filtered", outputCol = "features", vocabSize = 30000, minDF = 5)
count_vector_model = countVectors.fit(dataset1)
dataset1_2 = count_vector_model.transform(dataset1)

                                                                                

In [12]:
lda = LDA(k = 10, seed = 123, optimizer = "em", featuresCol = "features")
ldamodel = lda.fit(dataset1_2)

                                                                                

In [14]:
vocab = count_vector_model.vocabulary

def get_words(token_list):
     return [vocab[token_id] for token_id in token_list]
       
udf_to_words = udf(get_words, ArrayType(StringType()))

num_top_words = 20

topics = ldamodel.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(col('termIndices')))

topics.select('topic', 'topicWords').show(truncate = 150)



+-----+---------------------------------------------------------------------------------------------------------------------------------------------+
|topic|                                                                                                                                   topicWords|
+-----+---------------------------------------------------------------------------------------------------------------------------------------------+
|    0| [flight, service, time, airline, good, staff, food, seats, flights, seat, hours, check, airport, plane, crew, airlines, fly, 2, cabin, told]|
|    1| [flight, service, time, airline, good, food, staff, seats, seat, flights, hours, plane, airport, crew, check, airlines, fly, cabin, 2, told]|
|    2|[flight, service, time, airline, good, food, seats, seat, staff, crew, flights, check, plane, hours, airport, airlines, cabin, class, fly, 2]|
|    3| [flight, service, time, airline, good, food, staff, seat, seats, flights, hours, crew, plane

                                                                                

In [15]:
documentAssembler = DocumentAssembler() \
     .setInputCol('review') \
     .setOutputCol('document')

tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('tokenized')

normalizer = Normalizer() \
     .setInputCols(['tokenized']) \
     .setOutputCol('normalized') \
     .setLowercase(True)

lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemmatized')

stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemmatized']) \
     .setOutputCol('unigrams') \
     .setStopWords(add_stopwords)

ngrammer = NGramGenerator() \
    .setInputCols(['lemmatized']) \
    .setOutputCol('ngrams') \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

pos_tagger = PerceptronModel.pretrained('pos_anc') \
    .setInputCols(['document', 'lemmatized']) \
    .setOutputCol('pos')

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[ / ]

                                                                                

[OK!]
pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[ | ]pos_anc download started this may take some time.
Approximate size to download 3.9 MB
Download done! Loading the resource.
[ / ]

                                                                                

[OK!]


In [16]:
finisher = Finisher() \
     .setInputCols(['unigrams', 'ngrams', 'pos'])

In [17]:
pipeline = Pipeline() \
     .setStages([documentAssembler,                  
                 tokenizer,
                 normalizer,                  
                 lemmatizer,                  
                 stopwords_cleaner, 
                 pos_tagger,
                 ngrammer,  
                 finisher])

In [18]:
df4 = pipeline.fit(df3).transform(df3)

In [19]:
df4.printSchema()

root
 |-- aircraft: string (nullable = true)
 |-- airline_name: string (nullable = true)
 |-- cabin_type: string (nullable = true)
 |-- date_flown: string (nullable = true)
 |-- date_pub: string (nullable = true)
 |-- entertainment_rating: integer (nullable = true)
 |-- food_rating: integer (nullable = true)
 |-- ground_service_rating: integer (nullable = true)
 |-- origin_country: string (nullable = true)
 |-- overall_score: integer (nullable = true)
 |-- recommended: string (nullable = true)
 |-- review: string (nullable = true)
 |-- route: string (nullable = true)
 |-- seat_comfort_rating: integer (nullable = true)
 |-- service_rating: integer (nullable = true)
 |-- slug: string (nullable = true)
 |-- title: string (nullable = true)
 |-- travel_type: string (nullable = true)
 |-- trip_verified: string (nullable = true)
 |-- value_rating: integer (nullable = true)
 |-- wifi_rating: integer (nullable = true)
 |-- unique_id: string (nullable = true)
 |-- day_pub: string (nullable = tru

In [20]:
df4[['review', 'finished_unigrams', 'finished_ngrams', 'finished_pos']].show()

[Stage 833:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+
|              review|   finished_unigrams|     finished_ngrams|        finished_pos|
+--------------------+--------------------+--------------------+--------------------+
|The customer serv...|[customer, servic...|[the, customer, s...|[DT, NN, NN, NN, ...|
|Last year my fami...|[year, family, tr...|[last, year, i, f...|[JJ, NN, NNP, NN,...|
|JFK-RDU: First ti...|[jfkrdu, time, fl...|[jfkrdu, first, t...|[NN, JJ, NN, NN, ...|
|All round good ai...|[round, good, air...|[all, round, good...|[DT, NN, JJ, NN, ...|
|JFK-FLL roundtrip...|[jfkfll, roundtri...|[jfkfll, roundtri...|[NN, NN, NN, NN, ...|
|ORD-BOS-SJU retur...|[ordbossju, retur...|[ordbossju, retur...|[NN, NN, NN, NN, ...|
|BWI-BOS-BWI on E1...|[bwibosbwi, servi...|[bwibosbwi, on, e...|[NN, IN, SYM, CC,...|
|Harassed by rude ...|[harass, rude, fl...|[harass, by, rude...|[NN, IN, NN, NN, ...|
|On November 24th ...|[november, arrive...|[on, novemb

                                                                                

In [21]:
processed_review = df4

In [22]:
from pyspark.sql import types as T

udf_join_arr = udf(lambda x: ' '.join(x), T.StringType())
processed_review  = processed_review.withColumn('finished_pos', udf_join_arr(col('finished_pos')))

In [23]:
pos_documentAssembler = DocumentAssembler() \
     .setInputCol('finished_pos') \
     .setOutputCol('pos_document')

In [24]:
pos_tokenizer = Tokenizer() \
     .setInputCols(['pos_document']) \
     .setOutputCol('pos')
     

In [25]:
pos_ngrammer = NGramGenerator() \
    .setInputCols(['pos']) \
    .setOutputCol('pos_ngrams') \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

In [27]:
pos_finisher = Finisher() \
     .setInputCols(['pos', 'pos_ngrams'])

In [28]:
pos_pipeline = Pipeline() \
     .setStages([pos_documentAssembler,                  
                 pos_tokenizer,
                 pos_ngrammer,  
                 pos_finisher])

In [29]:
processed_review = pos_pipeline.fit(processed_review).transform(processed_review)

In [31]:
processed_review.printSchema()

root
 |-- aircraft: string (nullable = true)
 |-- airline_name: string (nullable = true)
 |-- cabin_type: string (nullable = true)
 |-- date_flown: string (nullable = true)
 |-- date_pub: string (nullable = true)
 |-- entertainment_rating: integer (nullable = true)
 |-- food_rating: integer (nullable = true)
 |-- ground_service_rating: integer (nullable = true)
 |-- origin_country: string (nullable = true)
 |-- overall_score: integer (nullable = true)
 |-- recommended: string (nullable = true)
 |-- review: string (nullable = true)
 |-- route: string (nullable = true)
 |-- seat_comfort_rating: integer (nullable = true)
 |-- service_rating: integer (nullable = true)
 |-- slug: string (nullable = true)
 |-- title: string (nullable = true)
 |-- travel_type: string (nullable = true)
 |-- trip_verified: string (nullable = true)
 |-- value_rating: integer (nullable = true)
 |-- wifi_rating: integer (nullable = true)
 |-- unique_id: string (nullable = true)
 |-- day_pub: string (nullable = tru

In [32]:
processed_review.select('finished_ngrams', 'finished_pos_ngrams').limit(5).show()



+--------------------+--------------------+
|     finished_ngrams| finished_pos_ngrams|
+--------------------+--------------------+
|[the, customer, s...|[DT, NN, NN, NN, ...|
|[last, year, i, f...|[JJ, NN, NNP, NN,...|
|[jfkrdu, first, t...|[NN, JJ, NN, NN, ...|
|[all, round, good...|[DT, NN, JJ, NN, ...|
|[jfkfll, roundtri...|[NN, NN, NN, NN, ...|
+--------------------+--------------------+



                                                                                

In [34]:
def filter_pos(words, pos_tags):
    return [word for word, pos in zip(words, pos_tags) 
            if pos in ['JJ', 'NN', 'NNS', 'VB', 'VBP']]

udf_filter_pos = udf(filter_pos, T.ArrayType(T.StringType()))

In [35]:
processed_review = processed_review.withColumn('filtered_unigrams',
                                               udf_filter_pos(col('finished_unigrams'), 
                                                              col('finished_pos')))

In [36]:
processed_review.select('filtered_unigrams').limit(5).show(truncate = 90)



+------------------------------------------------------------------------------------------+
|                                                                         filtered_unigrams|
+------------------------------------------------------------------------------------------+
|[service, rep, speak, rude, employee, instantly, cancellation, website, middle, speak, ...|
|[year, family, jet, impressed, experience, entertainment, happy, busy, extremely, frien...|
|[jfkrdu, time, fly, carrier, terminal, security, time, min, friendly, situation, fine, ...|
|[good, airline, nice, service, great, colombia, orlando, orlando, lovely, great, happy,...|
|[jfkfll, roundtrip, fll, bind, shoulder, room, channel, friendly, helpful, return, flig...|
+------------------------------------------------------------------------------------------+



                                                                                

In [37]:
def filter_pos_combs(words, pos_tags):
    return [word for word, pos in zip(words, pos_tags) 
            if (len(pos.split('_')) == 2 and \
                pos.split('_')[0] in ['JJ', 'NN', 'NNS', 'VB', 'VBP'] and \
                 pos.split('_')[1] in ['JJ', 'NN', 'NNS']) \
            or (len(pos.split('_')) == 3 and \
                pos.split('_')[0] in ['JJ', 'NN', 'NNS', 'VB', 'VBP'] and \
                 pos.split('_')[1] in ['JJ', 'NN', 'NNS', 'VB', 'VBP'] and \
                  pos.split('_')[2] in ['NN', 'NNS'])]
    
udf_filter_pos_combs = udf(filter_pos_combs, T.ArrayType(T.StringType()))

In [38]:
processed_review = processed_review.withColumn('filtered_ngrams',
                                               udf_filter_pos_combs(col('finished_ngrams'),
                                                                    col('finished_pos_ngrams')))

In [39]:
processed_review.select('filtered_ngrams').limit(5).show(truncate = 90)



+------------------------------------------------------------------------------------------+
|                                                                           filtered_ngrams|
+------------------------------------------------------------------------------------------+
|[customer_service, service_rep, be_terminate, didnt_understand, delta_website, be_utter...|
|[last_year, jet_blue, first_time, onboard_entertainment, child_happy, flight_crew, be_s...|
|[jfkrdu_first, first_time, time_fly, nice_terminal, security_line, line_take, long_time...|
|[round_good, good_airline, nice_flight, be_great, bogota_colombia, orlando_crew, great_...|
|[jfkfll_roundtrip, roundtrip_fll, fll_bind, spacious_leg, shoulder_room, room_direct, d...|
+------------------------------------------------------------------------------------------+



                                                                                

In [40]:
from pyspark.sql.functions import concat

processed_review = processed_review.withColumn('final', 
                                               concat(col('filtered_unigrams'), 
                                                      col('filtered_ngrams')))

In [42]:
tfizer = CountVectorizer(inputCol = 'final', outputCol = 'tf_features')
tf_model = tfizer.fit(processed_review)
tf_result = tf_model.transform(processed_review)

                                                                                

In [43]:
idfizer = IDF(inputCol = 'tf_features', outputCol = 'tf_idf_features')
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

23/11/24 05:04:18 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.3 MiB
                                                                                

In [48]:
num_topics = 5
max_iter = 25

lda = LDA(k = num_topics, maxIter = max_iter, featuresCol = 'tf_idf_features')
lda_model = lda.fit(tfidf_result)

23/11/24 06:18:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.3 MiB
23/11/24 06:25:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.3 MiB
23/11/24 06:25:45 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.4 MiB
23/11/24 06:25:46 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 28.4 MiB
23/11/24 06:25:49 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.4 MiB
23/11/24 06:25:50 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 28.4 MiB
23/11/24 06:25:52 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.4 MiB
23/11/24 06:25:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 28.4 MiB
23/11/24 06:25:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binar

In [49]:
vocab = tf_model.vocabulary

def get_words(token_list):
     return [vocab[token_id] for token_id in token_list]
       
udf_to_words = udf(get_words, ArrayType(StringType()))

num_top_words = 20

topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(col('termIndices')))

topics.select('topic', 'topicWords').show(truncate = 150)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic|                                                                                                                                            topicWords|
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------+
|    0|[hour, delay, flight, airline, customer_service, airport, time, bag, customer, day, check, fly, wait, plane, be_delay, pay, board, cancel, gate, book]|
|    1|[seat, business_class, class, good, food, economy, service, business, flight, cabin, fly, crew, time, staff, entertainment, airline, meal, be_good,...|
|    2|[flight, airline, customer, customer_service, air, book, service, refund, time, ticket, fly, hour, pay, day, seat, travel, cancel, bad, check, airp...|
|    3|[business_class, seat, business, class,