In [92]:
import pandas as pd
pd.options.display.max_rows = 150

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import string

In [93]:
# Load the pre-cleaned review table produced by an earlier step.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# 'cleanData.pkl' that comes from a trusted source.
data = pd.read_pickle('cleanData.pkl')
# Display the frame (row count shown is capped by pd.options.display.max_rows)
data

Unnamed: 0,Cabin Flown,Date Flown,Recommended,score,text,origin,destin,Country
0,Economy,October 2015,no,2,We flew with Germanwings (or tried to) on Sept...,Stuttgart,Amsterdam,Canada
1,Economy,October 2015,no,3,I am less than impressed with Germanwings serv...,Hamburg,London,United Kingdom
2,Economy,September 2015,no,3,Flew from Palma de Mallorca to Cologne with Ge...,PMI,CGN,Germany
3,,September 2015,yes,10,Good flight from Berlin-Tegel to London Heathr...,Berlin-Tegel,London,Germany
4,Economy,September 2015,no,4,I don't get why Germanwings is always late and...,CGN,LHR,Germany
5,Economy,September 2015,no,1,We were delayed 5 hours in Istanbul and receiv...,Istanbul,London,United Kingdom
6,Economy,September 2015,yes,7,Easy Check-In at Düsseldorf. The Germanwings f...,Düsseldorf,London,Germany
7,Economy,September 2015,no,2,Checked in at Dusseldorf for our Germanwings f...,Dusseldorf,Heathrow,United Kingdom
8,Economy,August 2015,no,2,This Germanwings flight should take off as sch...,Duesseldorf,Heathrow,Gibraltar
9,Economy,August 2015,no,5,Germanwings is a very mediocre airline. Compar...,Cologne,Palma,Australia


In [94]:
# One shared VADER analyzer instance, reused for every review below.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be downloaded first; confirm.
sentimentAnalyzer = SentimentIntensityAnalyzer()

In [95]:
# Run VADER once per review and park the raw result dicts in a scratch column.
# NOTE(review): this overwrites the `score` column that came with the loaded
# data; the scratch column is dropped again at the end of this cell.
data['score'] = [sentimentAnalyzer.polarity_scores(review) for review in data.text]

# Spread each field of the VADER result into its own column.
for column, field in (
    ('sentiment_com', 'compound'),
    ('sentiment_pos', 'pos'),
    ('sentiment_neg', 'neg'),
    ('sentiment_neu', 'neu'),
):
    data[column] = [result[field] for result in data.score]

data = data.drop(columns="score")

In [96]:
# Preview: the four sentiment_* columns are present and `score` is gone
data.head()

Unnamed: 0,Cabin Flown,Date Flown,Recommended,text,origin,destin,Country,sentiment_com,sentiment_pos,sentiment_neg,sentiment_neu
0,Economy,October 2015,no,We flew with Germanwings (or tried to) on Sept...,Stuttgart,Amsterdam,Canada,-0.6492,0.0,0.035,0.965
1,Economy,October 2015,no,I am less than impressed with Germanwings serv...,Hamburg,London,United Kingdom,-0.2173,0.074,0.106,0.82
2,Economy,September 2015,no,Flew from Palma de Mallorca to Cologne with Ge...,PMI,CGN,Germany,0.9037,0.146,0.052,0.802
3,,September 2015,yes,Good flight from Berlin-Tegel to London Heathr...,Berlin-Tegel,London,Germany,0.301,0.206,0.203,0.591
4,Economy,September 2015,no,I don't get why Germanwings is always late and...,CGN,LHR,Germany,-0.2975,0.048,0.07,0.882


In [97]:
# Lower-case tokens that cleanText filters out of the extracted nouns
# (compared after lower-casing and stripping punctuation).
industry_specific_stopwords = [
    "cologne",
    "gw",
    "lh",
    "flight",
    "germanwings",
    "from",
    "to",
    "airbus",
    "th",
    "st",
    "nd",
    "rd",
    "lufthansa",
]

def cleanText (text, origin, destin):
    """Extract the nouns of each sentence of a review.

    The review is split on "." (a plain string split, so abbreviations and
    decimals also start a new "sentence", and a trailing "." yields a final
    empty noun list). For every piece, digits and the literal `origin` /
    `destin` strings are removed, the remainder is tokenized and POS-tagged,
    and only tokens tagged 'NN' or 'NNP' are kept. A noun is dropped when its
    lower-cased, punctuation-stripped form is in `industry_specific_stopwords`.

    Args:
        text: Review text (str).
        origin: Place name to delete from the text. Non-string values
            (e.g. a missing value) are skipped instead of raising.
        destin: Place name to delete from the text; same handling as `origin`.

    Returns:
        A list with one list of noun tokens (original casing) per "." piece.
    """
    # Loop-invariant work, done once per call instead of once per word:
    # a table deleting all ASCII punctuation, and a set for O(1) stopword lookup.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stopword_set = set(industry_specific_stopwords)
    # str.replace needs str arguments, so only string place names are removed.
    place_names = [place for place in (origin, destin) if isinstance(place, str)]

    result = []
    for r in text.split("."):
        r = ''.join(ch for ch in r if not ch.isdigit())
        for place in place_names:
            r = r.replace(place, '')
        tags = pos_tag(word_tokenize(r))
        nouns = [word for word, pos in tags if pos in ('NN', 'NNP')]
        resultwords = [
            word for word in nouns
            if word.lower().translate(strip_punctuation) not in stopword_set
        ]
        result.append(resultwords)

    return result
    
    
# Per review: the nouns of every sentence, with that review's own
# origin/destination names removed.
data['cleanText'] = [
    cleanText(review, start, end)
    for review, start, end in zip(data.text, data.origin, data.destin)
]

In [98]:
# Preview: `cleanText` holds one list of nouns per sentence of each review
data.head()

Unnamed: 0,Cabin Flown,Date Flown,Recommended,text,origin,destin,Country,sentiment_com,sentiment_pos,sentiment_neg,sentiment_neu,cleanText
0,Economy,October 2015,no,We flew with Germanwings (or tried to) on Sept...,Stuttgart,Amsterdam,Canada,-0.6492,0.0,0.035,0.965,"[[September], [counter, information], [counter..."
1,Economy,October 2015,no,I am less than impressed with Germanwings serv...,Hamburg,London,United Kingdom,-0.2173,0.074,0.106,0.82,"[[service], [connection, stewardess, water, ev..."
2,Economy,September 2015,no,Flew from Palma de Mallorca to Cologne with Ge...,PMI,CGN,Germany,0.9037,0.146,0.052,0.802,"[[Flew, Palma, Mallorca], [Check-in, row], [pl..."
3,,September 2015,yes,Good flight from Berlin-Tegel to London Heathr...,Berlin-Tegel,London,Germany,0.301,0.206,0.203,0.591,"[[Heathrow, staff], [Delay, Berlin, behaf], [l..."
4,Economy,September 2015,no,I don't get why Germanwings is always late and...,CGN,LHR,Germany,-0.2975,0.048,0.07,0.882,"[[service], [ground, staff, departure, arrival..."


In [99]:
def concatLists(x):
    """Flatten one level of nesting: return the items of every sublist of `x` in order."""
    flat = []
    for sublist in x:
        flat.extend(sublist)
    return flat

# One document per sentence: every sentence's noun list, pooled across all reviews
sentence_documents = concatLists(data['cleanText'].tolist())
# One document per review: that review's sentences flattened into a single noun list
review_documents = list(map(concatLists, data['cleanText'].tolist()))

In [100]:
# Shared resources used by nlp_clean below
lemma = WordNetLemmatizer()              # WordNet lemmatizer instance
exclude = set(string.punctuation)        # single punctuation characters
stop = set(stopwords.words('english'))   # NLTK English stopword list

def nlp_clean(document):
    """Return the lemmatized tokens of `document` joined by single spaces.

    Tokens whose lower-cased form is in `stop`, or that are exactly one of the
    punctuation characters in `exclude`, are skipped. Each remaining token is
    passed through `lemma.lemmatize` as-is (casing is not changed).
    """
    kept = []
    for token in document:
        if token.lower() in stop:
            continue
        if token in exclude:
            continue
        kept.append(lemma.lemmatize(token))
    return " ".join(kept)

In [101]:
# Corpus fed to the topic model: one noun list per review.
# (sentence_documents is the per-sentence alternative built alongside it.)
DOCUMENTS = review_documents 

In [102]:
# Clean every document into one space-joined string ...
doc_c = list(map(nlp_clean, DOCUMENTS))
# ... then split it back into a token list per document
doc_clean = [cleaned.split() for cleaned in doc_c]

In [103]:
# Importing Gensim
import gensim
from gensim import corpora

# Build the term dictionary of the corpus: every unique term is assigned an integer id.
dictionary = corpora.Dictionary(doc_clean)

# Convert each document into its bag-of-words vector using the dictionary above,
# giving the document-term matrix as a list with one entry per document.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

In [104]:
# Short alias for gensim's LDA model class
Lda = gensim.models.ldamodel.LdaModel

# Training the LDA model on the document-term matrix. Left commented out:
# a previously saved model is loaded from disk further down instead.
# NOTE(review): no random_state is passed here, so re-running the training is
# not guaranteed to reproduce the saved model's topics.
# ldamodel = Lda(doc_term_matrix, num_topics=8, id2word = dictionary, passes=100)

In [16]:
# ldamodel.print_topics(num_topics=25, num_words=3)

[(0, '0.020*"price" + 0.013*"snack" + 0.013*"business"'),
 (1, '0.026*"time" + 0.025*"service" + 0.016*"crew"'),
 (2, '0.031*"plane" + 0.023*"luggage" + 0.023*"time"'),
 (3, '0.026*"service" + 0.023*"food" + 0.017*"airport"'),
 (4, '0.028*"time" + 0.016*"return" + 0.015*"seat"'),
 (5, '0.024*"time" + 0.019*"staff" + 0.014*"return"'),
 (6, '0.030*"luggage" + 0.017*"ticket" + 0.017*"time"'),
 (7, '0.027*"time" + 0.022*"food" + 0.022*"airline"')]

In [20]:
# ldamodel.save("mymodel")

In [105]:
# Re-bind the class alias so this cell works even if the training cell was skipped
Lda = gensim.models.ldamodel.LdaModel

# Load the previously trained model from disk.
# NOTE(review): the commented-out save call above writes to "mymodel", while this
# reads "ldamodels/mymodel"; confirm the file was moved there.
# NOTE(review): gensim's load presumably unpickles the file, so only load trusted model files.
x = Lda.load("ldamodels/mymodel")

In [106]:
# Show the three highest-weighted words of each topic (up to 25 topics are
# requested; the output below lists 8)
x.print_topics(num_topics=25, num_words=3)

[(0, '0.020*"price" + 0.013*"snack" + 0.013*"business"'),
 (1, '0.026*"time" + 0.025*"service" + 0.016*"crew"'),
 (2, '0.031*"plane" + 0.023*"luggage" + 0.023*"time"'),
 (3, '0.026*"service" + 0.023*"food" + 0.017*"airport"'),
 (4, '0.028*"time" + 0.016*"return" + 0.015*"seat"'),
 (5, '0.024*"time" + 0.019*"staff" + 0.014*"return"'),
 (6, '0.030*"luggage" + 0.017*"ticket" + 0.017*"time"'),
 (7, '0.027*"time" + 0.022*"food" + 0.022*"airline"')]

In [107]:
# Give every topic a unique one-word title.
#
# Greedy scheme: topics are ranked by the weight of their current top word and
# each claims that word in rank order. When a topic's top word is already taken
# by a higher-ranked topic, that word is removed from the topic's candidate
# list, all assignments are discarded, the topics are re-ranked and the pass
# starts over. The loop ends when one full pass assigns a title to every topic.
topicTitles = []   # result: list of (topic_id, title_word)
placeholder = []   # per topic: (topic_id, [(weight_str, word), ...]), best word first
assigned = []      # words already claimed during the current pass

def getKeyScore(i):
    """Return the weight (a string, as printed by gensim) of topic entry `i`'s current top word."""
    return i[1][0][0]

def _numericKeyScore(entry):
    """Sort key: the current top-word weight as a float, so ordering is numeric rather than lexicographic."""
    return float(getKeyScore(entry))

def _parseTopicTerms(formatted):
    """Split a topic string like '0.020*"price" + 0.013*"snack"' into (weight_str, word) pairs."""
    terms = []
    for term in formatted.split("+"):
        parts = term.split('*')
        terms.append((parts[0].strip(), parts[1].strip().replace('"','')))
    return terms

# Five candidate words per topic leave room to fall back when the best ones collide.
for topic_id, formatted in x.print_topics(num_topics=25, num_words=5):
    placeholder.append((topic_id, _parseTopicTerms(formatted)))


s_placeholder = sorted(placeholder, key = _numericKeyScore, reverse = True)

# Compare against the actual number of topics instead of a hard-coded 8, so a
# model with fewer (or zero) topics terminates.
while len(topicTitles) < len(s_placeholder):
    for topic_id, terms in s_placeholder:
        word = terms[0][1]
        if word not in assigned:
            topicTitles.append((topic_id, word))
            assigned.append(word)
        else:
            # Word already taken: discard it for this topic and restart the pass.
            # `terms` is the same list object stored in `placeholder`, so the
            # candidate is removed there as well.
            terms.pop(0)
            if not terms:
                raise ValueError(
                    "topic %s ran out of candidate words while resolving title "
                    "conflicts; request more words per topic" % (topic_id,)
                )
            topicTitles = []
            assigned = []
            break

    # Re-rank: the topic that lost its word may now have a lower top weight.
    s_placeholder = sorted(s_placeholder, key = _numericKeyScore, reverse = True)


In [108]:
# (topic_id, title) pairs, in the order the titles were assigned
topicTitles

[(2, 'plane'),
 (6, 'luggage'),
 (4, 'time'),
 (3, 'service'),
 (7, 'food'),
 (0, 'price'),
 (5, 'staff'),
 (1, 'crew')]

In [109]:
# One empty flag list per topic id, keyed in the same order as topicTitles
assigmentLists = {topic[0]: [] for topic in topicTitles}

assigmentLists

{2: [], 6: [], 4: [], 3: [], 7: [], 0: [], 5: [], 1: []}

In [110]:
# Build, for every topic, a 0/1 flag per document: 1 when the model gives the
# document a probability above 0.2 for that topic, 0 otherwise.

# Start from empty lists so re-running this cell does not append a second set
# of flags (which would no longer match the number of rows in `data`).
for flags in assigmentLists.values():
    flags.clear()

for doc_i in doc_clean:
    score = x.get_document_topics(dictionary.doc2bow(doc_i))
    # Topic ids whose probability for this document clears the threshold
    hits = {topic_id for topic_id, probability in score if probability > 0.2}
    # Iterate the known topic ids rather than a hard-coded range(8), so every
    # list gets exactly one flag per document whatever the topic count is.
    for topic_id, flags in assigmentLists.items():
        flags.append(1 if topic_id in hits else 0)

In [111]:
# Attach each topic's 0/1 flag list as a column named after the topic's title
for topic_id, title in topicTitles:
    data["topic_" + title] = assigmentLists[topic_id]

In [112]:
# Preview: each topic_* column is a 0/1 flag for that topic per review
data.head()

Unnamed: 0,Cabin Flown,Date Flown,Recommended,text,origin,destin,Country,sentiment_com,sentiment_pos,sentiment_neg,sentiment_neu,cleanText,topic_plane,topic_luggage,topic_time,topic_service,topic_food,topic_price,topic_staff,topic_crew
0,Economy,October 2015,no,We flew with Germanwings (or tried to) on Sept...,Stuttgart,Amsterdam,Canada,-0.6492,0.0,0.035,0.965,"[[September], [counter, information], [counter...",0,0,1,0,0,0,0,0
1,Economy,October 2015,no,I am less than impressed with Germanwings serv...,Hamburg,London,United Kingdom,-0.2173,0.074,0.106,0.82,"[[service], [connection, stewardess, water, ev...",1,0,0,0,0,0,0,0
2,Economy,September 2015,no,Flew from Palma de Mallorca to Cologne with Ge...,PMI,CGN,Germany,0.9037,0.146,0.052,0.802,"[[Flew, Palma, Mallorca], [Check-in, row], [pl...",0,0,0,0,0,0,0,1
3,,September 2015,yes,Good flight from Berlin-Tegel to London Heathr...,Berlin-Tegel,London,Germany,0.301,0.206,0.203,0.591,"[[Heathrow, staff], [Delay, Berlin, behaf], [l...",0,0,0,0,1,0,0,0
4,Economy,September 2015,no,I don't get why Germanwings is always late and...,CGN,LHR,Germany,-0.2975,0.048,0.07,0.882,"[[service], [ground, staff, departure, arrival...",0,0,0,0,0,0,0,1
