In [12]:
import pandas as pd
import numpy as np
import nltk
import re

In [14]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/prakharjain/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/prakharjain/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/prakharjain/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/prakharjain/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/prakharjain/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downl

True

Now we will import our dataset of airline reviews which is taken from [here](https://www.kaggle.com/datasets/kanchana1990/singapore-airlines-reviews?resource=download)

In [16]:
df = pd.read_csv("singapore_airlines_reviews.csv" , sep=",",on_bad_lines='warn', header=0,usecols=["rating","text"])


Lets check if the data is loaded properly

In [17]:
df.head(5)

Unnamed: 0,rating,text
0,3,We used this airline to go from Singapore to L...
1,5,The service on Singapore Airlines Suites Class...
2,1,"Booked, paid and received email confirmation f..."
3,5,"Best airline in the world, seats, food, servic..."
4,2,Premium Economy Seating on Singapore Airlines ...


In [18]:
df = df.dropna()
df = df.reset_index(drop=True)
df

Unnamed: 0,rating,text
0,3,We used this airline to go from Singapore to L...
1,5,The service on Singapore Airlines Suites Class...
2,1,"Booked, paid and received email confirmation f..."
3,5,"Best airline in the world, seats, food, servic..."
4,2,Premium Economy Seating on Singapore Airlines ...
...,...,...
9995,5,First part done with Singapore Airlines - acce...
9996,5,And again a great Flight with Singapore Air. G...
9997,5,"We flew business class from Frankfurt, via Sin..."
9998,4,"As always, the A380 aircraft was spotlessly pr..."


In [19]:
df['rating'].value_counts()

5    5424
4    1967
1    1057
3    1009
2     543
Name: rating, dtype: int64

In [21]:
df['text'] = df['text'].str.lower()
df

Unnamed: 0,rating,text
0,3,we used this airline to go from singapore to l...
1,5,the service on singapore airlines suites class...
2,1,"booked, paid and received email confirmation f..."
3,5,"best airline in the world, seats, food, servic..."
4,2,premium economy seating on singapore airlines ...
...,...,...
9995,5,first part done with singapore airlines - acce...
9996,5,and again a great flight with singapore air. g...
9997,5,"we flew business class from frankfurt, via sin..."
9998,4,"as always, the a380 aircraft was spotlessly pr..."


We will use nltk library to tokenise the data and and regex library to remove all non alpha characters.

In [22]:
df['text'] = df['text'].apply(lambda x : " ".join( [re.sub('[^A-Za-z]+','',x) for x in nltk.word_tokenize(x)] ))
df

Unnamed: 0,rating,text
0,3,we used this airline to go from singapore to l...
1,5,the service on singapore airlines suites class...
2,1,booked paid and received email confirmation f...
3,5,best airline in the world seats food servic...
4,2,premium economy seating on singapore airlines ...
...,...,...
9995,5,first part done with singapore airlines accep...
9996,5,and again a great flight with singapore air g...
9997,5,we flew business class from frankfurt via sin...
9998,4,as always the a aircraft was spotlessly prese...


In [23]:
df['text'] = df['text'].apply(lambda x: re.sub(' +',' ',x))

Now we will perform lemmatization. What it does is it groups together similar types of words to a same word hence increasing our data quality.

In [24]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x : " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))
df

Unnamed: 0,rating,text
0,3,we used this airline to go from singapore to l...
1,5,the service on singapore airline suite class w...
2,1,booked paid and received email confirmation fo...
3,5,best airline in the world seat food service ar...
4,2,premium economy seating on singapore airline h...
...,...,...
9995,5,first part done with singapore airline accepta...
9996,5,and again a great flight with singapore air gr...
9997,5,we flew business class from frankfurt via sing...
9998,4,a always the a aircraft wa spotlessly presente...


In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train,X_test,Y_train,Y_test = train_test_split(df['text'] , df['rating'] , test_size=0.15,random_state=21)

In [27]:
print(X_train.shape , Y_train.shape)
print(X_test.shape , Y_test.shape)

(8500,) (8500,)
(1500,) (1500,)


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
vectorizer = TfidfVectorizer()
tf_X_train = vectorizer.fit_transform(X_train)
tf_X_test = vectorizer.transform(X_test)

In [33]:
print(tf_X_train)

  (0, 1064)	0.1355984005192861
  (0, 8475)	0.10089088496782413
  (0, 16132)	0.11258651947733131
  (0, 14901)	0.0585054268736031
  (0, 5074)	0.10802553924294064
  (0, 2029)	0.25540842684662307
  (0, 11116)	0.1887264503100878
  (0, 14719)	0.07600744198502185
  (0, 709)	0.22541365632678084
  (0, 11977)	0.21755924050159015
  (0, 870)	0.06047844510892339
  (0, 16681)	0.12622627957771826
  (0, 8931)	0.0752345915664352
  (0, 8399)	0.08378548736399491
  (0, 7161)	0.07666536338018506
  (0, 12923)	0.04465752720798861
  (0, 7452)	0.22541365632678084
  (0, 12212)	0.19254228910252713
  (0, 4106)	0.23648383367410655
  (0, 14732)	0.09020390572452132
  (0, 3630)	0.22120497022833774
  (0, 7617)	0.10652337732282983
  (0, 11412)	0.17501812682836382
  (0, 7269)	0.08406877375031657
  (0, 7566)	0.23039147544755997
  :	:
  (8499, 957)	0.1934506941204232
  (8499, 5080)	0.1963074631837091
  (8499, 5619)	0.1387728585640198
  (8499, 8729)	0.2368313573203767
  (8499, 2608)	0.12495033769314863
  (8499, 6414)	0.120

Some popular learning models for learning languages are Support vector machines and neural networks, lets try and implement 3 models - a logistic regression model, an SVM and a neural network.

In [35]:
from sklearn.svm import LinearSVC
svcModel = LinearSVC()

In [36]:
svcModel.fit(tf_X_train , Y_train)

In [39]:
Y_test_svm_predictions = svcModel.predict(tf_X_test)

In [43]:
from sklearn.metrics import classification_report
svm_report = classification_report(Y_test , Y_test_svm_predictions , output_dict=True)
svm_report

{'1': {'precision': 0.6813186813186813,
  'recall': 0.7848101265822784,
  'f1-score': 0.7294117647058823,
  'support': 158},
 '2': {'precision': 0.23529411764705882,
  'recall': 0.1,
  'f1-score': 0.14035087719298245,
  'support': 80},
 '3': {'precision': 0.5,
  'recall': 0.36875,
  'f1-score': 0.4244604316546763,
  'support': 160},
 '4': {'precision': 0.4232558139534884,
  'recall': 0.30847457627118646,
  'f1-score': 0.3568627450980392,
  'support': 295},
 '5': {'precision': 0.7581493165089379,
  'recall': 0.8934324659231723,
  'f1-score': 0.820250284414107,
  'support': 807},
 'accuracy': 0.6686666666666666,
 'macro avg': {'precision': 0.5196035858856333,
  'recall': 0.49109343375532744,
  'f1-score': 0.4942672206131375,
  'support': 1500},
 'weighted avg': {'precision': 0.6287725630660722,
  'recall': 0.6686666666666666,
  'f1-score': 0.6410701915932147,
  'support': 1500}}

Let's try logistic regression

In [41]:
from sklearn.linear_model import LogisticRegression
logisticModel = LogisticRegression(max_iter=2000)

In [42]:
logisticModel.fit(tf_X_train , Y_train)
Y_test_logistic_predictions = logisticModel.predict(tf_X_test)



In [58]:
Y_test_logistic_predictions

array([4, 4, 5, ..., 5, 1, 5])

In [44]:
logistic_report = classification_report(Y_test , Y_test_logistic_predictions , output_dict=True)
logistic_report

{'1': {'precision': 0.727810650887574,
  'recall': 0.7784810126582279,
  'f1-score': 0.7522935779816514,
  'support': 158},
 '2': {'precision': 0.2727272727272727,
  'recall': 0.0375,
  'f1-score': 0.06593406593406594,
  'support': 80},
 '3': {'precision': 0.4956521739130435,
  'recall': 0.35625,
  'f1-score': 0.41454545454545455,
  'support': 160},
 '4': {'precision': 0.43169398907103823,
  'recall': 0.2677966101694915,
  'f1-score': 0.3305439330543933,
  'support': 295},
 '5': {'precision': 0.726027397260274,
  'recall': 0.919454770755886,
  'f1-score': 0.811372334609076,
  'support': 807},
 'accuracy': 0.6693333333333333,
 'macro avg': {'precision': 0.5307822967718405,
  'recall': 0.47189647871672113,
  'f1-score': 0.47493787322492825,
  'support': 1500},
 'weighted avg': {'precision': 0.619580299233002,
  'recall': 0.6693333333333333,
  'f1-score': 0.6285015450691129,
  'support': 1500}}

In [69]:
from sklearn.neural_network import MLPClassifier

In [60]:
print(tf_X_train.shape)

(8500, 16770)


In [61]:
from sklearn.preprocessing import Normalizer
norm = Normalizer().fit(tf_X_train)
tf_X_train_norm = norm.transform(tf_X_train)
tf_X_test_norm = norm.transform(tf_X_test)

In [65]:
new_nn_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    max_iter=3000,
    random_state=21
)

In [66]:
new_nn_model.fit(tf_X_train , Y_train)
new_nn_predictions = new_nn_model.predict(tf_X_test)
new_nn_predictions

array([4, 3, 5, ..., 5, 2, 1])

In [68]:
new_nn_report = classification_report(Y_test , new_nn_predictions , output_dict=True)
new_nn_report

{'1': {'precision': 0.6851851851851852,
  'recall': 0.7025316455696202,
  'f1-score': 0.69375,
  'support': 158},
 '2': {'precision': 0.2,
  'recall': 0.125,
  'f1-score': 0.15384615384615385,
  'support': 80},
 '3': {'precision': 0.5,
  'recall': 0.4125,
  'f1-score': 0.4520547945205479,
  'support': 160},
 '4': {'precision': 0.37362637362637363,
  'recall': 0.34576271186440677,
  'f1-score': 0.35915492957746475,
  'support': 295},
 '5': {'precision': 0.7587768969422424,
  'recall': 0.8302354399008675,
  'f1-score': 0.7928994082840238,
  'support': 807},
 'accuracy': 0.6393333333333333,
 'macro avg': {'precision': 0.5035176911507603,
  'recall': 0.4832059594669788,
  'f1-score': 0.4903410572456381,
  'support': 1500},
 'weighted avg': {'precision': 0.6178746635409528,
  'recall': 0.6393333333333333,
  'f1-score': 0.6267129907610262,
  'support': 1500}}

The neural net fails to outperform the logisitc regression or the svm, which is most probably due to the large number of features of the input data. 

In [79]:
from sklearn.decomposition import TruncatedSVD
principal = TruncatedSVD(n_components=1000)
print(tf_X_train.shape)
principal.fit(tf_X_train)
pca_x_train = principal.transform(tf_X_train)
print(pca_x_train.shape)
principal.fit(tf_X_test)
pca_x_test = principal.transform(tf_X_test)

(8500, 16770)
(8500, 1000)


In [80]:
nn_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    max_iter=3000,
    random_state=21
)
nn_model.fit(pca_x_train , Y_train)
new2_nn_predictions = nn_model.predict(pca_x_test)
new2_nn_predictions

array([3, 4, 5, ..., 5, 1, 5])

In [81]:
new2_nn_report = classification_report(Y_test , new2_nn_predictions , output_dict=True)
new2_nn_report

{'1': {'precision': 0.2641509433962264,
  'recall': 0.26582278481012656,
  'f1-score': 0.2649842271293375,
  'support': 158},
 '2': {'precision': 0.07766990291262135,
  'recall': 0.1,
  'f1-score': 0.08743169398907104,
  'support': 80},
 '3': {'precision': 0.1656441717791411,
  'recall': 0.16875,
  'f1-score': 0.16718266253869968,
  'support': 160},
 '4': {'precision': 0.23,
  'recall': 0.23389830508474577,
  'f1-score': 0.2319327731092437,
  'support': 295},
 '5': {'precision': 0.6438709677419355,
  'recall': 0.6183395291201983,
  'f1-score': 0.6308470290771178,
  'support': 807},
 'accuracy': 0.43,
 'macro avg': {'precision': 0.27626719716598486,
  'recall': 0.27736212380301406,
  'f1-score': 0.2764756771686939,
  'support': 1500},
 'weighted avg': {'precision': 0.4412709198280121,
  'recall': 0.43,
  'f1-score': 0.43541665996280926,
  'support': 1500}}

Let's try word2vec, another method to encode our text into some numbers

In [85]:
from gensim.utils import simple_preprocess
w2v_X_train = X_train.copy()
w2v_X_test = X_test.copy()


In [86]:
w2v_X_train = [simple_preprocess(line , deacc=True) for line in w2v_X_train]
w2v_X_test = [simple_preprocess(line , deacc=True) for line in w2v_X_test]


In [89]:
from gensim.parsing.porter import PorterStemmer
porterStemmer = PorterStemmer()
w2v_X_test = [[porterStemmer.stem(word) for word in tokens] for tokens in w2v_X_test ]
w2v_X_train = [[porterStemmer.stem(word) for word in tokens] for tokens in w2v_X_train ]
w2v_X_train[:10]

[['we',
  'recent',
  'travel',
  'to',
  'oz',
  'on',
  'holidai',
  'and',
  'although',
  'it',
  'wa',
  'good',
  'comfort',
  'flight',
  'with',
  'decent',
  'food',
  'the',
  'cabin',
  'staff',
  'were',
  'veri',
  'thin',
  'on',
  'the',
  'ground',
  'and',
  'even',
  'although',
  'most',
  'of',
  'the',
  'flight',
  'wa',
  'made',
  'dure',
  'the',
  'dai',
  'thei',
  'seem',
  'intent',
  'in',
  'put',
  'the',
  'cabin',
  'into',
  'dark',
  'and',
  'then',
  'disappear',
  'result',
  'in',
  'infrequ',
  'servic',
  'if',
  'like',
  'me',
  'you',
  'ar',
  'reluct',
  'to',
  'annoi',
  'them',
  'and',
  'press',
  'the',
  'buzzer',
  'everi',
  'time',
  'you',
  'want',
  'littl',
  'attent'],
 ['singapor',
  'airlin',
  'ar',
  'consist',
  'good',
  'with',
  'their',
  'servic',
  'onboard',
  'entertain',
  'and',
  'food',
  'it',
  'wa',
  'veri',
  'long',
  'flight',
  'and',
  'thei',
  'did',
  'nt',
  'disappoint',
  'there',
  'were',
  

In [138]:
from gensim.models import Word2Vec
stemmed_tokens = pd.Series(w2v_X_train).values

size = 1000
min_count = 3
workers = 6
sg = 1
w2vModel = Word2Vec(stemmed_tokens , vector_size=size,min_count=min_count,sg=sg,workers=workers)

In [139]:
w2vModel.save('w2v_1000.model')

In [140]:
sg_w2v_model = Word2Vec.load('w2v_1000.model')
# print(len(sg_w2v_model.wv.vocab))
with open('airline_review_w2v.csv' , 'w+') as w2v_file:
    words = list(w for w in sg_w2v_model.wv.index_to_key)
    header = ",".join(str(ele) for ele in range(size))
    w2v_file.write(header)
    w2v_file.write("\n")
    for row in w2v_X_train:
        for token in row:
            if(token in words):
                model_vector = (np.mean([sg_w2v_model.wv[token]], axis=0)).tolist()
        line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        if(len(line1) == 0):
            line1 = ",".join([str(0) for i in range(size)])
        w2v_file.write(line1)
        w2v_file.write('\n')

    

In [141]:
print(len(w2v_X_train))

8500


In [142]:
from sklearn.ensemble import HistGradientBoostingClassifier
gradboost_classifier = HistGradientBoostingClassifier()
w2v_df = pd.read_csv('airline_review_w2v.csv')
gradboost_classifier.fit(w2v_df , Y_train)

In [143]:
words = list(w for w in sg_w2v_model.wv.index_to_key)
test_features_w2v = []
for row in w2v_X_test:
    for token in row:
        if(token in words):
            model_vector = (np.mean([sg_w2v_model.wv[token]], axis=0)).tolist()
    test_features_w2v.append(model_vector)
    if(type(model_vector) != list):
        test_features_w2v.append(np.array([0 for i in range(size)]))

In [144]:
print(len(test_features_w2v))

1500


In [145]:
test_predictions_w2v = gradboost_classifier.predict(test_features_w2v)



In [146]:
w2v_report = classification_report(Y_test , test_predictions_w2v , output_dict=True)
w2v_report

{'1': {'precision': 0.375,
  'recall': 0.13291139240506328,
  'f1-score': 0.19626168224299062,
  'support': 158},
 '2': {'precision': 0.07692307692307693,
  'recall': 0.0125,
  'f1-score': 0.021505376344086023,
  'support': 80},
 '3': {'precision': 0.2,
  'recall': 0.0375,
  'f1-score': 0.0631578947368421,
  'support': 160},
 '4': {'precision': 0.24271844660194175,
  'recall': 0.0847457627118644,
  'f1-score': 0.12562814070351758,
  'support': 295},
 '5': {'precision': 0.5647149460708782,
  'recall': 0.9083023543990086,
  'f1-score': 0.6964370546318289,
  'support': 807},
 'accuracy': 0.524,
 'macro avg': {'precision': 0.2918712939191794,
  'recall': 0.23519190190318726,
  'f1-score': 0.22059802973185305,
  'support': 1500},
 'weighted avg': {'precision': 0.4164871662537451,
  'recall': 0.524,
  'f1-score': 0.42794669577015854,
  'support': 1500}}