In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer #menghitung kata sentiment
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline


In [8]:
df = pd.read_csv('british.csv', delimiter = ';')

In [9]:
df.head(5)

Unnamed: 0,airlines,date_post,text,sentiment
0,british-airways,2018-09-12,| London Heathrow to Dubai. This was the fir...,POSITIVE
1,british-airways,2018-09-11,| Heraklion to Gatwick. Left my luggage behi...,NEGATIVE
2,british-airways,2018-09-10,| Flew Madrid to London Heathrow. The 767 is ...,POSITIVE
3,british-airways,2018-09-10,| London Heathrow to Brindisi. We weren’t ex...,NEGATIVE
4,british-airways,2018-09-08,| I have BA silver membership. This promises ...,NEGATIVE


In [11]:
df.shape


(2610, 4)

In [12]:
df['sentiment'].value_counts()

NEGATIVE    1439
POSITIVE    1171
Name: sentiment, dtype: int64

In [15]:
df['text'] = df['text'].map(lambda x: x.lstrip(' | '))
#lstrip = left strip

In [17]:
df.head(10)

Unnamed: 0,airlines,date_post,text,sentiment
0,british-airways,2018-09-12,London Heathrow to Dubai. This was the first t...,POSITIVE
1,british-airways,2018-09-11,Heraklion to Gatwick. Left my luggage behind o...,NEGATIVE
2,british-airways,2018-09-10,Flew Madrid to London Heathrow. The 767 is anc...,POSITIVE
3,british-airways,2018-09-10,London Heathrow to Brindisi. We weren’t expect...,NEGATIVE
4,british-airways,2018-09-08,I have BA silver membership. This promises fas...,NEGATIVE
5,british-airways,2018-09-08,Hamburg to London. Initial flight from London ...,NEGATIVE
6,british-airways,2018-09-07,Glasgow to Miami via London. Glasgow to Heathr...,POSITIVE
7,british-airways,2018-09-06,London Heathrow to Budapest. After so much dis...,POSITIVE
8,british-airways,2018-09-05,Budapest to London Heathrow. The flight depart...,POSITIVE
9,british-airways,2018-09-03,London to Toronto. Group of 10 people age from...,POSITIVE


In [18]:
df['sentiment'] = df['sentiment'].replace({'POSITIVE': 1, 'NEGATIVE': 0})

In [19]:
df.head(10)

Unnamed: 0,airlines,date_post,text,sentiment
0,british-airways,2018-09-12,London Heathrow to Dubai. This was the first t...,1
1,british-airways,2018-09-11,Heraklion to Gatwick. Left my luggage behind o...,0
2,british-airways,2018-09-10,Flew Madrid to London Heathrow. The 767 is anc...,1
3,british-airways,2018-09-10,London Heathrow to Brindisi. We weren’t expect...,0
4,british-airways,2018-09-08,I have BA silver membership. This promises fas...,0
5,british-airways,2018-09-08,Hamburg to London. Initial flight from London ...,0
6,british-airways,2018-09-07,Glasgow to Miami via London. Glasgow to Heathr...,1
7,british-airways,2018-09-06,London Heathrow to Budapest. After so much dis...,1
8,british-airways,2018-09-05,Budapest to London Heathrow. The flight depart...,1
9,british-airways,2018-09-03,London to Toronto. Group of 10 people age from...,1


In [21]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(df['text'],
                                                    df['sentiment'],
                                                    random_state=0)

In [22]:
X_test.index

Int64Index([2060, 2419,  192, 2531,  913, 1101, 2228, 2365, 1543, 2266,
            ...
            1591,  963, 2514,  305, 2401,  442, 1604,  157, 1831, 2349],
           dtype='int64', length=653)

In [23]:
print(len(y_train), len(y_test))

1957 653


In [24]:
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 London to Bangkok. I was a bit apprehensive about flying BA Club World due to the bad reviews the airline has received recently. Boarding at Heathrow was well organised due to their group boarding policy. On board I offered a pre Champagne or water. Once airborne choose gin and tonic and cashew nuts. Meal pre ordered online. I found all three courses exceptional and very well cooked. The staff worked tireless right through the twelve hour flight. I know BA Club Seats are not ideal, but nevertheless they are still more comfortable than any Middle East airline I have flown with. Entertainment screen poor and needs urgent revamp, cannot see when lights are on in the cabin. Very nice to see a mixed aged crew that shows ageism does not exist at BA. I also must commend BA for their quick medical response to a passenger who had collapsed on the aircraft in club. There rapid response and actions probably saved his life, I flew on 12 January. Well done BA, you are pulling

In [25]:
vect = TfidfVectorizer(min_df=5, stop_words='english', use_idf=True, ngram_range=(1,2)).fit(X_train)
len(vect.get_feature_names())

6127

In [27]:
X_train_vectorized = vect.transform(X_train)

# Classification


# Naive Bayes

In [29]:
mnb = MultinomialNB()

mnb.fit(X_train_vectorized, y_train) #membuat model
predictions = mnb.predict(vect.transform(X_test))
#evaluasi
print('AUC: ', roc_auc_score(y_test, predictions)) #data yang mirip-mirip antara 1 dan 0 Area Under Curve
print('Confusion Metrix: ', confusion_matrix(y_test, predictions)) #
print('ACC: ', accuracy_score(y_test, predictions)) #menghitung ketepatan/akurasi dari 653 


AUC:  0.8459423587409935
Confusion Metrix:  [[340  20]
 [ 74 219]]
ACC:  0.8560490045941807


# Visualize