In [1]:
import nltk
nltk.download('vader_lexicon')
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import os 

import warnings 
warnings.filterwarnings('ignore')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/stella/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
cwd = "/Users/stella/Downloads/Portfolio/Kenya-Airways-Analysis/"

In [3]:
data = pd.read_csv("data/merged_output.csv")

In [4]:
# Class to describe the data 
class Describer:
    
    # initialize object
    def __init__(self, data):
        self.data = data
        
    # method to check shape of data
    def shape(self):
        shape_data = print(f"The DataFrame has:\n\t* {self.data.shape[0]} rows\n\t* {self.data.shape[1]} columns", '\n')
        return shape_data
    
    # method to check info on dataset
    def data_info(self):
        info_data = print(self.data.info(), '\n')
        return info_data
    
    # method to describe numerical columns
    def data_describe(self):
        num_col = self.data.describe()
        return num_col

In [5]:
# Shape of the data 

# creating an instance of the class describer
describe_data = Describer(data)

# Viewing the shape of the data
describe_data.shape()

The DataFrame has:
	* 4917 rows
	* 7 columns 



In [6]:
# summary information of the data
print('Summary infomation on dataset')
print('-----------------------------------------------')
describe_data.data_info()

Summary infomation on dataset
-----------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4917 entries, 0 to 4916
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      4917 non-null   int64  
 1   reviews         4917 non-null   object 
 2   date            4917 non-null   object 
 3   verified        4917 non-null   bool   
 4   corpus          4917 non-null   object 
 5   sentiment       4917 non-null   float64
 6   sentiment_type  4917 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 235.4+ KB
None 



In [7]:
# summary statistics
describe_data.data_describe()

Unnamed: 0.1,Unnamed: 0,sentiment
count,4917.0,4917.0
mean,1046.166158,0.364721
std,672.692823,0.743467
min,0.0,-0.9978
25%,409.0,-0.4019
50%,1019.0,0.8271
75%,1634.0,0.9623
max,2274.0,0.9997


In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,reviews,date,verified,corpus,sentiment,sentiment_type
0,0,My family experienced very poor service with E...,2023-06-26,True,family experienced poor service emirate issue ...,-0.3804,negative
1,1,This review is to appreciate the service given...,2023-06-25,True,review appreciate service given emirate flying...,0.5994,positive
2,2,I'm motivated to write this critical review af...,2023-06-24,True,motivated write critical review multiple phone...,-0.95,negative
3,3,Excellent service. I flew Dubai- B'Ham- Dubai ...,2023-06-24,True,excellent service flew dubai b ham dubai econo...,0.9811,positive
4,4,What a disappointment. Emirates is lauded as h...,2023-06-17,True,disappointment emirate lauded one best infligh...,0.8525,positive


In [9]:
data.drop(columns=["Unnamed: 0"], inplace=True)

In [10]:
# Checking the drop of the column

data

Unnamed: 0,reviews,date,verified,corpus,sentiment,sentiment_type
0,My family experienced very poor service with E...,2023-06-26,True,family experienced poor service emirate issue ...,-0.3804,negative
1,This review is to appreciate the service given...,2023-06-25,True,review appreciate service given emirate flying...,0.5994,positive
2,I'm motivated to write this critical review af...,2023-06-24,True,motivated write critical review multiple phone...,-0.9500,negative
3,Excellent service. I flew Dubai- B'Ham- Dubai ...,2023-06-24,True,excellent service flew dubai b ham dubai econo...,0.9811,positive
4,What a disappointment. Emirates is lauded as h...,2023-06-17,True,disappointment emirate lauded one best infligh...,0.8525,positive
...,...,...,...,...,...,...
4912,Flew first class CAI-DOH on QR503 on 8th Aug 2...,2013-08-12,False,flew first class cai doh qr th aug qatar vip l...,0.9780,positive
4913,MAD-DOH-DXB and back. Boeing 777 in MAD-DOH se...,2013-08-06,False,mad doh dxb back boeing mad doh sector nice sp...,-0.9092,negative
4914,GVA-AUH A320. Good food but after the main mea...,2013-08-06,False,gva auh good food main meal nothing else serve...,0.6597,positive
4915,I took a flight to London on 17th July with my...,2013-08-06,False,took flight london th july year old son wife t...,-0.6256,negative


In [11]:
# Convert continuous sentiment scores to categorical labels
data['sentiment'] = data['sentiment'].apply(lambda x: 1 if x > 0.2 else (-1 if x < 0 else 0))
data['sentiment'].value_counts(normalize=True)

 1    0.667684
-1    0.306691
 0    0.025625
Name: sentiment, dtype: float64

In [12]:
# Training and Test split

X = data['corpus']  # Features
y = data['sentiment']   # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Feature Extraction
vect = CountVectorizer()
x_train_vect = vect.fit_transform(X_train)
x_test_vect = vect.transform(X_test)

## <b> Modeling </b>

Baseline model

In [14]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train_vect, y_train)

In [15]:
# Model Evaluation
y_pred = model.predict(x_test_vect)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8384146341463414
Classification Report:
              precision    recall  f1-score   support

          -1       0.77      0.75      0.76       307
           0       0.00      0.00      0.00        22
           1       0.87      0.91      0.89       655

    accuracy                           0.84       984
   macro avg       0.55      0.55      0.55       984
weighted avg       0.82      0.84      0.83       984



Based on this we note that the baseline model has an accuracy score of 83% 

We will try using other model to see if the scores improve by using other models

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [19]:
# Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train_vect, y_train)

# Model Evaluation

dt_y_pred = dt_model.predict(x_test_vect)
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Decision Tree Accuracy:", dt_accuracy)
print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_y_pred))


Decision Tree Accuracy: 0.7266260162601627
Decision Tree Classification Report:
              precision    recall  f1-score   support

          -1       0.59      0.59      0.59       307
           0       0.00      0.00      0.00        22
           1       0.81      0.81      0.81       655

    accuracy                           0.73       984
   macro avg       0.47      0.47      0.47       984
weighted avg       0.72      0.73      0.72       984



In [20]:
# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(x_train_vect, y_train)

# Model Evaluation

rf_y_pred = rf_model.predict(x_test_vect)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_y_pred))


Random Forest Accuracy: 0.7997967479674797
Random Forest Classification Report:
              precision    recall  f1-score   support

          -1       0.81      0.54      0.65       307
           0       0.00      0.00      0.00        22
           1       0.80      0.95      0.87       655

    accuracy                           0.80       984
   macro avg       0.54      0.50      0.51       984
weighted avg       0.78      0.80      0.78       984



In [21]:
# Support Vector Machine (SVM)
svm_model = SVC()
svm_model.fit(x_train_vect, y_train)

# Model Evaluation

svm_y_pred = svm_model.predict(x_test_vect)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:")
print(classification_report(y_test, svm_y_pred))

SVM Accuracy: 0.8211382113821138
SVM Classification Report:
              precision    recall  f1-score   support

          -1       0.77      0.67      0.72       307
           0       0.00      0.00      0.00        22
           1       0.84      0.92      0.88       655

    accuracy                           0.82       984
   macro avg       0.54      0.53      0.53       984
weighted avg       0.80      0.82      0.81       984



In [22]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('svm', SVC())
])

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predictions on the testing data
y_pred = pipeline.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8211382113821138
Classification Report:
              precision    recall  f1-score   support

          -1       0.77      0.67      0.72       307
           0       0.00      0.00      0.00        22
           1       0.84      0.92      0.88       655

    accuracy                           0.82       984
   macro avg       0.54      0.53      0.53       984
weighted avg       0.80      0.82      0.81       984



In [23]:
import pickle

with open('sentiment_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)