# Configurações iniciais

In [27]:
# LIBS
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, label_binarize
import re
import warnings 
from sklearn.base import BaseEstimator, TransformerMixin
warnings.filterwarnings('ignore') 
import seaborn as sns
import joblib

# CONFIGURATION

pd.set_option('display.max_rows',100) # Maximum number of rows shown when a DataFrame is displayed
pd.set_option('display.max_columns',100) # Maximum number of columns shown when a DataFrame is displayed
pd.set_option('display.width', 100) # Display width used when DataFrames are printed
pd.set_option('display.float_format', lambda x: '%.4f' % x) # Show floats with 4 decimal places
# Sharper inline figures. The magic line below is left untouched on purpose:
# everything after %config, including the trailing text, is handed to the magic.
%config InlineBackend.figure_format = 'retina' # Ajustando a nitidez dos gráficos
SEED = 27 # Seed used to make pseudo-random steps reproducible

# DIRECTORIES

project_data_path = '../data' # Dataset directory
model_path = '../model' # Directory of the saved model
src_path = '../transformers' 
# Prepended (position 0) so modules under src_path are found first on import
sys.path.insert(0, src_path)

# NOTE(review): these classes are not referenced directly in this notebook; presumably
# they must be importable so the saved pipeline can be unpickled — confirm.
from transformers import CombineTextColumns, FullTextPreprocessor, CustomImputer

# Carregando modelo e dados

In [28]:
# Load the saved pipeline from the model directory.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
model_file = os.path.join(model_path, 'model.pkl')
model = joblib.load(model_file)

In [29]:
# Read the raw dataset: file 'Travel_Chalenge.csv' in the 'raw' folder under 'project_data_path'

df = pd.read_csv(os.path.join(project_data_path, 'raw', 'Travel_Chalenge.csv'), sep = ';')
df.sample(3, random_state = SEED) # Preview of 3 rows; seeded so every run shows the same rows

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
6012,Rwandair,7,"""flight attendants full of smiles""",1st February 2016,I flew with Rwandair for the first time in Jul...,,Business,Business Class,DAR to KGL,July 2015,3.0,4.0,3.0,3.0,1.0,1.0
11143,Ethiopian Airlines,7,"""helped us when we needed it most""",28th April 2022,"My dad, sister, uncle and I are traveling to...",,Family Leisure,Economy Class,Washington to Harare via Addis Ababa,April 2022,2.0,4.0,4.0,5.0,2.0,1.0
13042,Sun Country Airlines,5,"""Continue doing an amazing job""",30th July 2021,Thanks for having staff like Khiry Morgan. T...,,Family Leisure,Economy Class,Minneapolis to Los Angeles,July 2021,2.0,3.0,,1.0,1.0,1.0


Para verificar o impacto de atrasos de viagem no NPS, resolvi buscar essa informação diretamente nos textos das reviews e a partir disso realizar as seguintes etapas:

- Filtrar na base de teste 3 companhias aéreas
- Com o modelo de machine learning (model.predict()) prever as labels (negativo, neutro, positivo)
- Calcular NPS (%positivos-%negativos) usando a previsão dada pelo modelo
- Separar os comentários em dois grupos: os que contêm "delay" no texto e os que não contêm
- Fazer os cálculos de NPS
- Mostrar a diferença / impacto do atraso no NPS.

Além disso, para interpretar o valor do NPS, adotei a classificação:

- **Promotores (Pontuação 9-10)**: Clientes entusiasmados que são propensos a recomendar a empresa e a se engajar mais profundamente.
- **Neutros (Pontuação 7-8)**: Clientes satisfeitos, mas não tão entusiásticos a ponto de recomendar ativamente.
- **Detratores (Pontuação 0-6)**: Clientes insatisfeitos que podem divulgar uma opinião negativa e prejudicar a reputação da empresa.

# Sorteando 3 companhias aéreas

In [30]:
# Draw 3 airlines and keep only their reviews.
# NOTE(review): the draw picks 3 *rows* of the 'Airline Name' column, so airlines with
# more reviews are more likely to be chosen and the same airline could be drawn twice.
# The draw is kept as is because later cells filter by the resulting airline names.
companys = df['Airline Name'].sample(3, random_state = SEED).values

# df is overwritten here, so re-running this cell alone draws from the already-filtered frame.
df = df[df['Airline Name'].isin(companys)].reset_index(drop = True)
df.sample(3, random_state = SEED) # Preview of 3 rows; seeded so every run shows the same rows

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
82,Condor Airlines,1,"""Never travelled in such poor conditions and f...",18th June 2022,Awful. Impossible to check in online. 20 min ...,,Couple Leisure,Economy Class,Toronto to Paris via Frankfurt,June 2022,1.0,1.0,1.0,1.0,1.0,1.0
146,Condor Airlines,1,"""baggage has still not been located""",22nd June 2022,Traveled for a family camping trip and Condo...,,Couple Leisure,Economy Class,Amsterdam to Portland via Frankfurt,June 2022,1.0,1.0,1.0,1.0,1.0,1.0
133,Condor Airlines,2,"""customer service was pathetically bad""",21st July 2022,Everything written above about Condor is tru...,,Family Leisure,Economy Class,Helsinki to Havana via Frankfurt,June 2022,1.0,2.0,1.0,1.0,1.0,1.0


In [31]:
df['Airline Name'].unique() # Airlines that were drawn

array(['Jet2.com', 'Condor Airlines', 'Air Canada'], dtype=object)

# Tratando variável `Overall_Rating` 

In [32]:
# Clean the target column: rows holding the placeholder 'n' receive the median of the
# remaining ratings (truncated to an integer), then the whole column is cast to int.
is_placeholder = df['Overall_Rating'] == 'n'
median_rating = int(df.loc[~is_placeholder, 'Overall_Rating'].astype(int).median())
df.loc[is_placeholder, 'Overall_Rating'] = median_rating
df['Overall_Rating'] = df['Overall_Rating'].astype(int)

# Avaliando atraso (delay) no geral

Avaliando média de `Overall_Rating` com base nas Reviews que possuem "delay" no texto. 

In [34]:
# With delay: mean rating of the reviews whose text contains 'delay'.
# na=False treats a missing review text as "no match" instead of leaving a NaN in the
# mask, which boolean indexing rejects.
# NOTE(review): the match is case-sensitive, so 'Delay' / 'DELAYED' are not counted — confirm this is intended.
df.loc[df['Review'].str.contains('delay', na = False), 'Overall_Rating'].mean()

np.float64(1.8631578947368421)

In [35]:
# Without delay: mean rating of the reviews whose text does not contain 'delay'.
# na=False keeps the mask strictly boolean, so a missing review text counts as "no delay"
# and the negation below stays valid.
df.loc[~df['Review'].str.contains('delay', na = False), 'Overall_Rating'].mean()

np.float64(2.292682926829268)

Percebe-se que, na base sem as reviews relacionadas a 'delay', a média é cerca de 0,43 ponto maior (2,29 contra 1,86).

In [36]:
# Overall: mean rating over all reviews of the drawn airlines
df['Overall_Rating'].mean()

np.float64(2.1566666666666667)

Aparentemente no geral, reviews com 'delay' no texto parecem impactar negativamente a média de `Overall_Rating` para essa amostra de companhias. 

## Utilizando o modelo para classificar reviews

In [37]:
# Independent copy holding only the two text columns
df_aux = df[['Review', 'Review_Title']].copy()
df_aux.sample(3, random_state = SEED) # Preview of 3 rows; seeded so every run shows the same rows

Unnamed: 0,Review,Review_Title
288,I am disgusted with this company. I have flo...,"""disgusted with this company"""
13,We checked in for our business class flight ...,"""never set foot on a Condor flight again"""
107,Stansted to Thessaloniki. Booked holiday thr...,"""unwilling to refund deposit"""


In [38]:
# Predict one class label per row of df_aux with the loaded pipeline
y_pred = model.predict(df_aux)

In [39]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [41]:
# Translate the predicted class codes into sentiment labels.
# The Series is built on df_aux's own index so the assignment lines up row by row
# even when df_aux does not carry a default 0..n-1 index.
df_aux['sentimental_predict'] = pd.Series(y_pred, index = df_aux.index).map({0: 'negative', 1: 'neutral', 2: 'positive'})

In [49]:
df_aux

Unnamed: 0,Review,Review_Title,sentimental_predict
0,Barcelona to Manchester with Jet2.com. First t...,"""Service was great""",negative
1,Frankfurt to Cancun. Misleading upgrading pr...,"""Misleading upgrading process""",negative
2,Manchester to Funchal return on rather elder...,"""On board service was good""",negative
3,I think this is the 5th time I’ve flown with ...,"""Air Canada is inconsistent""",negative
4,Air Canada cancelled our flight two hours be...,"""cancelled our flight two hours before""",negative
...,...,...,...
295,The customer service agent at the Montreal-Tr...,"""This airline needs to wake up""",negative
296,Frankfurt to Cancun. Never fly with Condor a...,"""everything like a budget flight""",negative
297,"So have used Jet2, recommended them to family...","""They again trotted out the same excuse""\r\n**...",negative
298,I received an email 1 hour before the flight...,"""flight got postponed for 24 hours""",negative


In [55]:
# Attach the predictions to the full frame by row index.
# df_aux was copied from df, so both share the same index; joining on it avoids matching
# rows by free text, where repeated reviews would multiply rows.
# Columns already present in df_aux are dropped from df first, which keeps the original
# column order and lets the cell be re-run without a column-overlap error.
df = df_aux.join(df.drop(columns = df_aux.columns, errors = 'ignore'))

In [56]:
df.head()

Unnamed: 0,Review,Review_Title,sentimental_predict,Airline Name,Overall_Rating,Review Date,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
0,Barcelona to Manchester with Jet2.com. First t...,"""Service was great""",negative,Jet2.com,1,10th December 2019,,Couple Leisure,Economy Class,Barcelona to Manchester,November 2019,4.0,5.0,4.0,5.0,,
1,Frankfurt to Cancun. Misleading upgrading pr...,"""Misleading upgrading process""",negative,Condor Airlines,1,10th February 2020,Boeing 767-300ER,Couple Leisure,Premium Economy,Frankfurt to Cancun,January 2020,1.0,1.0,2.0,1.0,1.0,
2,Manchester to Funchal return on rather elder...,"""On board service was good""",negative,Jet2.com,7,10th January 2019,Boeing 757-200,Couple Leisure,Economy Class,Funchal to Manchester,January 2019,2.0,4.0,3.0,4.0,,
3,I think this is the 5th time I’ve flown with ...,"""Air Canada is inconsistent""",negative,Air Canada,1,10th July 2023,,Business,Economy Class,Vancouver to Edmonton,July 2023,1.0,2.0,,1.0,1.0,1.0
4,Air Canada cancelled our flight two hours be...,"""cancelled our flight two hours before""",negative,Air Canada,1,10th July 2023,,Family Leisure,Economy Class,Calgary to Los Angeles,July 2023,,,,1.0,,


In [61]:
# One subset of df per drawn airline (the names are the ones produced by the draw above)

company1, company2, company3 = (
    df[df['Airline Name'] == airline]
    for airline in ('Jet2.com', 'Condor Airlines', 'Air Canada')
)

In [63]:
# NPS

def calc_nps(data):
    """Compute an NPS-style score from predicted sentiment labels.

    The score is the share of rows labelled 'positive' minus the share of rows
    labelled 'negative' in the 'sentimental_predict' column, so it lies in [-1, 1].

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'sentimental_predict' column.

    Returns
    -------
    float
        positives / total - negatives / total, or NaN when `data` has no rows
        (the score is undefined for an empty frame; this used to raise
        ZeroDivisionError).
    """
    total = len(data)
    if total == 0:
        # No reviews in this subset (e.g. an airline without any matching review)
        return float('nan')
    labels = data['sentimental_predict']
    # int(...) keeps the result a plain Python float, as before
    pct_positives = int((labels == 'positive').sum()) / total
    pct_negatives = int((labels == 'negative').sum()) / total
    return pct_positives - pct_negatives

## Calculando NPS para companhia 1 (Jet2.com)

In [64]:
# Overall NPS
calc_nps(company1)

-0.72

No caso da companhia Jet2.com, o NPS indica que há mais detratores do que promotores, o que pode sinalizar problemas com a companhia ou serviços.

In [73]:
# NPS for reviews with and without 'delay' in the text (case-sensitive substring match)

# The mask is built once; na=False makes a missing review text count as "no delay"
# instead of leaving a NaN in the mask, which boolean indexing rejects.
has_delay_company1 = company1['Review'].str.contains('delay', na = False)
reviews_delay_company1 = company1[has_delay_company1]
reviews_nodelay_company1 = company1[~has_delay_company1]

# (NPS with delay, NPS without delay)

calc_nps(reviews_delay_company1), calc_nps(reviews_nodelay_company1)

(-1.0, -0.6499999999999999)

In [74]:
calc_nps(reviews_delay_company1) - calc_nps(reviews_nodelay_company1)

-0.3500000000000001

Nota-se que as reviews que contêm 'delay' no texto concentram bem mais detratores do que as reviews sem essa palavra-chave.

## Calculando NPS para companhia 2 (Condor Airlines)

In [69]:
# Overall NPS
calc_nps(company2)

-0.9099999999999999

In [75]:
# NPS for reviews with and without 'delay' in the text (case-sensitive substring match)

# The mask is built once; na=False makes a missing review text count as "no delay"
# instead of leaving a NaN in the mask, which boolean indexing rejects.
has_delay_company2 = company2['Review'].str.contains('delay', na = False)
reviews_delay_company2 = company2[has_delay_company2]
reviews_nodelay_company2 = company2[~has_delay_company2]

# (NPS with delay, NPS without delay)

calc_nps(reviews_delay_company2), calc_nps(reviews_nodelay_company2)

(-0.9428571428571428, -0.8923076923076922)

Mais um caso de NPS menor para reviews que contêm 'delay' (-0,94 contra -0,89).

## Calculando NPS para companhia 3 (Air Canada)

In [71]:
# Overall NPS
calc_nps(company3)

-0.94

In [76]:
# NPS for reviews with and without 'delay' in the text (case-sensitive substring match)

# The mask is built once; na=False makes a missing review text count as "no delay"
# instead of leaving a NaN in the mask, which boolean indexing rejects.
has_delay_company3 = company3['Review'].str.contains('delay', na = False)
reviews_delay_company3 = company3[has_delay_company3]
reviews_nodelay_company3 = company3[~has_delay_company3]

# (NPS with delay, NPS without delay)

calc_nps(reviews_delay_company3), calc_nps(reviews_nodelay_company3)

(-1.0, -0.8999999999999999)

Outro caso onde reviews com 'delay' possuem menor NPS. 

# Conclusão

Como conclusão, o uso do modelo para estimar os sentimentos dos textos funcionou bem e parece adequado para estimar o NPS das companhias.  
Além disso, de acordo com o NPS calculado, reviews que possuem 'delay' (atraso) no texto realmente possuem um valor de NPS menor em comparação às reviews sem 'delay' e à base geral, indicando que **os atrasos impactam diretamente no NPS das companhias.**