## Setup

In [None]:
import pandas as pd
import spacy
import re

import warnings

pd.set_option('display.max_columns', None)

nlp = spacy.load("en_core_web_sm")

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def preprocess_text_spacy(text):
    if pd.isna(text):
        return ""
    # 1. Minúsculas
    text = text.lower()
    # 2. Remoção de pontuação e números
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # 3. Processamento com spaCy (Tokenização, remoção de stopwords e lematização)
    doc = nlp(text)
    clean_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(clean_tokens)

In [3]:
def categorize_rating(rating):
    if rating < 4:
        return 'Negativo'
    elif 4 <= rating <= 7:
        return 'Neutro'
    else:
        return 'Positivo'

## 1. Entendimento e Processamento dos dados

* Visão geral dos dados
* Identificação e exclusão de dados duplicados
* Identificação de dados não validos de 'Overall_Rating' e exclusão destes
* Conversão do tipo de dado: Overall_Rating -> numeric; Review Date e Date Flown -> datetime
* Padronização do nome das features

In [4]:
df = pd.read_csv('../data/raw/Travel_Chalenge.csv', sep=';')
df.head(2)

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
0,Air Seychelles,8,Air Seychelles customer review,10th April 2011,LHR-SEZ-LHR. It's true that there aren't perso...,,,Economy Class,,,,,,,,
1,CityJet,1,CityJet customer review,10th April 2011,London City to Antwerp was delayed 3 hours 40 ...,,,Economy Class,,,,,,,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17702 entries, 0 to 17701
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Airline Name            17702 non-null  object 
 1   Overall_Rating          17702 non-null  object 
 2   Review_Title            17702 non-null  object 
 3   Review Date             17702 non-null  object 
 4   Review                  17702 non-null  object 
 5   Aircraft                5665 non-null   object 
 6   Type Of Traveller       15953 non-null  object 
 7   Seat Type               17461 non-null  object 
 8   Route                   15876 non-null  object 
 9   Date Flown              15944 non-null  object 
 10  Seat Comfort            15456 non-null  float64
 11  Cabin Staff Service     15366 non-null  float64
 12  Food & Beverages        11924 non-null  float64
 13  Ground Service          15056 non-null  float64
 14  Inflight Entertainment  9231 non-null 

In [6]:
((df.isnull().sum() / len(df)) * 100).round(2)

Airline Name               0.00
Overall_Rating             0.00
Review_Title               0.00
Review Date                0.00
Review                     0.00
Aircraft                  68.00
Type Of Traveller          9.88
Seat Type                  1.36
Route                     10.32
Date Flown                 9.93
Seat Comfort              12.69
Cabin Staff Service       13.20
Food & Beverages          32.64
Ground Service            14.95
Inflight Entertainment    47.85
Wifi & Connectivity       70.38
dtype: float64

In [7]:
df.duplicated().sum()

np.int64(29)

In [8]:
df[df.duplicated()].head()

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
514,Edelweiss Air,n,Edelweiss Air customer review,10th September 2003,I flew Edelweiss to Antalya a couple of days a...,,,,,,,,,,,
516,Kulula,n,Kulula customer review,10th September 2006,Great airline with its excellent prices and ca...,,,,,,,,,,,
1065,Luxair,n,Luxair customer review,11th October 2004,Economy from Manchester to Luxembourg. Very ni...,,,,,,,,,,,
1352,Luxair,4,Luxair customer review,12th July 2007,LUX-CDG on a ERJ and back on new Q400. As alwa...,,,,,,,,,,,
1549,Kulula,n,Kulula customer review,12th May 2007,Cape Town to Johannesburg. Very cheap ticket w...,,,,,,,,,,,


In [9]:
df['Review'].iloc[514]

'I flew Edelweiss to Antalya a couple of days ago and back on its only A-330. The service was excellent and cabin crew very friendly. More movies would have been great. The plane was flown by CEO Kistler on the leg to Antalya. Unfortunately check-in at Antalya took relatively long since only 3 counters were open for an A-330-flight.'

In [10]:
df[df.Review == 'I flew Edelweiss to Antalya a couple of days ago and back on its only A-330. The service was excellent and cabin crew very friendly. More movies would have been great. The plane was flown by CEO Kistler on the leg to Antalya. Unfortunately check-in at Antalya took relatively long since only 3 counters were open for an A-330-flight.']

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
513,Edelweiss Air,n,Edelweiss Air customer review,10th September 2003,I flew Edelweiss to Antalya a couple of days a...,,,,,,,,,,,
514,Edelweiss Air,n,Edelweiss Air customer review,10th September 2003,I flew Edelweiss to Antalya a couple of days a...,,,,,,,,,,,


In [11]:
df.drop_duplicates(inplace=True)
df.shape

(17673, 16)

In [12]:
df.sample(3)

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
1263,China Southern Airlines,1,"""a very pleasant long trip""",12th December 2022,I would like to thank China Southern Airline...,,Solo Leisure,Economy Class,Guangzhou to Toronto,December 2022,5.0,5.0,5.0,5.0,,
5429,Caribbean Airlines,8,Caribbean Airlines customer review,19th January 2015,Flew from Barbados to Kingston last week. All ...,,,Economy Class,,,4.0,4.0,3.0,,2.0,
6580,Germanwings,n,Germanwings customer review,20th December 2012,Flew to Cologne from LHR - return flight had t...,,,Economy Class,,,4.0,1.0,0.0,,0.0,


In [13]:
df[df['Wifi & Connectivity'].notnull()].sample(3)

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
162,CSA Czech Airlines,9,"""Staff was nice and friendly""",10th January 2020,Prague to Copenhagen. Flight was operated by ...,A319,Solo Leisure,Economy Class,Prague to Copenhagen,October 2019,5.0,5.0,4.0,5.0,2.0,1.0
4446,Air Astana,1,"""Food was perfect""",17th May 2019,St-Petersburg to Istanbul via Almaty. A21 n...,A321neo / A320.,Solo Leisure,Economy Class,St-Petersburg to Istanbul via Almaty,April 2019,5.0,5.0,5.0,5.0,4.0,4.0
7312,Vietnam Airlines,4,"""stop raising their voice""",21st January 2021,Cabin Crew attitude is hostile and disrespec...,Boeing 787,Business,Economy Class,Tân sơn Nhat to Hanoi,January 2021,2.0,1.0,3.0,3.0,4.0,3.0


In [14]:
df.describe()

Unnamed: 0,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
count,15456.0,15366.0,11924.0,15056.0,9231.0,5243.0
mean,2.598344,2.850709,2.536313,2.330433,2.244611,1.800877
std,1.456198,1.595379,1.513762,1.591097,1.486511,1.327095
min,0.0,0.0,0.0,1.0,0.0,0.0
25%,1.0,1.0,1.0,1.0,1.0,1.0
50%,3.0,3.0,2.0,1.0,2.0,1.0
75%,4.0,4.0,4.0,4.0,3.0,2.0
max,5.0,5.0,5.0,5.0,5.0,5.0


In [15]:
df.describe(include='object')

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown
count,17673,17673,17673,17673,17673,5665,15953,17461,15876,15944
unique,182,10,14268,3679,17670,835,4,4,11294,108
top,CityJet,1,Onur Air customer review,16th July 2023,Very attentive staff at airport check in and...,A320,Solo Leisure,Economy Class,Melbourne to Sydney,June 2023
freq,100,9370,84,64,2,768,5780,14987,38,998


In [16]:
df[df.duplicated(subset=['Review'], keep=False)][['Airline Name', 'Review_Title', 'Overall_Rating','Review', 'Review Date', 'Type Of Traveller', 'Route', 'Aircraft','Date Flown']]

Unnamed: 0,Airline Name,Review_Title,Overall_Rating,Review,Review Date,Type Of Traveller,Route,Aircraft,Date Flown
9167,China Southern Airlines,"""Very attentive staff at airport""",1,Very attentive staff at airport check in and...,24th March 2022,Solo Leisure,Guangzhou to Auckland,,March 2022
10168,Tunisair,Tunisair customer review,2,LHR-TUN return to LHR. Aircraft was A320 on bo...,26th July 2012,,,,
13004,Alaska Airlines,"""Not a great airline at all""",1,Terrible attitude from the lady at 11am at g...,30th January 2023,Family Leisure,San Francisco to Newark,Boeing 737-900,January 2023
13005,Alaska Airlines,"""flight attendants did the bare minimum""",1,Terrible attitude from the lady at 11am at g...,30th January 2023,Family Leisure,San Francisco to Newark,Boeing 737-900,January 2023
13448,China Southern Airlines,"""happily fly this airline again""",8,Very attentive staff at airport check in and...,31st December 2022,Solo Leisure,Guangzhou to Paris,,December 2022
17161,Tunisair,Tunisair customer review,9,LHR-TUN return to LHR. Aircraft was A320 on bo...,9th April 2012,,,,


In [17]:
df.columns

Index(['Airline Name', 'Overall_Rating', 'Review_Title', 'Review Date',
       'Review', 'Aircraft', 'Type Of Traveller', 'Seat Type', 'Route',
       'Date Flown', 'Seat Comfort', 'Cabin Staff Service', 'Food & Beverages',
       'Ground Service', 'Inflight Entertainment', 'Wifi & Connectivity'],
      dtype='object')

In [18]:
df.drop_duplicates(subset=['Airline Name', 'Review', 'Overall_Rating', 'Date Flown', 'Route', 'Aircraft'], keep='last', inplace=True)
df.describe(include='object')

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown
count,17672,17672,17672,17672,17672,5664,15952,17460,15875,15943
unique,182,10,14267,3679,17670,835,4,4,11294,108
top,CityJet,1,Onur Air customer review,16th July 2023,Very attentive staff at airport check in and...,A320,Solo Leisure,Economy Class,Melbourne to Sydney,June 2023
freq,100,9369,84,64,2,768,5780,14986,38,998


In [19]:
df.Overall_Rating.value_counts(normalize=True).round(2) * 100

Overall_Rating
1    53.0
2    11.0
9     7.0
8     7.0
3     6.0
7     5.0
4     3.0
5     3.0
6     3.0
n     2.0
Name: proportion, dtype: float64

In [20]:
df[df.Overall_Rating == 'n'].head(3)

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity
74,Aerolineas Argentinas,n,Aerolineas Argentinas customer review,10th December 2014,EZE-JFK. The flight was supposed to leave at 1...,,,Economy Class,,,1.0,1.0,2.0,,4.0,
389,Regional Express,n,Regional Express customer review,10th May 2005,Flew REX from Sydney to Lismore January 2005. ...,,,,,,,,,,,
394,US Airways,n,US Airways customer review,10th May 2015,My wife and I decided to travel to Anguilla fo...,,,Economy Class,,,2.0,1.0,1.0,,0.0,


In [21]:
df.drop(df[df.Overall_Rating == 'n'].index, inplace=True)
df.shape

(17367, 16)

In [None]:
df['Review Date'] = pd.to_datetime(df['Review Date'], errors='coerce')
df['Date Flown'] = pd.to_datetime(df['Date Flown'], errors='coerce')

df['Overall_Rating'] = pd.to_numeric(df['Overall_Rating'], errors='coerce')

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17367 entries, 0 to 17701
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Airline Name            17367 non-null  object        
 1   Overall_Rating          17367 non-null  int64         
 2   Review_Title            17367 non-null  object        
 3   Review Date             13557 non-null  datetime64[ns]
 4   Review                  17367 non-null  object        
 5   Aircraft                5664 non-null   object        
 6   Type Of Traveller       15952 non-null  object        
 7   Seat Type               17249 non-null  object        
 8   Route                   15875 non-null  object        
 9   Date Flown              15943 non-null  datetime64[ns]
 10  Seat Comfort            15315 non-null  float64       
 11  Cabin Staff Service     15225 non-null  float64       
 12  Food & Beverages        11783 non-null  float64    

In [24]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('&', 'e').str.replace(r'[^\w]', '', regex=True)
df.columns

Index(['airline_name', 'overall_rating', 'review_title', 'review_date',
       'review', 'aircraft', 'type_of_traveller', 'seat_type', 'route',
       'date_flown', 'seat_comfort', 'cabin_staff_service', 'food_e_beverages',
       'ground_service', 'inflight_entertainment', 'wifi_e_connectivity'],
      dtype='object')

## 2- Processamento Texto

* Limpeza dos campos review e review_title

    * Conversão para minúscula
    * Remoção de pontuação e números
    * Remoção de stop-words e lematização


In [26]:
# Aplicação da função de pré-processamento nos textos das colunas 'review' e 'review_title'
df["review_clean"] = df["review"].apply(preprocess_text_spacy)
df["review_title_clean"] = df["review_title"].apply(preprocess_text_spacy)

df[["review", "review_clean", "review_title", "review_title_clean"]].head()


Unnamed: 0,review,review_clean,review_title,review_title_clean
0,LHR-SEZ-LHR. It's true that there aren't perso...,lhrsezlhr true not personal tv like virgin ba ...,Air Seychelles customer review,air seychelle customer review
1,London City to Antwerp was delayed 3 hours 40 ...,london city antwerp delay hour min sure tr...,CityJet customer review,cityjet customer review
2,PEK to HGH in Economy. Slight delay out of PEK...,pek hgh economy slight delay pek uneventful fl...,Shanghai Airlines customer review,shanghai airlines customer review
3,Rio to Buenos Aires AEP on 737. Departure dela...,rio buenos aire aep departure delay min arri...,Aerolineas Argentinas customer review,aerolineas argentinas customer review
4,Outbound flight FRA/PRN A319. 2 hours 10 min f...,outbound flight fraprn hour min flight thi...,Adria Airways customer review,adria airways customer review


## 3. Feature Engineering

* Quebra das colunas de data
    * Dia da semana
    * Ano-Mês
    * Ano
    * Mês
    * Dias de diferença entre voo e avaliação
* Quebra da feature overall_rating
    * Criação de grupos: Negativo(-1), Neutro (0) e Positivo (1)
* Concatenação de review_title e review
* Criação de feature que indica atraso no voo

In [27]:
# Criar coluna com o dia da semana
df['review_day_of_week'] = df['review_date'].dt.day_name()
df['date_flown_day_of_week'] = df['date_flown'].dt.day_name()

# Criar coluna com ano-mês
df['review_year_month'] = df['review_date'].dt.to_period('M')
df['date_flown_year_month'] = df['date_flown'].dt.to_period('M')

# Criar coluna com ano
df['review_year'] = df['review_date'].dt.year
df['date_flown_year'] = df['date_flown'].dt.year

# Criar coluna com mês
df['review_month'] = df['review_date'].dt.month
df['date_flown_month'] = df['date_flown'].dt.month

# Criar coluna com a diferença de dias entre data do voo e data do review
df['days_difference'] = (df['review_date'] - df['date_flown']).dt.days

df[['review_day_of_week', 'date_flown_day_of_week', 'review_year_month', 'date_flown_year_month', 
    'review_year', 'date_flown_year', 'review_month', 'date_flown_month', 'days_difference']].head()

Unnamed: 0,review_day_of_week,date_flown_day_of_week,review_year_month,date_flown_year_month,review_year,date_flown_year,review_month,date_flown_month,days_difference
0,Sunday,,2011-04,NaT,2011.0,,4.0,,
1,Sunday,,2011-04,NaT,2011.0,,4.0,,
2,Tuesday,,2012-04,NaT,2012.0,,4.0,,
3,Thursday,,2014-04,NaT,2014.0,,4.0,,
4,Friday,,2015-04,NaT,2015.0,,4.0,,


In [28]:
df[df.days_difference.notnull()].head(3)

Unnamed: 0,airline_name,overall_rating,review_title,review_date,review,aircraft,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_e_beverages,ground_service,inflight_entertainment,wifi_e_connectivity,review_clean,review_title_clean,review_day_of_week,date_flown_day_of_week,review_year_month,date_flown_year_month,review_year,date_flown_year,review_month,date_flown_month,days_difference
6,Cubana Airlines,2,"""wouldn't recommend Cubana Airlines""",2016-04-10,"Flew from Varadero to Toronto April 8, 2016....",A320,Family Leisure,Economy Class,VRA to YYZ,2016-04-01,2.0,3.0,3.0,2.0,,,fly varadero toronto april flight sched...,not recommend cubana airline,Sunday,Friday,2016-04,2016-04,2016.0,2016.0,4.0,4.0,9.0
7,SilkAir,3,"""budget airline quality""",2016-04-10,Singapore to Kunming with SilkAir. Singapore...,,Couple Leisure,Economy Class,SIN to KMG,2015-12-01,1.0,4.0,1.0,5.0,1.0,,singapore kunme silkair singapore airlines ...,budget airline quality,Sunday,Tuesday,2016-04,2015-12,2016.0,2015.0,4.0,12.0,131.0
8,Air Berlin,1,"""worst service in business class""",2017-04-10,New York JFK to Duesseldorf. The worst servi...,,Couple Leisure,Business Class,New York JFK to Duesseldorf,2017-04-01,1.0,1.0,1.0,1.0,1.0,,new york jfk duesseldorf bad service busine...,bad service business class,Monday,Saturday,2017-04,2017-04,2017.0,2017.0,4.0,4.0,9.0


In [29]:
# Aplicação da função de categorização de rating
df['rating_category'] = df['overall_rating'].apply(categorize_rating)

df[['overall_rating', 'rating_category']].head()

Unnamed: 0,overall_rating,rating_category
0,8,Positivo
1,1,Negativo
2,9,Positivo
3,8,Positivo
4,7,Neutro


In [30]:
rating_mapping = {'Negativo': -1, 'Neutro': 0, 'Positivo': 1}
df['rating_category_numeric'] = df['rating_category'].map(rating_mapping)

df[['overall_rating', 'rating_category', 'rating_category_numeric']].head()

Unnamed: 0,overall_rating,rating_category,rating_category_numeric
0,8,Positivo,1
1,1,Negativo,-1
2,9,Positivo,1
3,8,Positivo,1
4,7,Neutro,0


In [31]:
df['full_review_text'] = df['review_title'].fillna('') + ' ' + df['review'].fillna('')
df.head(3)

Unnamed: 0,airline_name,overall_rating,review_title,review_date,review,aircraft,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_e_beverages,ground_service,inflight_entertainment,wifi_e_connectivity,review_clean,review_title_clean,review_day_of_week,date_flown_day_of_week,review_year_month,date_flown_year_month,review_year,date_flown_year,review_month,date_flown_month,days_difference,rating_category,rating_category_numeric,full_review_text
0,Air Seychelles,8,Air Seychelles customer review,2011-04-10,LHR-SEZ-LHR. It's true that there aren't perso...,,,Economy Class,,NaT,,,,,,,lhrsezlhr true not personal tv like virgin ba ...,air seychelle customer review,Sunday,,2011-04,NaT,2011.0,,4.0,,,Positivo,1,Air Seychelles customer review LHR-SEZ-LHR. It...
1,CityJet,1,CityJet customer review,2011-04-10,London City to Antwerp was delayed 3 hours 40 ...,,,Economy Class,,NaT,,,,,,,london city antwerp delay hour min sure tr...,cityjet customer review,Sunday,,2011-04,NaT,2011.0,,4.0,,,Negativo,-1,CityJet customer review London City to Antwerp...
2,Shanghai Airlines,9,Shanghai Airlines customer review,2012-04-10,PEK to HGH in Economy. Slight delay out of PEK...,,,Economy Class,,NaT,3.0,4.0,4.0,,2.0,,pek hgh economy slight delay pek uneventful fl...,shanghai airlines customer review,Tuesday,,2012-04,NaT,2012.0,,4.0,,,Positivo,1,Shanghai Airlines customer review PEK to HGH i...


In [32]:
delay_keywords = [
    r"\bdelay(ed|s)?\b",          # delay, delayed, delays
    r"\blate\b",                  # late
    r"\blateness\b",              # lateness
    r"\blong wait\b",             # long wait
    r"\bwaiting time\b",          # waiting time
    r"\bwait(ed|ing)?\b",         # wait, waited, waiting
    r"\bpostpone(d)?\b",          # postpone, postponed
    r"\breschedul(ed|ing|e)?\b",  # reschedule, rescheduled
    r"\bflight cancel(l)?ed\b"    # canceled or cancelled (US/UK spelling)
]

In [33]:
pattern = re.compile('|'.join(delay_keywords), flags=re.IGNORECASE)

df['is_delay'] = df['full_review_text'].apply(lambda x: 1 if pattern.search(x) else 0)

df.head(3)

Unnamed: 0,airline_name,overall_rating,review_title,review_date,review,aircraft,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,food_e_beverages,ground_service,inflight_entertainment,wifi_e_connectivity,review_clean,review_title_clean,review_day_of_week,date_flown_day_of_week,review_year_month,date_flown_year_month,review_year,date_flown_year,review_month,date_flown_month,days_difference,rating_category,rating_category_numeric,full_review_text,is_delay
0,Air Seychelles,8,Air Seychelles customer review,2011-04-10,LHR-SEZ-LHR. It's true that there aren't perso...,,,Economy Class,,NaT,,,,,,,lhrsezlhr true not personal tv like virgin ba ...,air seychelle customer review,Sunday,,2011-04,NaT,2011.0,,4.0,,,Positivo,1,Air Seychelles customer review LHR-SEZ-LHR. It...,0
1,CityJet,1,CityJet customer review,2011-04-10,London City to Antwerp was delayed 3 hours 40 ...,,,Economy Class,,NaT,,,,,,,london city antwerp delay hour min sure tr...,cityjet customer review,Sunday,,2011-04,NaT,2011.0,,4.0,,,Negativo,-1,CityJet customer review London City to Antwerp...,1
2,Shanghai Airlines,9,Shanghai Airlines customer review,2012-04-10,PEK to HGH in Economy. Slight delay out of PEK...,,,Economy Class,,NaT,3.0,4.0,4.0,,2.0,,pek hgh economy slight delay pek uneventful fl...,shanghai airlines customer review,Tuesday,,2012-04,NaT,2012.0,,4.0,,,Positivo,1,Shanghai Airlines customer review PEK to HGH i...,1


In [34]:
len(df[df['is_delay'] == 1]) / len(df) * 100

40.28905395289918

In [35]:
df.to_parquet('../data/processed/abt_airline_reviews.parquet', index=False)