In [1]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r'BA_Feedbacks.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Reviews,Stars(Ratings),Date,Country
0,0,✅ Trip Verified | Booked a flight from Buchar...,5,12th November 2023,Romania
1,1,✅ Trip Verified | Booked online months ago an...,1,8th November 2023,United Kingdom
2,2,✅ Trip Verified | The flight was on time. The...,8,7th November 2023,United Kingdom
3,3,"Not Verified | Angry, disappointed, and unsat...",7,5th November 2023,Italy
4,4,"✅ Trip Verified | As an infrequent flyer, Bri...",2,5th November 2023,United Kingdom


In [3]:
df.shape

(4000, 5)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      4000 non-null   int64 
 1   Reviews         4000 non-null   object
 2   Stars(Ratings)  4000 non-null   int64 
 3   Date            4000 non-null   object
 4   Country         4000 non-null   object
dtypes: int64(2), object(3)
memory usage: 156.4+ KB


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,4000.0,1999.5,1154.844867,0.0,999.75,1999.5,2999.25,3999.0
Stars(Ratings),4000.0,4.1065,3.098477,1.0,1.0,3.0,7.0,10.0


In [6]:
df.isnull().sum()

Unnamed: 0        0
Reviews           0
Stars(Ratings)    0
Date              0
Country           0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
df.drop(columns = 'Unnamed: 0', inplace = True, axis = 1)

In [9]:
df

Unnamed: 0,Reviews,Stars(Ratings),Date,Country
0,✅ Trip Verified | Booked a flight from Buchar...,5,12th November 2023,Romania
1,✅ Trip Verified | Booked online months ago an...,1,8th November 2023,United Kingdom
2,✅ Trip Verified | The flight was on time. The...,8,7th November 2023,United Kingdom
3,"Not Verified | Angry, disappointed, and unsat...",7,5th November 2023,Italy
4,"✅ Trip Verified | As an infrequent flyer, Bri...",2,5th November 2023,United Kingdom
...,...,...,...,...
3995,✅ Verified Review | Lisbon to London with Brit...,3,14th November 2016,Portugal
3996,✅ Verified Review | The flight started in the...,1,13th November 2016,United Kingdom
3997,✅ Verified Review | The flight started badly ...,1,12th November 2016,United Kingdom
3998,Gatwick to Alicante. Crew friendly but a bit a...,9,11th November 2016,United Kingdom


In [10]:
df.dtypes

Reviews           object
Stars(Ratings)     int64
Date              object
Country           object
dtype: object

In [11]:
len(df['Country'].value_counts())

66

In [12]:
result = df.groupby('Date')['Country'].value_counts().reset_index(name = 'Count')
result = result.sort_values(by = ['Date', 'Count'], ascending = [True, False])
print(result[:30])

                  Date         Country  Count
0      10th April 2017       Australia      2
1      10th April 2017  United Kingdom      2
2      10th April 2018  United Kingdom      2
3      10th April 2019  United Kingdom      2
4      10th April 2023  United Kingdom      2
5     10th August 2019       Australia      2
6     10th August 2022   United States      2
7   10th December 2016           China      2
8   10th December 2016  United Kingdom      2
9   10th December 2017  United Kingdom      4
10  10th December 2018        Malaysia      2
11  10th December 2019         Ireland      2
12  10th December 2022  United Kingdom      4
13  10th February 2017           Spain      2
14  10th February 2018  United Kingdom      2
15  10th February 2019  United Kingdom      2
16  10th February 2021        Bulgaria      2
17   10th January 2017  United Kingdom      4
18   10th January 2019   United States      4
19   10th January 2022    South Africa      2
20      10th July 2017  United Kin

In [13]:
df.rename(columns = {'Stars(Ratings)' : 'Ratings'}, inplace = True)

# Now let's Clean the Reviews column

In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

df_corpus = []

for i in range(0, 4000):
    review = re.sub('[^a-zA-z]', ' ', df['Reviews'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    df_corpus.append(review)

In [15]:
df_new = pd.DataFrame(df_corpus)

In [16]:
df_new.rename(columns = {0 : 'New_Reviews'}, inplace = True)

In [17]:
df_new

Unnamed: 0,New_Reviews
0,trip verifi book flight bucharest manchest h l...
1,trip verifi book onlin month ago hitch replac ...
2,trip verifi flight time crew polit stori outwa...
3,not verifi angri disappoint unsatisfi rout lon...
4,trip verifi infrequ flyer british airway alway...
...,...
3995,verifi review lisbon london british airway nic...
3996,verifi review flight start first class concord...
3997,verifi review flight start badli call new conc...
3998,gatwick alicant crew friendli bit amateur busi...


In [18]:
df = pd.concat([df, df_new], axis=1)
df

Unnamed: 0,Reviews,Ratings,Date,Country,New_Reviews
0,✅ Trip Verified | Booked a flight from Buchar...,5,12th November 2023,Romania,trip verifi book flight bucharest manchest h l...
1,✅ Trip Verified | Booked online months ago an...,1,8th November 2023,United Kingdom,trip verifi book onlin month ago hitch replac ...
2,✅ Trip Verified | The flight was on time. The...,8,7th November 2023,United Kingdom,trip verifi flight time crew polit stori outwa...
3,"Not Verified | Angry, disappointed, and unsat...",7,5th November 2023,Italy,not verifi angri disappoint unsatisfi rout lon...
4,"✅ Trip Verified | As an infrequent flyer, Bri...",2,5th November 2023,United Kingdom,trip verifi infrequ flyer british airway alway...
...,...,...,...,...,...
3995,✅ Verified Review | Lisbon to London with Brit...,3,14th November 2016,Portugal,verifi review lisbon london british airway nic...
3996,✅ Verified Review | The flight started in the...,1,13th November 2016,United Kingdom,verifi review flight start first class concord...
3997,✅ Verified Review | The flight started badly ...,1,12th November 2016,United Kingdom,verifi review flight start badli call new conc...
3998,Gatwick to Alicante. Crew friendly but a bit a...,9,11th November 2016,United Kingdom,gatwick alicant crew friendli bit amateur busi...


In [19]:
df.to_csv('Cleaned BA Reviews.csv')

In [20]:
df.drop(columns = 'Reviews', inplace = True, axis = 1)

In [21]:
df

Unnamed: 0,Ratings,Date,Country,New_Reviews
0,5,12th November 2023,Romania,trip verifi book flight bucharest manchest h l...
1,1,8th November 2023,United Kingdom,trip verifi book onlin month ago hitch replac ...
2,8,7th November 2023,United Kingdom,trip verifi flight time crew polit stori outwa...
3,7,5th November 2023,Italy,not verifi angri disappoint unsatisfi rout lon...
4,2,5th November 2023,United Kingdom,trip verifi infrequ flyer british airway alway...
...,...,...,...,...
3995,3,14th November 2016,Portugal,verifi review lisbon london british airway nic...
3996,1,13th November 2016,United Kingdom,verifi review flight start first class concord...
3997,1,12th November 2016,United Kingdom,verifi review flight start badli call new conc...
3998,9,11th November 2016,United Kingdom,gatwick alicant crew friendli bit amateur busi...


So, here is the new reviews as you can see all the punctuations, tick marks, etc has been removed now it's good to go

In [22]:
df.to_csv('Perfect BA Reviews.csv')