<a href="https://colab.research.google.com/github/stutibimali/Webscrapping/blob/main/DataCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pandas as pd
import os


In [2]:
#create a dataframe from csv file

cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | The incoming and outgoing f...,5,10th March 2023,United Kingdom
1,✅ Trip Verified | Back in December my family ...,1,10th March 2023,Australia
2,✅ Trip Verified | As usual the flight is dela...,1,10th March 2023,United Kingdom
3,✅ Trip Verified | A short BA euro trip and thi...,1,9th March 2023,United Kingdom
4,Not Verified | We are flying Business class f...,8,8th March 2023,United States


In [3]:
df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']

0       True
1       True
2       True
3       True
4      False
       ...  
995     True
996     True
997     True
998     True
999     True
Name: verified, Length: 1000, dtype: bool

## Cleaning Reviews

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [7]:
#for lemmatization of words we will use nltk library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [8]:
# add the corpus to the original dataframe

df['corpus'] = corpus

In [9]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | The incoming and outgoing f...,5,10th March 2023,United Kingdom,True,incoming outgoing flight delayed french air tr...
1,✅ Trip Verified | Back in December my family ...,1,10th March 2023,Australia,True,back december family getting onto plane refuse...
2,✅ Trip Verified | As usual the flight is dela...,1,10th March 2023,United Kingdom,True,usual flight delayed week already hour held bu...
3,✅ Trip Verified | A short BA euro trip and thi...,1,9th March 2023,United Kingdom,True,short ba euro trip ba excel clean aircraft goo...
4,Not Verified | We are flying Business class f...,8,8th March 2023,United States,False,verified flying business class flight premium ...


## Format Date

In [10]:
df.dtypes

reviews     object
stars        int64
date        object
country     object
verified      bool
corpus      object
dtype: object

In [11]:
# convert the date to datetime format

df.date = pd.to_datetime(df.date)

In [12]:
df.date.head()

0   2023-03-10
1   2023-03-10
2   2023-03-10
3   2023-03-09
4   2023-03-08
Name: date, dtype: datetime64[ns]

## Cleaning star

In [13]:
#check for unique values
df.stars.unique()

array([ 5,  1,  8,  2, 10,  4,  6,  7,  9,  3])

In [14]:
df.stars.value_counts()

1     305
2     111
10    104
3      96
8      80
9      80
5      61
4      60
7      60
6      43
Name: stars, dtype: int64

## Check for null

In [15]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     1000
dtype: int64

In [16]:
df.country.isnull().value_counts()

False    1000
Name: country, dtype: int64

In [17]:
#drop the rows using index where the country value is null
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [18]:
df.shape

(1000, 6)

In [19]:
#resetting the index
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | The incoming and outgoing f...,5,2023-03-10,United Kingdom,True,incoming outgoing flight delayed french air tr...
1,✅ Trip Verified | Back in December my family ...,1,2023-03-10,Australia,True,back december family getting onto plane refuse...
2,✅ Trip Verified | As usual the flight is dela...,1,2023-03-10,United Kingdom,True,usual flight delayed week already hour held bu...
3,✅ Trip Verified | A short BA euro trip and thi...,1,2023-03-09,United Kingdom,True,short ba euro trip ba excel clean aircraft goo...
4,Not Verified | We are flying Business class f...,8,2023-03-08,United States,False,verified flying business class flight premium ...
...,...,...,...,...,...,...
995,✅ Trip Verified | This was an early morning f...,10,2018-05-18,Singapore,True,early morning flight heathrow keflavik flight ...
996,✅ Trip Verified | Cape Town to London. Dated ...,9,2018-05-17,United Kingdom,True,cape town london dated aircraft poor cabin ser...
997,✅ Trip Verified | Washington to Prague via Lo...,1,2018-05-14,United States,True,washington prague via london flown several tim...
998,✅ Trip Verified | London to Washington. Apart...,1,2018-05-14,France,True,london washington apart space nothing nothing ...


In [20]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")