# AIRBNB LONDON TEXT ANALYSIS FOR REVIEW DATA


## Data Wrangling

### Importing the necessary packages


In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
import nltk
#nltk.download('punkt')

### Read dataset¶


In [3]:
df = pd.read_csv('./listings.csv')
print(df.shape)
df.head(3)

(75241, 18)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,714569379355913481,Lovely private bedroom in Muswell Hill.,39009854,Forough,,Haringey,51.59728,-0.13933,Private room,100,1,0,,,1,365,0,
1,822557738577472503,PropertyPlug - 2Bed Flat in Edgware SmartTV WiFi,325629338,Paul,,Harrow,51.60818,-0.2774,Entire home/apt,132,2,0,,,4,35,0,
2,4876550,Stunning Apartment 2 minutes walk to Tube Station,25087384,Joseph,,Barnet,51.602282,-0.193606,Entire home/apt,120,5,0,,,1,337,0,


### Viewing Dataset Coloumns

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75241 entries, 0 to 75240
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             75241 non-null  int64 
 1   neighbourhood  75241 non-null  object
 2   name           75210 non-null  object
 3   room_type      75241 non-null  object
 4   price          75241 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.9+ MB


In [4]:
#viewing every columns properly
count = 1
for i in df.columns:
  print(count, i)
  count += 1

1 id
2 name
3 host_id
4 host_name
5 neighbourhood_group
6 neighbourhood
7 latitude
8 longitude
9 room_type
10 price
11 minimum_nights
12 number_of_reviews
13 last_review
14 reviews_per_month
15 calculated_host_listings_count
16 availability_365
17 number_of_reviews_ltm
18 license


### Convert data types and remove irrelevant columns¶


In [29]:
#Counting NAN in each columns
print(df.shape)
df.loc[:,df.isnull().sum()>0].isnull().sum().sort_values(ascending=False)
df.columns


(75241, 5)


Index(['id', 'neighbourhood', 'name', 'room_type', 'price'], dtype='object')

In [5]:
# Choosing the only the essential columns for analysis. Also, Misleading datas of review was disgarded
df = df[['id','neighbourhood','name','room_type','price']]
#df = df[['id','description','zipcode']]
df

Unnamed: 0,id,neighbourhood,name,room_type,price
0,714569379355913481,Haringey,Lovely private bedroom in Muswell Hill.,Private room,100
1,822557738577472503,Harrow,PropertyPlug - 2Bed Flat in Edgware SmartTV WiFi,Entire home/apt,132
2,4876550,Barnet,Stunning Apartment 2 minutes walk to Tube Station,Entire home/apt,120
3,786791705194673775,Croydon,Waddon Coach House 2,Entire home/apt,100
4,808038970516277767,Barnet,Studio Flat Franklin London,Entire home/apt,65
...,...,...,...,...,...
75236,13609107,Tower Hamlets,Spacious Rooftop Room and Terrace.,Private room,25
75237,16840434,Tower Hamlets,Stylish flat in trendy East London. Bethnal Gr...,Entire home/apt,500
75238,9773320,Lambeth,Spacious room for a Lady in London,Private room,22
75239,15460327,Hackney,Entire house w Private Garden in cool East London,Entire home/apt,85


In [6]:
#Dataset checked for NaN values
print('Number of rows in each column affected by existence of non-existing values:')
df.isnull().sum()

Number of rows in each column affected by existence of non-existing values:


id                0
neighbourhood     0
name             31
room_type         0
price             0
dtype: int64

In [7]:
#loading detailed review dataset
review = pd.read_csv('./reviews.csv')
review.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,13913,80770,2010-08-18,177109,Michael,My girlfriend and I hadn't known Alina before ...
1,13913,367568,2011-07-11,19835707,Mathias,Alina was a really good host. The flat is clea...
2,13913,529579,2011-09-13,1110304,Kristin,Alina is an amazing host. She made me feel rig...
3,13913,595481,2011-10-03,1216358,Camilla,"Alina's place is so nice, the room is big and ..."
4,13913,612947,2011-10-09,490840,Jorik,"Nice location in Islington area, good for shor..."


In [8]:
## Choosing the only the essential columns for analysis 
review = review[['listing_id','comments']]
review

Unnamed: 0,listing_id,comments
0,13913,My girlfriend and I hadn't known Alina before ...
1,13913,Alina was a really good host. The flat is clea...
2,13913,Alina is an amazing host. She made me feel rig...
3,13913,"Alina's place is so nice, the room is big and ..."
4,13913,"Nice location in Islington area, good for shor..."
...,...,...
1352427,840878919107024852,"Great place to stay, enjoyed every min of the ..."
1352428,840878919107024852,"Was a great place to stay, felt home away from..."
1352429,841014552889907808,Natalie was extremely helpful responding to qu...
1352430,841114310668130061,Great place to stay; great balance between mon...


In [9]:
#Null comments in review are discarded 

reviewNAcomments=review[(review.comments.isnull())]
print(reviewNAcomments.shape)
review=review[~(review.comments.isnull())]

(106, 2)


In [10]:
 #group the dataframe by listing id and then bring all the comments to a particular listing_id to group according to the listing_id
review_group = review.groupby('listing_id')
print(review_group)
review = review_group.apply(lambda x: list(x['comments']))


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fda7c4c5810>


In [11]:
#Convert from series to dataframe
review = review.to_frame('comments')
review


Unnamed: 0_level_0,comments
listing_id,Unnamed: 1_level_1
13913,[My girlfriend and I hadn't known Alina before...
15400,[We loved staying at Phillipa's place in Chels...
17402,"[Amazing location - great apartment, plenty of..."
24328,[The communication with Joe was very easy. We ...
25123,"[10,000 times better and cheaper than staying ..."
...,...
843038973630044774,"[nice room, central area with subway near]"
843203613361079569,[Fantastic host.<br/>Great accommodation cant ...
843405768455690290,"[Raffaella’s flat is quite spacious, we were v..."
843524996925244561,[Superb stay & very convenient for access to w...


In [12]:
# merging full review and adding only specific columns from listing(df)
cleaned_df = pd.merge(left=review, right=df, how='left', left_on=review.index, right_on='id')
cleaned_df

Unnamed: 0,comments,id,neighbourhood,name,room_type,price
0,[My girlfriend and I hadn't known Alina before...,13913,Islington,Holiday London DB Room Let-on going,Private room,79
1,[We loved staying at Phillipa's place in Chels...,15400,Kensington and Chelsea,Bright Chelsea Apartment. Chelsea!,Entire home/apt,80
2,"[Amazing location - great apartment, plenty of...",17402,Westminster,Superb 3-Bed/2 Bath & Wifi: Trendy W1,Entire home/apt,418
3,[The communication with Joe was very easy. We ...,24328,Wandsworth,"Battersea live/work artist house, garden communal",Entire home/apt,250
4,"[10,000 times better and cheaper than staying ...",25123,Barnet,Clean big Room in London (Room 1),Private room,29
...,...,...,...,...,...,...
56543,"[nice room, central area with subway near]",843038973630044774,Tower Hamlets,Holybush D,Private room,39
56544,[Fantastic host.<br/>Great accommodation cant ...,843203613361079569,Hackney,Canal side cute room,Private room,37
56545,"[Raffaella’s flat is quite spacious, we were v...",843405768455690290,Kensington and Chelsea,Luxury Flat in Kensington / Knightsbridge,Entire home/apt,199
56546,[Superb stay & very convenient for access to w...,843524996925244561,Islington,¹ Central Private Room/Caledonian,Private room,39


In [13]:
#Cleaded dataset combining the listing and review saved to a seperate file named processed_airbnb.csv
cleaned_df.to_csv('processed_airbnb.csv')


#### Compare all the words remaining with the positive and negative words that are available based on dictionary file


In [15]:
import nltk
#stopwords = set(STOPWORDS) # STOPWORDS is a list with english common words that you should not count in the
                           # wordcloud, like prepositions and conjuctions
# nltk.download('stopwords')
# nltk.download('punkt')
# from nltk.corpus import stopwords
# stopWords = set(stopwords.words('english'))

#stopwords = set(STOPWORDS) # STOPWORDS is a list with english common words that you should not count in the
                           # wordcloud, like prepositions and conjuctions

In [1]:
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# Load positive and negative word lists from text files
with open("./positive.txt", "r") as f:
    positive_list = [line.strip() for line in f]

with open("./negative.txt", "r") as f:
    negative_list = [line.strip() for line in f]

# Define a function to check whether a comment is positive, negative or neutral
def positive_negative_checker(cleanW):
    positive_count = sum(1 for word in cleanW if word in positive_list)
    negative_count = sum(1 for word in cleanW if word in negative_list)
    
    if negative_count > positive_count:
        return -1
    elif positive_count > negative_count:
        return 1
    else:
        return 0

# Load stopwords from NLTK library
stopwords = set(stopwords.words('english'))

# Define a function to analyze comments and count positive, negative and neutral comments
def comment_analyser(sentence_list):
    positive_comment_count = 0
    neutral_comment_count = 0
    negative_comment_count = 0
    
    for sentence in sentence_list:
        words = word_tokenize(sentence.lower())
        words = [word for word in words if word.isalnum() and word not in stopwords]
        
        comment_value = positive_negative_checker(words)
        
        if comment_value == 1:
            positive_comment_count += 1
        elif comment_value == -1:
            negative_comment_count += 1
        else:
            neutral_comment_count += 1
    
    analyzed_comment = [positive_comment_count, neutral_comment_count, negative_comment_count]
    
    return analyzed_comment

# Read the cleaned Airbnb reviews dataframe
cleaned_df = pd.read_csv("cleaned_airbnb_reviews.csv")

# Add columns for positive, neutral and negative comments
cleaned_df["positive_comment"] = 0
cleaned_df["neutral_comment"] = 0
cleaned_df["negative_comment"] = 0

# Analyze comments and store results in the dataframe
for i, row in cleaned_df.iterrows():
    comments = row["comments"].split("|")
    analyzed_comment = comment_analyser(comments)
    
    cleaned_df.at[i, "positive_comment"] = analyzed_comment[0]
    cleaned_df.at[i, "neutral_comment"] = analyzed_comment[1]
    cleaned_df.at[i, "negative_comment"] = analyzed_comment[2]
    
    print(f"Processed row {i+1}/{len(cleaned_df)}")

# Save the updated dataframe to a new CSV file
cleaned_df.to_csv("airbnb_reviews_with_sentiment.csv", index=False)


LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/shamsiyashams/nltk_data'
    - '/Users/shamsiyashams/anaconda3/nltk_data'
    - '/Users/shamsiyashams/anaconda3/share/nltk_data'
    - '/Users/shamsiyashams/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
