## **Problem Statement :** [link](https://docs.google.com/document/d/1SYlU0Wq4Ay-z_CTU3qviTwZd_eDp0vIB/edit?usp=sharing&ouid=117648558434810261022&rtpof=true&sd=true)

# **LinkedIn :**  [link](https://www.linkedin.com/in/praveen-n-2b4004223/)

# **Required Library**

In [102]:
pip install pymongo

In [None]:
import os

import pandas as pd
import pymongo as pm

# **MongoDB Connectivity**

In [None]:
# The connection string contains a username and password, so it is read from the
# environment rather than being stored in the notebook.
# Before starting Jupyter, export it, e.g.:
#   MONGODB_URI='mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority'
mongo_uri = os.environ.get('MONGODB_URI')
if not mongo_uri:
  raise RuntimeError(
      "Set the MONGODB_URI environment variable to the MongoDB connection string "
      "before running this cell."
  )

Mongo = pm.MongoClient(mongo_uri)

# **Database & Collection Created**

In [None]:
# Names of the database and collection that hold the listings.
database_name = 'sample_airbnb'
collection_name = 'listingsAndReviews'

# Handles used by every query further down.
db = Mongo[database_name]
collection = db[collection_name]

# **Total Documents**

In [103]:
# Fetch every document of the collection into a list, then show how many were read.
Documents = list(collection.find())
len(Documents)

5555

# **Feature Understanding**



```
# Id: Unique identifier for each listing.

# Name: The title or name of the listing.

# Host Id: Unique identifier for the host of the listing.

# Host Name: The name of the host.

# Neighbourhood: The specific neighborhood where the listing is located.

# Latitude: The latitude coordinate of the listing's location.

# Longitude: The longitude coordinate of the listing's location.

# Room Type: The type of room being listed (e.g., "Entire home/apartment", "Private room", "Shared room").

# Price: The price to rent the listing.

# Minimum Nights: The minimum number of nights a guest must book for the listing.

# Maximum Nights: The maximum number of nights a guest can book the listing for.

# Number Of Reviews: The total number of reviews the listing has received.

# Availability 30: The number of days the listing is available for booking within the next 30 days

# Availability 60: The number of days the listing is available for booking within the next 60 days

# Availability 90: The number of days the listing is available for booking within the next 90 days

# Availability 365: The number of days the listing is available for booking within the next 365 days

# Rating: The review score rating given to the listing by guests (recorded as 0 when a listing has no rating).
```



# **Collecting Relevant Data**


In [110]:
# Fields requested from each listing; dotted paths pick a single sub-field of a
# nested document.
projected_fields = [
    '_id',
    'name',
    'description',
    'host.host_id',
    'host.host_name',
    'host.host_neighbourhood',
    'address.location.coordinates',
    'price',
    'availability.availability_30',
    'availability.availability_60',
    'availability.availability_90',
    'availability.availability_365',
    'room_type',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
    'host.host_total_listings_count',
    'review_scores.review_scores_rating',
    'review',
    'amenities',
    'property_type',
]

# Empty filter = all documents; the projection marks each listed field with 1.
res = list(collection.find({}, {field: 1 for field in projected_fields}))

In [111]:
# Column names of the flattened table, in output order.
# 'Longitide' (sic) is kept as spelled because the preprocessing loop appends
# to that exact key.
column_names = [
    'Id',
    'Name',
    'Description',
    'Property Type',
    'Room Type',
    'Minimum Nights',
    'Maximum Nights',
    'Number Of Reviews',
    'Amenities',
    'Price',
    'Host ID',
    'Host Name',
    'Host Neighbourhood',
    'Host Total Listings Count',
    'Longitide',
    'Latitude',
    'Availability 30',
    'Availability 60',
    'Availability 90',
    'Availability 365',
    'Rating',
]

# One independent empty list per column; values are appended to these later.
data = {column: [] for column in column_names}

In [108]:
# Display the column -> list mapping to check its keys before it is filled.
data

{'Id': [],
 'Name': [],
 'Description': [],
 'Property Type': [],
 'Room Type': [],
 'Minimum Nights': [],
 'Maximum Nights': [],
 'Number Of Reviews': [],
 'Amenities': [],
 'Price': [],
 'Host ID': [],
 'Host Name': [],
 'Host Neighbourhood': [],
 'Host Total Listings Count': [],
 'Longitide': [],
 'Latitude': [],
 'Availability 30': [],
 'Availability 60': [],
 'Availability 90': [],
 'Availability 365': [],
 'Rating': []}

# **Preprocessing**

In [112]:
# Flatten every fetched listing into the column lists held in `data`.
#
# The lists are reset first (same keys, fresh empty lists) so that re-running
# this cell rebuilds the table instead of appending every listing a second time.
data = {column: [] for column in data}

for listing in res:
  host = listing['host']
  availability = listing['availability']

  # Coordinates are read as [longitude, latitude] (index 0, then index 1).
  longitude, latitude = listing['address']['location']['coordinates'][:2]

  # A listing without a rating is recorded as 0.
  rating = listing.get('review_scores', {}).get('review_scores_rating', 0)

  # Build the complete row before touching `data`, so a missing field cannot
  # leave the column lists with different lengths.
  row = {
      'Id': listing['_id'],
      'Name': listing['name'],
      'Description': listing['description'],
      'Property Type': listing['property_type'],
      'Room Type': listing['room_type'],
      'Minimum Nights': listing['minimum_nights'],
      'Maximum Nights': listing['maximum_nights'],
      'Number Of Reviews': listing['number_of_reviews'],
      'Amenities': listing['amenities'],
      'Price': listing['price'],
      # Host details
      'Host ID': host['host_id'],
      'Host Name': host['host_name'],
      'Host Neighbourhood': host['host_neighbourhood'],
      'Host Total Listings Count': host['host_total_listings_count'],
      # 'Longitide' (sic) matches the key used when `data` was created.
      'Longitide': longitude,
      'Latitude': latitude,
      # Availability
      'Availability 30': availability['availability_30'],
      'Availability 60': availability['availability_60'],
      'Availability 90': availability['availability_90'],
      'Availability 365': availability['availability_365'],
      'Rating': rating,
  }

  for column, value in row.items():
    data[column].append(value)

# **Converting Data Into Structured Format**

In [None]:
# Show every column when a frame is displayed instead of truncating wide tables.
pd.options.display.max_columns = None

# Turn the column -> list mapping into a table: one row per listing.
df = pd.DataFrame(data=data)

# **Replacing Empty String Values**

In [166]:
# List the column labels of the assembled DataFrame.
df.columns

Index(['Id', 'Name', 'Description', 'Property Type', 'Room Type',
       'Minimum Nights', 'Maximum Nights', 'Number Of Reviews', 'Amenities',
       'Price', 'Host ID', 'Host Name', 'Host Neighbourhood',
       'Host Total Listings Count', 'Longitide', 'Latitude', 'Availability 30',
       'Availability 60', 'Availability 90', 'Availability 365', 'Rating'],
      dtype='object')

In [None]:
# Replace empty descriptions with a placeholder; every other value is kept as-is.
description_is_blank = df['Description'] == ''
df['Description'] = df['Description'].mask(description_is_blank, 'not mentioned')

In [138]:
# Replace empty listing names with a placeholder; every other value is kept as-is.
name_is_blank = df['Name'] == ''
df['Name'] = df['Name'].mask(name_is_blank, 'not mentioned')

In [159]:
# Fill blank host neighbourhoods with the nearest preceding non-blank value.
# The `method` keyword of Series.replace is deprecated in recent pandas releases,
# so the same forward fill is written with mask() + ffill().
# NOTE(review): this borrows the neighbourhood of whichever listing precedes
# the blank row - confirm that this is the intended imputation.
host_neighbourhood = df['Host Neighbourhood']
is_blank = host_neighbourhood == ''

# Hide the blanks, carry the last visible value forward, and keep '' for
# leading blanks that have nothing above them to copy from.
forward_filled = host_neighbourhood.mask(is_blank).ffill().fillna('')

# Only the blank cells are overwritten; all other values are left unchanged.
df['Host Neighbourhood'] = host_neighbourhood.mask(is_blank, forward_filled)

In [165]:
# Count the missing (null) values in each column.
df.isna().sum()

Id                           0
Name                         0
Description                  0
Property Type                0
Room Type                    0
Minimum Nights               0
Maximum Nights               0
Number Of Reviews            0
Amenities                    0
Price                        0
Host ID                      0
Host Name                    0
Host Neighbourhood           0
Host Total Listings Count    0
Longitide                    0
Latitude                     0
Availability 30              0
Availability 60              0
Availability 90              0
Availability 365             0
Rating                       0
dtype: int64

# **Save the Airbnb Data as a CSV File**

In [167]:
# Write the cleaned table to disk; the row index is left out of the file.
output_path = 'Airbnb.csv'
df.to_csv(output_path, index=False)



```
                                                       Completed
```

