### Business Understanding

**Question 1: How does the location of an Airbnb listing impact its price?**

**Question 2: When are listings most frequently booked?**

**Question 3: Are there seasonal trends or specific events that drive demand?**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

### Gather Data

In [None]:
# Load the three CSV files (listings, calendar, reviews) from the working directory
listings_df, calendar_df, reviews_df = map(
    pd.read_csv, ("listings.csv", "calendar.csv", "reviews.csv")
)


### Understanding the Data

In [None]:
# Preview the first five listings
listings_df.head(n=5)

In [None]:
# Dataset dimensions as a (rows, columns) tuple
listings_df.shape

In [None]:
# Descriptive statistics from DataFrame.describe() with its default settings
listings_df.describe()

In [None]:
# Per-column summary printed by DataFrame.info()
listings_df.info()

### Missing Value Analysis

In [None]:
# Missing-value analysis: percentage of missing entries per column, shown as a bar chart

# Share of missing values in each column, in percent
listings_missing_df = listings_df.isna().mean() * 100

# Keep only the columns that actually contain missing values
listings_columns_with_nan = listings_missing_df.loc[listings_missing_df.gt(0)]

# Bar chart of the remaining percentages
listings_columns_with_nan.plot(kind='bar', title='Missing values per column, %')

### Clean Data


In [None]:
# Identify column labels that repeat an earlier label
duplicate_columns = listings_df.columns[listings_df.columns.duplicated()]

# Keep the first occurrence of every label and discard the repeats.
# Dropping by label (drop(columns=...)) would remove *every* column that
# carries a duplicated name, including the first one, so select by mask.
listings_df = listings_df.loc[:, ~listings_df.columns.duplicated()].copy()

In [None]:
# Remove columns in which every value is missing
listings_df = listings_df.dropna(axis='columns', how='all')

In [None]:
# Remove columns that hold exactly one distinct value
distinct_counts = listings_df.nunique()
constant_columns = distinct_counts[distinct_counts == 1].index
listings_df = listings_df.drop(columns=constant_columns)

In [None]:
# Remove every column whose name contains "url"
url_columns = [name for name in listings_df.columns if "url" in name]
listings_df = listings_df.drop(columns=url_columns)

In [None]:
def convert_price_to_float(df, column_name):
    """
    Converts a currency-formatted column in the given DataFrame to float32.

    Literal "$" and "," characters are stripped from string values before the
    cast. A column that is already numeric is only cast, so calling the
    function again on a converted column is safe. The column is replaced on
    ``df`` itself (the input DataFrame is modified) and ``df`` is returned.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the price column.
    column_name (str): The name of the price column.

    Returns:
    pandas.DataFrame: The same DataFrame with the price column converted to float32.
    """
    column = df[column_name]
    # The .str accessor raises on numeric data, so only strip symbols from
    # non-numeric columns; this keeps the cleaning cell re-runnable.
    if not pd.api.types.is_numeric_dtype(column):
        # regex=False: "$" is a regex end-of-string anchor, so match it
        # literally regardless of the pandas version's default for `regex`.
        column = (
            column.str.replace("$", "", regex=False)
                  .str.replace(",", "", regex=False)
        )
    df[column_name] = column.astype("float32")
    return df

# Convert every currency-formatted column of the listings data to float
for price_column in (
    'price', 'weekly_price', 'monthly_price',
    'security_deposit', 'cleaning_fee', 'extra_people',
):
    listings_df = convert_price_to_float(listings_df, price_column)

In [None]:
# Dropping specified columns with high missing values
columns_to_drop = [
    'square_feet', 'summary', 'space', 'neighborhood_overview', 'notes', 'transit', 'license'
]

# Dropping host-related information (selecting by the "host_" name prefix)
host_related_columns = listings_df.columns[listings_df.columns.str.contains('^host_')]
columns_to_drop.extend(host_related_columns)

# errors='ignore' skips names that are no longer present: the earlier cleaning
# cells (all-NA, single-value and URL drops) may already have removed some of
# them, and without it a missing name or a re-run of this cell raises KeyError.
listings_df.drop(columns=columns_to_drop, errors='ignore', inplace=True)

In [None]:
# Split the remaining columns by dtype: object columns are treated as
# categorical, every other dtype as numerical
categorical_columns = list(listings_df.select_dtypes(include='object').columns)
numerical_columns = list(listings_df.select_dtypes(exclude='object').columns)

In [None]:
# Dealing With Null Values
# Numeric columns: fill gaps with the column median
numeric_imputer = SimpleImputer(strategy='median')
imputed_numeric = numeric_imputer.fit_transform(listings_df[numerical_columns])
listings_df[numerical_columns] = imputed_numeric

# Categorical columns: fill gaps with the most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
imputed_categorical = categorical_imputer.fit_transform(listings_df[categorical_columns])
listings_df[categorical_columns] = imputed_categorical

### Analyze Data

**Basic Statistics**

In [None]:

# Descriptive statistics of the listing price
listings_df.loc[:, 'price'].describe()

In [None]:
# Mean number of bedrooms across all listings
average_bedrooms = listings_df['bedrooms'].mean()
average_bedrooms

In [None]:
# Quartiles (25th, 50th and 75th percentiles) of the listing price
quantiles = listings_df['price'].quantile(q=[0.25, 0.5, 0.75])
print(quantiles)

In [None]:
# Mean number of bathrooms across all listings
average_bathrooms = listings_df['bathrooms'].mean()
average_bathrooms

In [None]:
# Mean review score rating across all listings
average_review_score = listings_df['review_scores_rating'].mean()
average_review_score

In [None]:
# Estimating occupancy rates: the share of the year that is NOT listed as
# available is taken as the occupancy estimate
average_annual_availability = listings_df['availability_365'].mean()
available_share_pct = average_annual_availability / 365 * 100
estimated_annual_occupancy_rate = 100 - available_share_pct
estimated_annual_occupancy_rate

In [None]:
# Distribution of Property Types
# set_style changes seaborn's global style, so it also applies to plots drawn after this cell
sns.set_style("darkgrid")

# Calculate the distribution of the property types
property_type_counts = listings_df['property_type'].value_counts()

# Create a horizontal bar chart for the distribution of property types
plt.figure(figsize=(10, 8))
sns.barplot(x=property_type_counts.values, y=property_type_counts.index, palette="colorblind")
plt.title("Distribution of Property Types")
plt.xlabel("Number of Listings")
plt.ylabel("Property Type")
plt.show()

In [None]:
# Distribution of Neighborhoods: listing counts for the ten most common
# values of neighbourhood_group_cleansed
neighborhood_counts = listings_df['neighbourhood_group_cleansed'].value_counts().iloc[:10]

# Horizontal bar chart of those counts
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=neighborhood_counts.values, y=neighborhood_counts.index, palette="coolwarm", ax=ax)
ax.set_title('Top 10 Neighborhoods by Number of Listings')
ax.set_xlabel('Number of Listings')
ax.set_ylabel('Neighborhood')
plt.show()

In [None]:
# Price Distribution: histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(listings_df['price'], bins=50, kde=True, color="skyblue", ax=ax)
ax.set_title('Distribution of Listing Prices')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Number of Listings')
# Cut the x-axis at the 95th percentile so the long right tail does not squash the plot
price_upper_limit = listings_df['price'].quantile(0.95)
ax.set_xlim(0, price_upper_limit)
plt.show()

In [None]:
# Room Type Preferences: number of listings per room type, most common first
room_type_order = listings_df['room_type'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=listings_df, y='room_type', order=room_type_order, ax=ax)
ax.set_title('Room Type Preferences')
ax.set_xlabel('Number of Listings')
ax.set_ylabel('Room Type')
plt.show()

In [None]:
# Number of Listings by Neighborhood: every neighbourhood group, most common first
neighborhood_order = listings_df['neighbourhood_group_cleansed'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(
    data=listings_df,
    y='neighbourhood_group_cleansed',
    order=neighborhood_order,
    palette='viridis',
    ax=ax,
)
ax.set_title('Number of Listings by Neighborhood')
ax.set_xlabel('Number of Listings')
ax.set_ylabel('Neighborhood')
plt.show()

In [None]:
# Price by Room Type: one box per room type
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='room_type', y='price', data=listings_df, ax=ax)
ax.set_title('Price Distribution by Room Type')
ax.set_xlabel('Room Type')
ax.set_ylabel('Price ($)')
# Cut the y-axis at the 95th percentile of price so extreme values do not squash the boxes
price_upper_limit = listings_df['price'].quantile(0.95)
ax.set_ylim(0, price_upper_limit)
plt.show()




In [None]:
# Availability by Neighborhood: boxen plot of availability_365 per neighbourhood group
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxenplot(
    y='neighbourhood_group_cleansed',
    x='availability_365',
    data=listings_df,
    palette='viridis',
    ax=ax,
)
ax.set_title('Availability by Neighborhood')
ax.set_xlabel('Availability (Days out of 365)')
ax.set_ylabel('Neighborhood')
plt.show()

In [None]:
# Review Scores Rating Distribution: histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(listings_df['review_scores_rating'], bins=20, kde=True, color='green', ax=ax)
ax.set_title('Distribution of Review Scores Rating')
ax.set_xlabel('Review Scores Rating')
ax.set_ylabel('Number of Listings')
# Fix the x-axis to 0-100; review scores are typically on that scale
ax.set_xlim(0, 100)
plt.show()

**Correlation Analysis**

In [None]:
# Features included in the correlation analysis
numerical_columns = [
    'price', 'bedrooms', 'bathrooms', 'accommodates',
    'minimum_nights', 'maximum_nights', 'number_of_reviews',
    'review_scores_rating', 'availability_365'
]
# Coerce the selected columns to numeric dtypes in case any is stored as
# object; values that cannot be parsed become NaN
listings_df[numerical_columns] = listings_df[numerical_columns].apply(
    pd.to_numeric, errors='coerce'
)

# Pairwise correlation between the selected features
corr_matrix = listings_df[numerical_columns].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Selected Features')
plt.show()