In [None]:
!pip install geoplot geopandas geodatasets researchpy feature-engine streamlit

In [None]:
# Download the dataset archive from GitHub (`-L` follows redirects) and unpack it into the
# working directory by piping the zip stream into `jar xv` (extract, verbose, reading stdin).
# NOTE(review): needs the JDK `jar` tool on PATH; presumably the archive holds Airbnb_Open_Data.csv — confirm.
!curl -L https://github.com/s91233/ds/blob/main/archive.zip?raw=true | jar xv

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt
from scipy.stats import probplot
from textblob import TextBlob
from tqdm.auto import tqdm
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Load the raw listings export; `df` is the working frame used by the following cells.
df = pd.read_csv('Airbnb_Open_Data.csv')

# Column dtypes and non-null counts (printed by .info() itself).
df.info()

# Summary statistics: `describe` has to be *called* -- the bare attribute is a no-op expression --
# and must be displayed explicitly because it is not the last expression of the cell.
display(df.describe())

# Preview the first rows instead of echoing the whole frame as the cell output.
df.head()

In [None]:
# Distribution check for every numeric column: normal QQ-plot on the left,
# histogram with a KDE overlay on the right.
for col in df.select_dtypes(include=np.number).columns:
    # The raw frame still contains missing values; NaNs passed to probplot propagate
    # into the fitted reference line, so plot the observed values only.
    values = df[col].dropna()
    if values.empty:
        continue

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    # QQ-plot
    probplot(values, plot=ax[0], fit=True)
    ax[0].set_title(f'QQ-Plot for {col}')

    # Histogram & PDF
    sns.histplot(values, kde=True, ax=ax[1])
    ax[1].set_title(f'Pdf & Histogram for {col}')

    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Clean & Convert
#
# 1. Drop identifier / constant columns and normalise column names to snake_case.
# 2. Parse `last_review` as a date and the two money columns as floats.
# 3. Impute missing values: mean for price / fee / coordinates, rounded mean for the other
#    numeric columns, most frequent value (then categorical dtype) for everything else.

df = df.drop(columns=['id','NAME','host id','host name','country','country code','license'])
df = df.rename(columns=lambda x: x.strip().lower().replace(' ', '_'))

df['last_review'] = pd.to_datetime(df['last_review'], format='%m/%d/%Y')

# Raw string: '\$' is not a valid escape sequence in a normal string literal.
for money_col in ('price', 'service_fee'):
    df[money_col] = df[money_col].replace(r'[\$,]', '', regex=True).astype(float)

for column in df.columns:
    # Assign the filled Series back instead of `df[column].fillna(..., inplace=True)`:
    # the in-place call works on an intermediate object and is deprecated in pandas 2.x.
    if column in ['price', 'service_fee', 'lat', 'long']:
        df[column] = df[column].fillna(df[column].mean())
    elif pd.api.types.is_numeric_dtype(df[column]):
        # `dtype == 'numeric'` never matches a real dtype, so this branch used to be dead and
        # every remaining numeric column was mode-filled and turned into a categorical.
        df[column] = df[column].fillna(df[column].mean().round(0))
    else:
        modes = df[column].mode()
        # An all-NaN column has no mode; leave it for the final dropna instead of crashing.
        if not modes.empty:
            df[column] = df[column].fillna(modes[0])
        df[column] = pd.Categorical(df[column])

# Safety net for anything the imputation above could not fill.
df = df.dropna()

df.head()

In [None]:
# Read the NYC borough polygons and reproject them to EPSG:4326 (GPS lat/lon).
# NOTE(review): `geopandas.datasets` was deprecated in geopandas 0.14 and removed in 1.0; on newer
# versions load the same data via the (already installed) `geodatasets` package — confirm the pinned version.
nyc_boroughs = gpd.GeoDataFrame.from_file(gpd.datasets.get_path("nybb")).to_crs(epsg=4326)

# Build a point GeoDataFrame from the listing coordinates, also in EPSG:4326.
# Work on a copy so the helper columns added below (geometry, correct_borough) cannot leak into `df`.
points = gpd.points_from_xy(df["long"], df["lat"])
gdf = gpd.GeoDataFrame(df.copy(), geometry=points, crs="EPSG:4326")

# Ensure geometries are valid
gdf["geometry"] = gdf["geometry"].make_valid()
nyc_boroughs["geometry"] = nyc_boroughs["geometry"].make_valid()

# Left spatial join: each listing receives the attributes of the borough polygon containing it.
gdf_joined = gpd.sjoin(gdf, nyc_boroughs, how="left", predicate="within")
# A point matching several polygons would appear more than once; keep one row per listing so the
# index-aligned assignment below stays well defined.
gdf_joined = gdf_joined[~gdf_joined.index.duplicated(keep="first")]

# Borough derived from the coordinates; listings that fall outside every polygon keep their
# original label instead of being overwritten with NaN.
original_borough = df["neighbourhood_group"].astype(object)
gdf["correct_borough"] = gdf_joined["BoroName"].fillna(original_borough)

# Show the rows whose recorded borough disagrees with the geometry, then apply the correction.
incorrect_locations = df[original_borough != gdf["correct_borough"]]
display(incorrect_locations)
df["neighbourhood_group"] = gdf["correct_borough"]


In [None]:
# TextBlob needs plain strings, so cast the column first.
df["house_rules"] = df["house_rules"].astype(str)


def _polarity(text):
    """Return TextBlob's sentiment polarity for one house-rules text."""
    return TextBlob(text).sentiment.polarity


def _strictness_label(score):
    """Bucket a polarity score: below -0.2 is "strict", up to 0.2 is "neutral", otherwise "relaxed"."""
    if score < -0.2:
        return "strict"
    if score <= 0.2:
        return "neutral"
    return "relaxed"


# Keep the numeric score in its own column ...
df["house_rules_sentiment"] = df["house_rules"].map(_polarity)

# ... and replace the free text with its three-level category.
df["house_rules"] = df["house_rules_sentiment"].map(_strictness_label)


In [None]:
# Persist the cleaned frame without the row index; the Streamlit app written below reads this file.
df.to_csv('Airbnb_Open_Data_cleaned.csv', index=False)

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load your preprocessed data
df = pd.read_csv('Airbnb_Open_Data_cleaned.csv')

# Prepare your features and target variable
X = df[['price']]
y = df['service_fee']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Streamlit app
st.title("Service Fee Recommender")

# Input for price
price = st.number_input("Enter the price of the listing:", min_value=0.0)

# Make prediction when button is clicked
if st.button("Recommend Service Fee"):
    predicted_fee = model.predict([[price]])[0]
    st.success(f"Recommended service fee: ${predicted_fee:.2f}")

In [None]:
# Install the localtunnel CLI globally via npm; the launch cell below uses it to expose the Streamlit port.
!npm install localtunnel -g

In [None]:
# Print this machine's public IP address.
# NOTE(review): presumably needed as the password localtunnel asks for when the tunnel URL is opened — confirm.
!curl icanhazip.com

In [None]:
# Start the Streamlit app in the background (`&`) and open a localtunnel to port 8501.
# NOTE(review): 8501 is presumably Streamlit's default port; localtunnel stays in the foreground, so this
# cell likely keeps running until interrupted and blocks "Run All" — confirm, or move it to the end.
!streamlit run app.py --server.address=localhost & npx localtunnel --port 8501

In [None]:
# Correlations
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)

heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=heat_ax)
heat_ax.set_title('Correlation Heatmap')
plt.show()
# Optional pairwise scatter matrix, left disabled:
#sns.pairplot(df.select_dtypes(include=np.number)); plt.show()

In [None]:
import geopandas as gpd
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot, plot, download_plotlyjs, init_notebook_mode
import plotly.graph_objs as go

# Borough polygons in EPSG:4326, used as a backdrop layer for every map below.
# NOTE(review): `geopandas.datasets` was deprecated in geopandas 0.14 and removed in 1.0 — confirm the
# installed version, or load the data through the `geodatasets` package instead.
nyc_boroughs = gpd.GeoDataFrame.from_file(gpd.datasets.get_path('nybb')).to_crs(epsg=4326)

# Map settings shared by all figures.
NYC_CENTER = {"lat": 40.7128, "lon": -74.0060}
MAP_STYLE = "carto-positron"


def show_with_boroughs(fig, boroughs):
    """Add the borough polygons as a choropleth layer to a copy of `fig` and display it.

    Args:
        fig: a plotly figure with the listing traces.
        boroughs: GeoDataFrame with the polygons; its index is used both as the
            location key and as the colour value of each polygon.
    """
    go.Figure(fig).add_choroplethmapbox(
        geojson=boroughs.geometry.__geo_interface__,
        locations=boroughs.index,
        z=boroughs.index,
        colorscale="Viridis",
        marker_opacity=0.5,
        marker_line_width=1,
        below="traces"
    ).show()


# Listings coloured by room type.
fig = px.scatter_mapbox(
    df,
    lat="lat",
    lon="long",
    color="room_type",
    center=NYC_CENTER,
    zoom=10,
    mapbox_style=MAP_STYLE,
)
show_with_boroughs(fig, nyc_boroughs)

# Listings coloured by (corrected) borough.
fig = px.scatter_mapbox(
    df,
    lat="lat",
    lon="long",
    color="neighbourhood_group",
    center=NYC_CENTER,
    zoom=10,
    mapbox_style=MAP_STYLE,
)
show_with_boroughs(fig, nyc_boroughs)

# Listings coloured by house-rules category.
fig = px.scatter_mapbox(
    df,
    lat="lat",
    lon="long",
    color="house_rules",
    center=NYC_CENTER,
    zoom=9,
    mapbox_style=MAP_STYLE,
)
show_with_boroughs(fig, nyc_boroughs)

# Density map weighted by review rating.
fig = px.density_mapbox(
    df,
    lat="lat",
    lon="long",
    z="review_rate_number",
    radius=10,
    center=NYC_CENTER,
    zoom=9,
    mapbox_style=MAP_STYLE,
)
show_with_boroughs(fig, nyc_boroughs)

In [None]:
from wordcloud import WordCloud
# Word cloud of the neighbourhood names.
# Drop missing values and cast to str so `join` cannot fail on a non-string entry.
neighbourhood_text = " ".join(df.neighbourhood.dropna().astype(str))
wordcloud = WordCloud(
    background_color='white',
    width=1920,
    height=1080,
).generate(neighbourhood_text)

cloud_fig, cloud_ax = plt.subplots(figsize=(25, 15))
cloud_ax.imshow(wordcloud)
# Hide the axes *before* saving; previously the PNG was written with ticks and frame still visible.
cloud_ax.axis('off')
cloud_fig.savefig('neighbourhood.png')
plt.show()

In [None]:
# NOTE(review): this cell always raises ZeroDivisionError (0/0 is evaluated before print runs).
# Presumably a deliberate "stop here" guard so that Run All halts before the training cell below —
# confirm, then replace it with an explicit, self-describing raise or delete the cell.
print(0/0)

In [None]:
# Categorical columns whose values are concatenated into a single stratification label for the CV split.
stratification_columns = ['neighbourhood_group', 'room_type', 'house_rules']

TARGET = "service_fee"  # column the regressors predict
EPOCHS = 100  # training epochs for the Keras network
SPLITS = 10  # number of StratifiedKFold folds

import tensorflow as tf
from tqdm.auto import tqdm
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

# Combine the specified columns into a single stratification label.
# Column-wise string concatenation gives the same labels as a row-wise apply without
# calling a Python function once per row.
strata = df[stratification_columns[0]].astype(str)
for strat_col in stratification_columns[1:]:
    strata = strata + df[strat_col].astype(str)
df['stratification'] = strata

# Features: everything except the target, the helper label and the review date.
X = df.drop([TARGET, 'stratification', 'last_review'], axis=1)

# Convert categorical columns to numerical labels; the fitted encoders are kept per column
# so the mapping can be inverted later.
label_encoders = {}
for col in X.select_dtypes(include=['object','category']).columns:
  le = LabelEncoder()
  X[col] = le.fit_transform(X[col])
  label_encoders[col] = le

y = df[TARGET]  # Target variable

models = [
    LinearRegression(),
    MLPRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42)
]

# Stratified K-fold cross-validation of every model, reporting RMSE per fold and per model.
skf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=42)
for model in models:
  print(f"Training {type(model).__name__}")
  fold_rmses = []
  for i, (train_index, test_index) in enumerate(skf.split(X, df['stratification']), start=1):
    print(f"Training fold {i} out of {skf.get_n_splits()}")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    # The `squared=False` argument of mean_squared_error was deprecated and later removed from
    # scikit-learn; taking the square root explicitly works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    fold_rmses.append(rmse)
    print(f"RMSE: {rmse}")
  print(f"Mean RMSE over {len(fold_rmses)} folds: {np.mean(fold_rmses)} (std {np.std(fold_rmses)})")

# NN
# Dense Keras network trained on a single 80/20 hold-out split.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize the numerical features; the statistics are learned from the training split only.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(X_train))

model = keras.Sequential([
    normalizer,
    layers.Dense(1024, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dense(1)
])

model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam())

print(f"Training {type(model).__name__}")
history = model.fit(
    X_train, y_train,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=1)
models.append(model)

# `evaluate` returns the compiled loss, i.e. the MSE; take the square root so the printed
# figure really is an RMSE and is comparable with the scikit-learn models.
test_mse = model.evaluate(X_test, y_test, verbose=0)
print(f"RMSE: {np.sqrt(test_mse)}")

# Learning curves (MSE loss per epoch).
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Remove the helper column again; `errors='ignore'` keeps the statement safe when it is re-run.
df = df.drop(columns='stratification', errors='ignore')

# Compare all models on the hold-out split: predictions vs. actual values (left)
# and residuals vs. predictions (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
colors = plt.cm.rainbow(np.linspace(0, 1, len(models)))

for i, model in enumerate(models):
    # The scikit-learn models were last fitted on a cross-validation fold drawn from the full
    # data set, which overlaps X_test. Refit them on the hold-out training split so the plots
    # show genuine out-of-sample predictions. The Keras model was trained on X_train only.
    if not isinstance(model, keras.Model):
        model.fit(X_train, y_train)

    # The neural network returns shape (n, 1); ravel gives 1-D predictions for every model.
    y_pred = np.ravel(model.predict(X_test))

    # Predictions
    axes[0].scatter(y_test, y_pred, alpha=0.5, label=type(model).__name__, color=colors[i])
    # Residuals
    axes[1].scatter(y_pred, y_test - y_pred, alpha=0.5, label=type(model).__name__, color=colors[i])

axes[0].legend()
axes[0].set_xlabel("Actual")
axes[0].set_ylabel("Predicted")
axes[0].set_title("Actual vs Predicted")

axes[1].legend()
axes[1].set_xlabel("Predicted")
axes[1].set_ylabel("Residuals")
axes[1].set_title("Residuals vs Predicted")

plt.tight_layout()
plt.show()