### Biblioteki

In [66]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import zstandard as zstd
import io

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score

from torch.utils.data import TensorDataset, DataLoader

### Finałowy dataset

In [7]:
def create_dataframe_from_csv_zst(filepath: str) -> pd.DataFrame:
    dctx = zstd.ZstdDecompressor()
    with open(filepath, 'rb') as compressed:
        with dctx.stream_reader(compressed) as reader:
            decompressed = io.TextIOWrapper(reader, encoding='utf-8')
            df = pd.read_csv(
                decompressed,
                low_memory=True
            )
    return df

In [33]:
AMENITIES_PATH = "../data/processed/amenities_stats.csv.zst"
REVIEWS_STATISTICS_PATH = "../data/processed/merged_reviews_statistics.csv.zst"
LISTINGS_PATH = "../data/raw/v2/listings.csv.zst"
SESSIONS_STATISTICS_PATH = "../data/processed/listings_statistics.csv"

In [10]:
amenities_df = create_dataframe_from_csv_zst(AMENITIES_PATH)
reviews_statistics_df = create_dataframe_from_csv_zst(REVIEWS_STATISTICS_PATH)
listings_df = create_dataframe_from_csv_zst(LISTINGS_PATH)

  df = pd.read_csv(


In [11]:
amenities_df.head(5)

Unnamed: 0,id,amenities,standardized_amenities,num_of_amenities,num_of_other_amenities,has_wifi,has_air_conditioning,num_of_top_10_common_amenities,mean_embedding
0,27262,"[""Heating - split type ductless system"", ""Dish...","['OTHER', 'OTHER', 'OTHER', 'OTHER', 'Refriger...",37,25,True,False,9,[-6.15361333e-02 -5.01407236e-02 -3.76954190e-...
1,809874,"[""Host greets you"", ""Dishes and silverware"", ""...","['OTHER', 'OTHER', 'Washer', 'TV', 'OTHER', 'R...",21,12,True,True,7,[-5.95615655e-02 -3.39257978e-02 -3.55266146e-...
2,866381,"[""Coffee"", ""Dishes and silverware"", ""Washer"", ...","['Coffee maker', 'OTHER', 'Washer', 'Condition...",49,29,True,True,12,[-5.62126637e-02 -3.99859361e-02 -3.53127755e-...
3,886724,"[""Host greets you"", ""Luggage dropoff allowed"",...","['OTHER', 'OTHER', 'Washer', 'OTHER', 'OTHER',...",20,13,True,True,6,[-5.84196560e-02 -4.62852195e-02 -3.04865874e-...
4,896212,"[""Host greets you"", ""Coffee"", ""Dishes and silv...","['OTHER', 'Coffee maker', 'OTHER', 'OTHER', 'O...",45,31,True,True,9,[-5.81393391e-02 -4.77810428e-02 -4.26256545e-...


In [12]:
reviews_statistics_df.head(5)

Unnamed: 0,listing_id,total_reviews,total_english_reviews,count_negative_english,count_positive_english
0,21853,33,17.0,1.0,16.0
1,27262,29,17.0,0.0,17.0
2,30320,172,133.0,15.0,118.0
3,30959,8,4.0,0.0,4.0
4,33945,78,60.0,2.0,58.0


In [13]:
listings_df.head(5)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,availability_eoy,number_of_reviews_ly,estimated_occupancy_l365d,estimated_revenue_l365d
0,27262,https://www.nocarz.pl/rooms/27262,20241225065837,2024-12-25,city scrape,"AQA-No7, Great mattress, high speed internet",THE MATTRESS - KING KOIL - Camden Luxury 160x2...,,https://a0.muscache.com/pictures/miso/Hosting-...,37177,...,t,1,1,0,0,0.19,,,,
1,809874,https://www.nocarz.pl/rooms/809874,20241225065837,2024-12-25,city scrape,Wonderfull Penthouse!!,,,https://a0.muscache.com/pictures/11813063/7d06...,4259738,...,f,1,1,0,0,0.68,,,,
2,866381,https://www.nocarz.pl/rooms/866381,20241225065837,2024-12-25,city scrape,Acropolis View Funky House,Welcome to a colorful 7th-floor penthouse in N...,,https://a0.muscache.com/pictures/hosting/Hosti...,4551671,...,f,1,1,0,0,0.28,,,,
3,886724,https://www.nocarz.pl/rooms/886724,20241225065837,2024-12-25,city scrape,Luxury Boutique Appartment -Athens,,The apartment is very conveniently located for...,https://a0.muscache.com/pictures/14659524/b33a...,4700824,...,f,1,1,0,0,0.14,,,,
4,896212,https://www.nocarz.pl/rooms/896212,20241225065837,2024-12-25,city scrape,Living like in a cottage in the center of Athens,An oasis of calm in the centre of a crowded ci...,Exarchia is considered the bohemian and altern...,https://a0.muscache.com/pictures/prohost-api/H...,4777984,...,t,13,13,0,0,3.58,,,,


In [20]:
listings_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [28]:
interesting_columns = [
    'id',
    'host_is_superhost',
    'host_verifications',
    'host_acceptance_rate',
    'neighbourhood_cleansed',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'price',
    'license',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'instant_bookable',
    'reviews_per_month',
    'availability_eoy',
    'number_of_reviews_ly'
]

In [29]:
final_df = listings_df[interesting_columns].copy()
final_df.head(5)

Unnamed: 0,id,host_is_superhost,host_verifications,host_acceptance_rate,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,reviews_per_month,availability_eoy,number_of_reviews_ly
0,27262,t,"['email', 'phone']",100%,ΑΜΠΕΛΟΚΗΠΟΙ,37.98924,23.765,Entire rental unit,Entire home/apt,2,...,4.89,4.9,4.86,4.97,4.75,4.71,t,0.19,,
1,809874,f,"['email', 'phone']",96%,ΚΟΥΚΑΚΙ-ΜΑΚΡΥΓΙΑΝΝΗ,37.96215,23.72179,Entire rental unit,Entire home/apt,4,...,4.97,4.97,4.89,4.92,4.86,4.91,f,0.68,,
2,866381,t,"['email', 'phone', 'work_email']",97%,ΝΕΟΣ ΚΟΣΜΟΣ,37.95523,23.72443,Entire condo,Entire home/apt,3,...,4.98,4.98,4.93,4.93,4.73,4.9,f,0.28,,
3,886724,f,['email'],0%,ΚΥΨΕΛΗ,37.99745,23.73973,Entire rental unit,Entire home/apt,2,...,4.88,4.47,4.88,4.89,4.28,4.72,f,0.14,,
4,896212,t,"['email', 'phone']",100%,ΜΟΥΣΕΙΟ-ΕΞΑΡΧΕΙΑ-ΝΕΑΠΟΛΗ,37.98844,23.73845,Entire rental unit,Entire home/apt,2,...,4.94,4.96,4.97,4.93,4.82,4.93,t,3.58,,


In [35]:
listings_statistics = pd.read_csv(SESSIONS_STATISTICS_PATH)
listings_statistics.head(5)

Unnamed: 0,listing_id,total_bookings,total_duration,avg_duration,min_duration,max_duration,num_of_short_stays,num_of_long_stays
0,21853.0,33,48,1.454545,1,4,33,0
1,27262.0,29,144,4.965517,2,7,29,0
2,30320.0,172,360,2.093023,1,4,172,0
3,30959.0,8,12,1.5,1,3,8,0
4,33945.0,78,540,6.923077,3,10,54,24


In [40]:
listings_statistics['target'] = np.where(
    listings_statistics['num_of_short_stays'] > listings_statistics['num_of_long_stays'],
    'short',
    'long'
)

In [42]:
listings_statistics.head()

Unnamed: 0,listing_id,total_bookings,total_duration,avg_duration,min_duration,max_duration,num_of_short_stays,num_of_long_stays,target
0,21853.0,33,48,1.454545,1,4,33,0,short
1,27262.0,29,144,4.965517,2,7,29,0,short
2,30320.0,172,360,2.093023,1,4,172,0,short
3,30959.0,8,12,1.5,1,3,8,0,short
4,33945.0,78,540,6.923077,3,10,54,24,short


In [43]:
listing_statistics_final_df = listings_statistics[['listing_id','total_bookings', 'target']].copy()

In [45]:
listing_statistics_final_df.head(5)

Unnamed: 0,listing_id,total_bookings,target
0,21853.0,33,short
1,27262.0,29,short
2,30320.0,172,short
3,30959.0,8,short
4,33945.0,78,short


### Złączenie datasetów

In [46]:
final_df = final_df.merge(
    listing_statistics_final_df,
    left_on='id',
    right_on='listing_id',
    how='left'
)
final_df.head(5)

Unnamed: 0,id,host_is_superhost,host_verifications,host_acceptance_rate,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,reviews_per_month,availability_eoy,number_of_reviews_ly,listing_id,total_bookings,target
0,27262,t,"['email', 'phone']",100%,ΑΜΠΕΛΟΚΗΠΟΙ,37.98924,23.765,Entire rental unit,Entire home/apt,2,...,4.97,4.75,4.71,t,0.19,,,27262.0,29.0,short
1,809874,f,"['email', 'phone']",96%,ΚΟΥΚΑΚΙ-ΜΑΚΡΥΓΙΑΝΝΗ,37.96215,23.72179,Entire rental unit,Entire home/apt,4,...,4.92,4.86,4.91,f,0.68,,,809874.0,97.0,short
2,866381,t,"['email', 'phone', 'work_email']",97%,ΝΕΟΣ ΚΟΣΜΟΣ,37.95523,23.72443,Entire condo,Entire home/apt,3,...,4.93,4.73,4.9,f,0.28,,,866381.0,40.0,long
3,886724,f,['email'],0%,ΚΥΨΕΛΗ,37.99745,23.73973,Entire rental unit,Entire home/apt,2,...,4.89,4.28,4.72,f,0.14,,,886724.0,18.0,short
4,896212,t,"['email', 'phone']",100%,ΜΟΥΣΕΙΟ-ΕΞΑΡΧΕΙΑ-ΝΕΑΠΟΛΗ,37.98844,23.73845,Entire rental unit,Entire home/apt,2,...,4.93,4.82,4.93,t,3.58,,,896212.0,515.0,long


In [47]:
final_df= final_df.merge(
    reviews_statistics_df,
    left_on='id',
    right_on='listing_id',
    how='left'
)
final_df.head(5)

Unnamed: 0,id,host_is_superhost,host_verifications,host_acceptance_rate,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,availability_eoy,number_of_reviews_ly,listing_id_x,total_bookings,target,listing_id_y,total_reviews,total_english_reviews,count_negative_english,count_positive_english
0,27262,t,"['email', 'phone']",100%,ΑΜΠΕΛΟΚΗΠΟΙ,37.98924,23.765,Entire rental unit,Entire home/apt,2,...,,,27262.0,29.0,short,27262.0,29.0,17.0,0.0,17.0
1,809874,f,"['email', 'phone']",96%,ΚΟΥΚΑΚΙ-ΜΑΚΡΥΓΙΑΝΝΗ,37.96215,23.72179,Entire rental unit,Entire home/apt,4,...,,,809874.0,97.0,short,809874.0,97.0,82.0,0.0,82.0
2,866381,t,"['email', 'phone', 'work_email']",97%,ΝΕΟΣ ΚΟΣΜΟΣ,37.95523,23.72443,Entire condo,Entire home/apt,3,...,,,866381.0,40.0,long,866381.0,40.0,31.0,0.0,31.0
3,886724,f,['email'],0%,ΚΥΨΕΛΗ,37.99745,23.73973,Entire rental unit,Entire home/apt,2,...,,,886724.0,18.0,short,886724.0,18.0,17.0,1.0,16.0
4,896212,t,"['email', 'phone']",100%,ΜΟΥΣΕΙΟ-ΕΞΑΡΧΕΙΑ-ΝΕΑΠΟΛΗ,37.98844,23.73845,Entire rental unit,Entire home/apt,2,...,,,896212.0,515.0,long,896212.0,515.0,423.0,3.0,420.0


In [49]:
final_df = final_df.merge(
    amenities_df,
    left_on='id',
    right_on='id',
    how='left'
)
final_df.head(5)

Unnamed: 0,id,host_is_superhost,host_verifications,host_acceptance_rate,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,...,count_negative_english,count_positive_english,amenities,standardized_amenities,num_of_amenities,num_of_other_amenities,has_wifi,has_air_conditioning,num_of_top_10_common_amenities,mean_embedding
0,27262,t,"['email', 'phone']",100%,ΑΜΠΕΛΟΚΗΠΟΙ,37.98924,23.765,Entire rental unit,Entire home/apt,2,...,0.0,17.0,"[""Heating - split type ductless system"", ""Dish...","['OTHER', 'OTHER', 'OTHER', 'OTHER', 'Refriger...",37,25,True,False,9,[-6.15361333e-02 -5.01407236e-02 -3.76954190e-...
1,809874,f,"['email', 'phone']",96%,ΚΟΥΚΑΚΙ-ΜΑΚΡΥΓΙΑΝΝΗ,37.96215,23.72179,Entire rental unit,Entire home/apt,4,...,0.0,82.0,"[""Host greets you"", ""Dishes and silverware"", ""...","['OTHER', 'OTHER', 'Washer', 'TV', 'OTHER', 'R...",21,12,True,True,7,[-5.95615655e-02 -3.39257978e-02 -3.55266146e-...
2,866381,t,"['email', 'phone', 'work_email']",97%,ΝΕΟΣ ΚΟΣΜΟΣ,37.95523,23.72443,Entire condo,Entire home/apt,3,...,0.0,31.0,"[""Coffee"", ""Dishes and silverware"", ""Washer"", ...","['Coffee maker', 'OTHER', 'Washer', 'Condition...",49,29,True,True,12,[-5.62126637e-02 -3.99859361e-02 -3.53127755e-...
3,886724,f,['email'],0%,ΚΥΨΕΛΗ,37.99745,23.73973,Entire rental unit,Entire home/apt,2,...,1.0,16.0,"[""Host greets you"", ""Luggage dropoff allowed"",...","['OTHER', 'OTHER', 'Washer', 'OTHER', 'OTHER',...",20,13,True,True,6,[-5.84196560e-02 -4.62852195e-02 -3.04865874e-...
4,896212,t,"['email', 'phone']",100%,ΜΟΥΣΕΙΟ-ΕΞΑΡΧΕΙΑ-ΝΕΑΠΟΛΗ,37.98844,23.73845,Entire rental unit,Entire home/apt,2,...,3.0,420.0,"[""Host greets you"", ""Coffee"", ""Dishes and silv...","['OTHER', 'Coffee maker', 'OTHER', 'OTHER', 'O...",45,31,True,True,9,[-5.81393391e-02 -4.77810428e-02 -4.26256545e-...


In [155]:
# how to drop target column with NaN?
final_df = final_df.dropna(subset=['target'])

In [153]:
# changing price to a numerical value
final_df['price'] = final_df['price'].replace('[\$,]', '', regex=True).astype(float)

In [158]:
# droppin unnecesery columns
final_df.drop(columns=['listing_id_x', 'listing_id_y'], inplace=True)

In [161]:
# changing host_avaialbility_rate to a number
final_df['host_acceptance_rate'] = final_df['host_acceptance_rate'].replace('[\%,]', '', regex=True).astype(float)

### Statystyki ostatecznego datasetu

In [162]:
final_df_numerical_columns = final_df.select_dtypes(include=[np.number, np.bool]).columns.tolist()
final_df_categorical_columns = final_df.select_dtypes(exclude=[np.number, np.bool]).columns.tolist()

print("Numerical columns: ", final_df_numerical_columns)
print("Categorical columns: ", final_df_categorical_columns)

Numerical columns:  ['id', 'host_acceptance_rate', 'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month', 'availability_eoy', 'number_of_reviews_ly', 'total_bookings', 'total_reviews', 'total_english_reviews', 'count_negative_english', 'count_positive_english', 'num_of_amenities', 'num_of_other_amenities', 'has_wifi', 'has_air_conditioning', 'num_of_top_10_common_amenities']
Categorical columns:  ['host_is_superhost', 'host_verifications', 'neighbourhood_cleansed', 'property_type', 'room_type', 'bathrooms_text', 'license', 'instant_bookable', 'target', 'amenities', 'standardized_amenities', 'mean_embedding']


# Klasyfikatory

### Model Naiwny - Klasyfikator

In [None]:
# We assume that 0 will represent tha class that is more representative in the dataset
class NaiveClassifier(nn.Module):
    def __init__(self, input_size: int, output: int = 0):
        super(NaiveClassifier, self).__init__()
        self.input_size = input_size
        self.output = output


    def forward(self, x):
        batch_size = x.shape[0]
        logits = torch.zeros((batch_size, 2), dtype=torch.float32)
        logits[:, self.output] = 1.0
        return logits

### Model docelowy - Klasyfikator

In [117]:
class BinaryClassifier(nn.Module):
    def __init__(self, input_size: int, output_size: int):
        super(BinaryClassifier, self).__init__()
        self.input_size = input_size
        self.output_size = output_size

        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)

        self.actv = F.relu

    def forward(self, x):
        x = self.actv(self.fc1(x))
        x = self.actv(self.fc2(x))
        x = self.fc3(x)

        return x


In [228]:
class BinaryClassifierEmbeddings(nn.Module):
    def __init__(self, num_numeric: int, embedding_sizes: list, output_size: int):
        super(BinaryClassifierEmbeddings, self).__init__()

        self.embeddings = nn.ModuleList([
            nn.Embedding(num_categories, emb_dim)
            for num_categories, emb_dim in embedding_sizes
        ])

        emb_dim_total = sum(emb_dim for _, emb_dim in embedding_sizes)
        self.input_size = num_numeric + emb_dim_total

        self.fc1 = nn.Linear(self.input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)

        self.actv = F.relu

    def forward(self, x_cat, x_num):
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embedded + [x_num], dim=1)

        x = self.actv(self.fc1(x))
        x = self.actv(self.fc2(x))
        x = self.fc3(x)
        return x

### Porównanie

In [123]:
def train_test_split_data(df: pd.DataFrame, test_size: float = 0.2, target_column: str = 'target'):
    X = df.drop(columns=[target_column])
    y = df[target_column]
    X = X.astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    # convert targets to int
    y_train = y_train.map({'short': 0, 'long': 1})
    y_test = y_test.map({'short': 0, 'long': 1})

    return X_train, X_test, y_train, y_test


def create_dataloader(x: pd.DataFrame, y: pd.Series, batch_size: int = 32):
    X_tensor = torch.tensor(x.values, dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.long)

    dataset = TensorDataset(X_tensor, y_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return dataloader


In [None]:
def train_classifier(
    model: nn.Module,
    train_loader: DataLoader,
    criterion,
    optimizer,
    num_epochs: int = 10,
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        # Wrap train_loader with tqdm for progress display
        for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False):

            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

    print("Training complete!")


def evaluate_model(model: nn.Module, test_loader: DataLoader, device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
    model.eval()
    all_preds = []
    all_labels = []
    # Use tqdm for the evaluation progress
    for inputs, labels in tqdm(test_loader, desc='Evaluating'):

        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs)
            probs = torch.softmax(outputs, dim=1)
        all_preds.append(probs)
        all_labels.append(labels)

    all_preds = torch.cat(all_preds).cpu().numpy()
    all_labels = torch.cat(all_labels).cpu().numpy()

    accuracy = accuracy_score(all_labels, np.argmax(all_preds, axis=1))
    roc_auc = roc_auc_score(all_labels, all_preds[:, 1])
    avg_precision = average_precision_score(all_labels, all_preds[:, 1])
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")

    return all_preds, all_labels

#### Wersja 1

In [163]:
numerical_df = final_df[final_df_numerical_columns + ['target']].copy()
numerical_df = numerical_df.fillna(0)
numerical_df = numerical_df.drop(columns=['id'])
numerical_df.head(5)


Unnamed: 0,host_acceptance_rate,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,review_scores_rating,review_scores_accuracy,...,total_reviews,total_english_reviews,count_negative_english,count_positive_english,num_of_amenities,num_of_other_amenities,has_wifi,has_air_conditioning,num_of_top_10_common_amenities,target
0,100.0,37.98924,23.765,2,1.0,1.0,1.0,131.0,4.86,4.89,...,29.0,17.0,0.0,17.0,37,25,True,False,9,short
1,96.0,37.96215,23.72179,4,1.0,1.0,2.0,108.0,4.96,4.97,...,97.0,82.0,0.0,82.0,21,12,True,True,7,short
2,97.0,37.95523,23.72443,3,1.0,1.0,1.0,85.0,4.95,4.98,...,40.0,31.0,0.0,31.0,49,29,True,True,12,long
3,0.0,37.99745,23.73973,2,1.0,1.0,1.0,56.0,4.61,4.88,...,18.0,17.0,1.0,16.0,20,13,True,True,6,short
4,100.0,37.98844,23.73845,2,1.0,1.0,1.0,63.0,4.95,4.94,...,515.0,423.0,3.0,420.0,45,31,True,True,9,long


In [169]:
numerical_df.shape

(28807, 29)

In [99]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [164]:
x_train, x_test, y_train, y_test = train_test_split_data(
    numerical_df,
    test_size=0.2,
    target_column='target'
)

train_dataloader = create_dataloader(x_train, y_train, batch_size=32)
test_dataloader = create_dataloader(x_test, y_test, batch_size=32)

In [None]:
naive_classifier = NaiveClassifier(input_size=len(x_train.columns), output=0).to(device)

classifier_v1 = BinaryClassifier(input_size=len(x_train.columns), output_size=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier_v1.parameters(), lr=0.001)

In [115]:
all_preds, all_labels = evaluate_model(naive_classifier, test_dataloader)

Evaluating: 100%|██████████| 181/181 [00:00<00:00, 723.82it/s]

Accuracy: 0.7805
ROC AUC: 0.5000
Average Precision: 0.2195





In [166]:
train_classifier(
    classifier_v1,
    train_dataloader,
    criterion,
    optimizer,
    num_epochs=50
)
all_preds, all_labels = evaluate_model(classifier_v1, test_dataloader)

                                                              

Epoch [1/50], Loss: 0.3764


                                                              

Epoch [2/50], Loss: 0.2844


                                                              

Epoch [3/50], Loss: 0.2643


                                                              

Epoch [4/50], Loss: 0.2538


                                                              

Epoch [5/50], Loss: 0.2495


                                                              

Epoch [6/50], Loss: 0.2435


                                                              

Epoch [7/50], Loss: 0.2408


                                                              

Epoch [8/50], Loss: 0.2394


                                                              

Epoch [9/50], Loss: 0.2397


                                                               

Epoch [10/50], Loss: 0.2333


                                                               

Epoch [11/50], Loss: 0.2322


                                                               

Epoch [12/50], Loss: 0.2281


                                                               

Epoch [13/50], Loss: 0.2272


                                                               

Epoch [14/50], Loss: 0.2285


                                                               

Epoch [15/50], Loss: 0.2284


                                                               

Epoch [16/50], Loss: 0.2264


                                                               

Epoch [17/50], Loss: 0.2235


                                                               

Epoch [18/50], Loss: 0.2226


                                                               

Epoch [19/50], Loss: 0.2210


                                                               

Epoch [20/50], Loss: 0.2195


                                                               

Epoch [21/50], Loss: 0.2195


                                                               

Epoch [22/50], Loss: 0.2182


                                                               

Epoch [23/50], Loss: 0.2200


                                                               

Epoch [24/50], Loss: 0.2163


                                                               

Epoch [25/50], Loss: 0.2160


                                                               

Epoch [26/50], Loss: 0.2147


                                                               

Epoch [27/50], Loss: 0.2131


                                                               

Epoch [28/50], Loss: 0.2112


                                                               

Epoch [29/50], Loss: 0.2123


                                                               

Epoch [30/50], Loss: 0.2147


                                                               

Epoch [31/50], Loss: 0.2095


                                                               

Epoch [32/50], Loss: 0.2102


                                                               

Epoch [33/50], Loss: 0.2095


                                                               

Epoch [34/50], Loss: 0.2087


                                                               

Epoch [35/50], Loss: 0.2096


                                                               

Epoch [36/50], Loss: 0.2076


                                                               

Epoch [37/50], Loss: 0.2067


                                                               

Epoch [38/50], Loss: 0.2047


                                                               

Epoch [39/50], Loss: 0.2046


                                                               

Epoch [40/50], Loss: 0.2040


                                                               

Epoch [41/50], Loss: 0.2069


                                                               

Epoch [42/50], Loss: 0.2055


                                                               

Epoch [43/50], Loss: 0.2055


                                                               

Epoch [44/50], Loss: 0.2019


                                                               

Epoch [45/50], Loss: 0.2030


                                                               

Epoch [46/50], Loss: 0.1997


                                                               

Epoch [47/50], Loss: 0.1998


                                                               

Epoch [48/50], Loss: 0.2030


                                                               

Epoch [49/50], Loss: 0.2013


                                                               

Epoch [50/50], Loss: 0.2018
Training complete!


Evaluating: 100%|██████████| 181/181 [00:00<00:00, 720.53it/s]

Accuracy: 0.8924
ROC AUC: 0.9531
Average Precision: 0.8252





### Wersja 2

In [170]:
final_df_ohe = final_df.copy()

In [177]:
final_df_ohe.drop(
    columns=['id', 'mean_embedding', 'listing_id_x', 'listing_id_y', 'amenities', 'standardized_amenities', 'neighbourhood_cleansed', 'license'],
    inplace=True,
    errors='ignore'
)
final_df_ohe.head()

Unnamed: 0,host_is_superhost,host_verifications,host_acceptance_rate,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,...,target,total_reviews,total_english_reviews,count_negative_english,count_positive_english,num_of_amenities,num_of_other_amenities,has_wifi,has_air_conditioning,num_of_top_10_common_amenities
0,t,"['email', 'phone']",100.0,37.98924,23.765,Entire rental unit,Entire home/apt,2,1.0,1 bath,...,short,29.0,17.0,0.0,17.0,37,25,True,False,9
1,f,"['email', 'phone']",96.0,37.96215,23.72179,Entire rental unit,Entire home/apt,4,1.0,1 bath,...,short,97.0,82.0,0.0,82.0,21,12,True,True,7
2,t,"['email', 'phone', 'work_email']",97.0,37.95523,23.72443,Entire condo,Entire home/apt,3,1.0,1 bath,...,long,40.0,31.0,0.0,31.0,49,29,True,True,12
3,f,['email'],0.0,37.99745,23.73973,Entire rental unit,Entire home/apt,2,1.0,1 bath,...,short,18.0,17.0,1.0,16.0,20,13,True,True,6
4,t,"['email', 'phone']",100.0,37.98844,23.73845,Entire rental unit,Entire home/apt,2,1.0,1 bath,...,long,515.0,423.0,3.0,420.0,45,31,True,True,9


In [146]:
neighbourhood_cleansed_values = final_df_ohe['neighbourhood_cleansed'].unique()
print(len(neighbourhood_cleansed_values))

171


In [180]:
non_numerical_columns = final_df_ohe.select_dtypes(include=['object', 'category']).columns
non_numerical_columns = non_numerical_columns.drop('target')
print(non_numerical_columns)

Index(['host_is_superhost', 'host_verifications', 'property_type', 'room_type',
       'bathrooms_text', 'instant_bookable'],
      dtype='object')


In [204]:
# one_hot_encoding
ohe_df = pd.get_dummies(final_df_ohe, columns=non_numerical_columns)

In [None]:
ohe_df.drop(columns=['availability_eoy', 'number_of_reviews_ly'], inplace=True)

In [222]:
ohe_df = ohe_df.fillna(0)

In [223]:
x_train, x_test, y_train, y_test = train_test_split_data(
    ohe_df,
    test_size=0.2,
    target_column='target'
)

train_dataloader = create_dataloader(x_train, y_train, batch_size=32)
test_dataloader = create_dataloader(x_test, y_test, batch_size=32)

In [224]:
classifier_v2 = BinaryClassifier(input_size=len(x_train.columns), output_size=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier_v2.parameters(), lr=0.001)

In [225]:
assert not x_train.isnull().any().any(), "NaNs found in x_train"
assert not np.isinf(x_train.to_numpy()).any(), "Infs found in x_train"
assert set(y_train.unique()).issubset({0, 1}), "Invalid labels in y_train"

In [226]:
train_classifier(
    classifier_v2,
    train_dataloader,
    criterion,
    optimizer,
    50,
    device
)

                                                              

Epoch [1/50], Loss: 0.2894


                                                              

Epoch [2/50], Loss: 0.2002


                                                              

Epoch [3/50], Loss: 0.1855


                                                              

Epoch [4/50], Loss: 0.1729


                                                              

Epoch [5/50], Loss: 0.1658


                                                              

Epoch [6/50], Loss: 0.1633


                                                              

Epoch [7/50], Loss: 0.1600


                                                              

Epoch [8/50], Loss: 0.1549


                                                              

Epoch [9/50], Loss: 0.1519


                                                               

Epoch [10/50], Loss: 0.1492


                                                               

Epoch [11/50], Loss: 0.1485


                                                               

Epoch [12/50], Loss: 0.1456


                                                               

Epoch [13/50], Loss: 0.1415


                                                               

Epoch [14/50], Loss: 0.1439


                                                               

Epoch [15/50], Loss: 0.1410


                                                               

Epoch [16/50], Loss: 0.1362


                                                               

Epoch [17/50], Loss: 0.1373


                                                               

Epoch [18/50], Loss: 0.1359


                                                               

Epoch [19/50], Loss: 0.1368


                                                               

Epoch [20/50], Loss: 0.1341


                                                               

Epoch [21/50], Loss: 0.1332


                                                               

Epoch [22/50], Loss: 0.1304


                                                               

Epoch [23/50], Loss: 0.1336


                                                               

Epoch [24/50], Loss: 0.1314


                                                               

Epoch [25/50], Loss: 0.1300


                                                               

Epoch [26/50], Loss: 0.1275


                                                               

Epoch [27/50], Loss: 0.1325


                                                               

Epoch [28/50], Loss: 0.1296


                                                               

Epoch [29/50], Loss: 0.1275


                                                               

Epoch [30/50], Loss: 0.1230


                                                               

Epoch [31/50], Loss: 0.1278


                                                               

Epoch [32/50], Loss: 0.1257


                                                               

Epoch [33/50], Loss: 0.1244


                                                               

Epoch [34/50], Loss: 0.1244


                                                               

Epoch [35/50], Loss: 0.1232


                                                               

Epoch [36/50], Loss: 0.1234


                                                               

Epoch [37/50], Loss: 0.1219


                                                               

Epoch [38/50], Loss: 0.1199


                                                               

Epoch [39/50], Loss: 0.1249


                                                               

Epoch [40/50], Loss: 0.1186


                                                               

Epoch [41/50], Loss: 0.1215


                                                               

Epoch [42/50], Loss: 0.1222


                                                               

Epoch [43/50], Loss: 0.1185


                                                               

Epoch [44/50], Loss: 0.1184


                                                               

Epoch [45/50], Loss: 0.1194


                                                               

Epoch [46/50], Loss: 0.1141


                                                               

Epoch [47/50], Loss: 0.1192


                                                               

Epoch [48/50], Loss: 0.1145


                                                               

Epoch [49/50], Loss: 0.1148


                                                               

Epoch [50/50], Loss: 0.1171
Training complete!




In [227]:
all_preds, all_labels = evaluate_model(classifier_v2, test_dataloader, device)

Evaluating: 100%|██████████| 181/181 [00:00<00:00, 310.22it/s]

Accuracy: 0.9436
ROC AUC: 0.9862
Average Precision: 0.9575





In [233]:
torch.save(classifier_v2.state_dict(), "../models/OHE_CLASSIFIER.pth")

#### Wersja 3

In [246]:
embeddings = final_df.copy()

In [247]:
embeddings = embeddings.drop(columns=['availability_eoy','number_of_reviews_ly', 'id'])
embeddings = embeddings.fillna(0)

In [248]:
emb_categorical_columns = embeddings.select_dtypes(include=['object', 'category']).columns
print(emb_categorical_columns)

Index(['host_is_superhost', 'host_verifications', 'neighbourhood_cleansed',
       'property_type', 'room_type', 'bathrooms_text', 'license',
       'instant_bookable', 'target', 'amenities', 'standardized_amenities',
       'mean_embedding'],
      dtype='object')


In [242]:
emb_numerical_columns = embeddings.select_dtypes(include=['number']).columns
print(emb_numerical_columns)

Index(['host_acceptance_rate', 'latitude', 'longitude', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'price', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'reviews_per_month',
       'total_bookings', 'total_reviews', 'total_english_reviews',
       'count_negative_english', 'count_positive_english', 'num_of_amenities',
       'num_of_other_amenities', 'num_of_top_10_common_amenities'],
      dtype='object')


In [249]:
for col in emb_categorical_columns:
    embeddings[col] = embeddings[col].astype('category')

cat_dims = [len(embeddings[col].cat.categories) for col in emb_categorical_columns]
cat_idxs = [embeddings.columns.get_loc(col) for col in emb_categorical_columns]

In [251]:

x_cat = embeddings[emb_categorical_columns].apply(lambda col: col.cat.codes)
x_num = embeddings[emb_numerical_columns].astype(np.float32)
y = embeddings['target'].map({'short': 0, 'long': 1})

In [252]:
x_cat_train, x_cat_test, x_num_train, x_num_test, y_train, y_test = train_test_split(
    x_cat, x_num, y, test_size=0.2, random_state=42, stratify=y
)

x_cat_train_tensor = torch.tensor(x_cat_train.values, dtype=torch.long)
x_cat_test_tensor = torch.tensor(x_cat_test.values, dtype=torch.long)

x_num_train_tensor = torch.tensor(x_num_train.values, dtype=torch.float32)
x_num_test_tensor = torch.tensor(x_num_test.values, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

train_dataset = TensorDataset(x_cat_train_tensor, x_num_train_tensor, y_train_tensor)
test_dataset = TensorDataset(x_cat_test_tensor, x_num_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [253]:
def train_classifier_emb(
    model: nn.Module,
    train_loader: DataLoader,
    criterion,
    optimizer,
    num_epochs: int = 10,
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for x_cat, x_num, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
            x_cat = x_cat.to(device)
            x_num = x_num.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(x_cat, x_num)  # updated!
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {running_loss / len(train_loader):.4f}")


def evaluate_model_emb(model: nn.Module, test_loader: DataLoader, device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for x_cat, x_num, labels in tqdm(test_loader, desc='Evaluating'):
            x_cat = x_cat.to(device)
            x_num = x_num.to(device)
            labels = labels.to(device)

            outputs = model(x_cat, x_num)
            probs = F.softmax(outputs, dim=1)  # because CrossEntropyLoss expects raw logits

            all_preds.append(probs)
            all_labels.append(labels)

    all_preds = torch.cat(all_preds).cpu().numpy()
    all_labels = torch.cat(all_labels).cpu().numpy()

    accuracy = accuracy_score(all_labels, np.argmax(all_preds, axis=1))
    roc_auc = roc_auc_score(all_labels, all_preds[:, 1])
    avg_precision = average_precision_score(all_labels, all_preds[:, 1])

    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")

    return all_preds, all_labels


In [254]:
for col in emb_categorical_columns:
    embeddings[col] = embeddings[col].astype('category')

cat_dims = [len(embeddings[col].cat.categories) for col in emb_categorical_columns]
embedding_sizes = [(dim, min(50, (dim + 1) // 2)) for dim in cat_dims]

In [255]:
classifier_v3 = BinaryClassifierEmbeddings(
    num_numeric=len(x_num.columns),
    embedding_sizes=embedding_sizes,
    output_size=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=classifier_v3.parameters(), lr=0.0001)

In [256]:
train_classifier_emb(
    classifier_v3,
    train_loader,
    criterion,
    optimizer,
    50,
    device
)

                                                              

Epoch [1/50] Loss: 0.3378


                                                              

Epoch [2/50] Loss: 0.2363


                                                              

Epoch [3/50] Loss: 0.1436


                                                              

Epoch [4/50] Loss: 0.0679


                                                              

Epoch [5/50] Loss: 0.0274


                                                              

Epoch [6/50] Loss: 0.0115


                                                              

Epoch [7/50] Loss: 0.0066


                                                              

Epoch [8/50] Loss: 0.0021


                                                              

Epoch [9/50] Loss: 0.0010


                                                               

Epoch [10/50] Loss: 0.0006


                                                               

Epoch [11/50] Loss: 0.0003


                                                               

Epoch [12/50] Loss: 0.0002


                                                               

Epoch [13/50] Loss: 0.0001


                                                               

Epoch [14/50] Loss: 0.0001


                                                               

Epoch [15/50] Loss: 0.0000


                                                               

Epoch [16/50] Loss: 0.0000


                                                               

Epoch [17/50] Loss: 0.0000


                                                               

Epoch [18/50] Loss: 0.0000


                                                               

Epoch [19/50] Loss: 0.0000


                                                               

Epoch [20/50] Loss: 0.0000


                                                               

Epoch [21/50] Loss: 0.0000


                                                               

Epoch [22/50] Loss: 0.0000


                                                               

Epoch [23/50] Loss: 0.0000


                                                               

Epoch [24/50] Loss: 0.0000


                                                               

Epoch [25/50] Loss: 0.0000


                                                               

Epoch [26/50] Loss: 0.0000


                                                               

Epoch [27/50] Loss: 0.0000


                                                               

Epoch [28/50] Loss: 0.0000


                                                               

Epoch [29/50] Loss: 0.0000


                                                               

Epoch [30/50] Loss: 0.0000


                                                               

Epoch [31/50] Loss: 0.0000


                                                               

Epoch [32/50] Loss: 0.0000


                                                               

Epoch [33/50] Loss: 0.0000


                                                               

Epoch [34/50] Loss: 0.0000


                                                               

Epoch [35/50] Loss: 0.0000


                                                               

Epoch [36/50] Loss: 0.0000


                                                               

Epoch [37/50] Loss: 0.0000


                                                               

Epoch [38/50] Loss: 0.0000


                                                               

Epoch [39/50] Loss: 0.0019


                                                               

Epoch [40/50] Loss: 0.0000


                                                               

Epoch [41/50] Loss: 0.0000


                                                               

Epoch [42/50] Loss: 0.0000


                                                               

Epoch [43/50] Loss: 0.0000


                                                               

Epoch [44/50] Loss: 0.0000


                                                               

Epoch [45/50] Loss: 0.0000


                                                               

Epoch [46/50] Loss: 0.0000


                                                               

Epoch [47/50] Loss: 0.0000


                                                               

Epoch [48/50] Loss: 0.0000


                                                               

Epoch [49/50] Loss: 0.0000


                                                               

Epoch [50/50] Loss: 0.0000




In [258]:
all_preds, all_labels = evaluate_model_emb(classifier_v3, test_loader, device)

Evaluating: 100%|██████████| 181/181 [00:00<00:00, 385.34it/s]

Accuracy: 1.0000
ROC AUC: 1.0000
Average Precision: 1.0000





# Modele Regresyjne

### Model Naiwny - Regresja

In [None]:
# TODO

### Model docelowy - Regresja