# Library

## Imports

In [1]:
from typing import Optional

from transformers.models.esm.openfold_utils.protein import from_prediction
!pip install kagglehub



In [2]:
import pandas as pd
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
import math
from typing import Optional 

## Plotting library

Plotting of outliers and quantiles for a feature  

In [3]:
def plot_box_plots(data):
    """
    Draw one horizontal box plot per numeric column of ``data``, stacked vertically.

    Non-numeric columns (e.g. strings or datetimes) are skipped, because the
    axis limits are derived from ``min() - 1`` and ``max() + 1``, which is only
    defined for numbers. If ``data`` has no numeric columns nothing is drawn.

    :param data: pandas DataFrame whose numeric columns are plotted
    :return: None; the figure is displayed with ``plt.show()``
    """
    # Keep only the columns for which the min/max arithmetic below is valid
    numeric_data = data.select_dtypes(include='number')
    num_features = len(numeric_data.columns)

    # plt.subplots rejects nrows=0, so bail out when there is nothing to plot
    if num_features == 0:
        return

    # squeeze=False always returns a 2-D array of axes, even for one feature
    fig, axes = plt.subplots(
        nrows=num_features, ncols=1, figsize=(12, num_features * 3), squeeze=False
    )

    for ax, feature in zip(axes[:, 0], numeric_data.columns):
        # Echo the name of the feature being plotted to stdout
        print(feature)
        values = numeric_data[feature]
        sns.boxplot(x=values, ax=ax)
        ax.set_title(f'Box plot for {feature}')

        # Pad the x-range by one unit on each side of the observed values;
        # an all-NaN column has NaN min/max, which set_xlim would reject
        lower, upper = values.min(), values.max()
        if math.isfinite(lower) and math.isfinite(upper):
            ax.set_xlim(lower - 1, upper + 1)

    plt.tight_layout()
    plt.show()

Plot the correlation heatmap

In [4]:
def plot_correlation_matrix(df, method='spearman'):
    """
    Show an annotated heatmap of the pairwise correlations between the columns of ``df``.

    :param df: pandas DataFrame whose columns are correlated with each other
    :param method: correlation method forwarded to ``DataFrame.corr`` (default ``'spearman'``)
    :return: None; the figure is displayed with ``plt.show()``
    """
    correlations = df.corr(method=method)

    # Heatmap appearance: cell annotations with two decimals, thin cell borders,
    # a colour bar and square cells
    heatmap_options = dict(
        annot=True,
        cmap='coolwarm',
        fmt='.2f',
        linewidths=0.5,
        cbar=True,
        square=True,
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(correlations, **heatmap_options)
    plt.title(f'Correlation Heatmap ({method})')
    plt.show()

## Distribution library

This class helps to analyse the distributions of the features.

It provides **2** methods:
- for plotting numerical feature's distribution
- for plotting frequency for nominal value

In [5]:
class DistributionAnalyser:
    """Plots value distributions for columns of a private copy of a DataFrame."""

    def __init__(self, df):
        # Work on a copy so the caller's DataFrame is never touched
        self._df = df.copy()

    def plot_numerical_distributions(self, numerical_features, num_columns=2):
        """
        Draw a 30-bin histogram for every listed feature on a grid of subplots.

        :param numerical_features: column names to plot
        :param num_columns: number of subplot columns in the grid
        :return: None; the figure is displayed with ``plt.show()``
        """
        # Rows needed to place every feature on a grid with num_columns columns
        grid_rows = math.ceil(len(numerical_features) / num_columns)

        # Figure height grows with the number of grid rows
        plt.figure(figsize=(15, (grid_rows + 1) * 5))

        # Subplot positions are 1-based
        for position, column in enumerate(numerical_features, start=1):
            plt.subplot(grid_rows, num_columns, position)
            sns.histplot(self._df[column], bins=30, kde=False)

            plt.xlabel('Value')
            plt.ylabel('Frequency')
            plt.title(column)

        # Avoid overlapping subplots, then render everything at once
        plt.tight_layout()
        plt.show()

    def plot_categorical_relation_frequency(self, categorical_features, num_columns=2, top_n=100, height=5):
        """
        Draw a horizontal bar chart of the most frequent values of every listed feature.

        Bar length is the value's count divided by the total number of rows of
        the stored DataFrame. Columns holding ``list`` entries are exploded so
        that each list element is counted on its own.

        :param categorical_features: column names to plot
        :param num_columns: number of subplot columns in the grid
        :param top_n: maximum number of most frequent values shown per feature
        :param height: figure height allotted to each grid row
        :return: None; the figure is displayed with ``plt.show()``
        """
        grid_rows = math.ceil(len(categorical_features) / num_columns)

        plt.figure(figsize=(15, grid_rows * height))

        # Subplot positions are 1-based
        for position, column in enumerate(categorical_features, start=1):
            plt.subplot(grid_rows, num_columns, position)

            # Only `list` entries trigger the explode step
            series = self._df[column]
            holds_lists = series.apply(lambda x: isinstance(x, list)).any()
            if holds_lists:
                series = self._df.explode(column)[column]
            observed = series.dropna()

            # Counts of the top_n values, normalised by the row count of the
            # original column (missing rows included in the denominator)
            shares = observed.value_counts().head(top_n) / len(self._df[column])

            sns.barplot(x=shares.values, y=shares.index, palette='viridis')

            plt.xlabel('Frequency')
            plt.ylabel('Value')
            plt.title(f'Distribution of {column}')

            # With many or long labels, set the y tick label rotation to 0 degrees
            has_long_label = any(len(str(label)) > 15 for label in shares.index)
            if len(shares) > 10 or has_long_label:
                plt.yticks(rotation=0)

        # Avoid overlapping subplots, then render everything at once
        plt.tight_layout()
        plt.show()

## Missing values library

This class is used for two purposes:
- draw a heatmap of missing values in the datasets' features
- show a table with the **count and percentage of missing values** for each feature

In [6]:
class NanAnalyser:
    """Reports on missing values in a private copy of a DataFrame."""

    def __init__(self, df):
        # Work on a copy so the caller's DataFrame is never touched
        self._df = df.copy()

    def plot_nan_values_heatmap(self):
        """
        Show a heatmap of the missing-value mask of the dataframe.

        - Each cell of the heatmap corresponds to one cell of the dataframe.
        - Missing and present values get different colours, which helps to
          spot patterns of missing data.
        """
        null_mask = self._df.isnull()
        sns.heatmap(null_mask, cbar=False, cmap="viridis")
        plt.title("Missing Values Heatmap")
        plt.show()

    def get_nan_statistic(self) -> pd.DataFrame:
        """
        Summarise the missing values of every column that has at least one.

        - 'Missing Values Count' holds the number of missing values per column.
        - 'Missing Values Percentage' holds that count as a percentage of the rows.
        - Columns without missing values are left out of the result.
        """
        nan_counts = self._df.isnull().sum()

        # Share of missing values relative to the total number of rows
        nan_shares = nan_counts / len(self._df) * 100

        # Keep only the columns that actually contain missing values
        return pd.DataFrame({
            'Missing Values Count': nan_counts[nan_counts > 0],
            'Missing Values Percentage': nan_shares[nan_shares > 0],
        })

## Time series library

This class helps to create **time series**

In [7]:
class TimeSeriesGenerator:
    """Builds and plots per-date averages of a target column, optionally split by group."""

    def __init__(self, df: pd.DataFrame):
        # Copy the frame so that the datetime conversion performed in
        # _create_time_series never modifies the caller's DataFrame
        self._df = df.copy()

    def plot_time_series(self, date_col: str, target_col: str, group_col: Optional[str] = None):
        """
        Plots the averaged time series, with a separate line for each group when grouping is requested.
        :param date_col: Name of the date column
        :param target_col: Name of the target feature column
        :param group_col: Name of the column for grouping; when None (or empty) a single
                          line with the per-date average over all rows is drawn
        """
        grouped_data = self._create_time_series(date_col, target_col, group_col)
        plt.figure(figsize=(10, 5))

        if group_col:
            # One labelled line per distinct group value
            for group in grouped_data[group_col].unique():
                subset = grouped_data[grouped_data[group_col] == group]
                plt.plot(subset[date_col], subset[target_col], label=f'Group {group}')

            plt.title(f"Time series by group {group_col}")
            # The legend is only meaningful when labelled lines exist
            plt.legend()
        else:
            plt.plot(grouped_data[date_col], grouped_data[target_col])
            plt.title(f"Time series of {target_col}")

        plt.xlabel(date_col)
        plt.ylabel(f"Avg {target_col}")
        plt.grid(True)
        plt.show()

    def _create_time_series(self, date_col: str, target_col: str, group_col: Optional[str] = None) -> pd.DataFrame:
        """
        Creates a time series by averaging the target column per date and, if given, per group.
        :param date_col: Name of the date column
        :param target_col: Name of the target feature column
        :param group_col: Name of the column for grouping; when None (or empty) the data
                          is grouped by date only
        :return: pandas DataFrame with the grouping keys as columns and the mean of the target column
        """
        # Convert on the private copy only
        self._df[date_col] = pd.to_datetime(self._df[date_col])

        group_keys = [date_col, group_col] if group_col else [date_col]
        grouped = (self._df.groupby(group_keys)[target_col]
                   .mean()
                   .reset_index())
        return grouped

# Datasets loading and filtering 

## Reasoning of the datasets' choice 

The following datasets were chosen for exploration and for use with **SVD++** because they contain information either about the relationships between users and businesses (the information the RecSys is meant to reconstruct) or about features of the users or items themselves (such as the business dataset):
- business dataset;
- tip dataset
- review

The user dataset contains only information about the users themselves, not about user-item (U-I) relations; similarly, the checkin dataset contains only a potential feature of the businesses.

Normally the datasets are downloaded from Kaggle, but to save time a local path is used instead.
If you want to download the datasets again, just uncomment the code below.

In [8]:
# Directory that contains the Yelp dataset JSON files read by the cells below.
# NOTE(review): this absolute path is machine-specific; on another machine
# use the kagglehub download call that is commented out below.
path = "/Users/simon/.cache/kagglehub/datasets/yelp-dataset/yelp-dataset/versions/4"

# path = kagglehub.dataset_download("yelp-dataset/yelp-dataset")
# 
# print("Path to dataset files:", path)

Business' features description:
- `business_id` - id of a business
- `name` - name of the business
- `address` - address of the business (geographical data)
- `city` - city of the business (geographical data)
- `state` - state of the business (geographical data)
- `postal_code` - postal code of the business (geographical data)
- `latitude` - latitude of the business (geographical data)
- `longitude` - longitude of the business (geographical data)
- `review_count` - the amount of the reviews gathered for the particular business
- `is_open` - is the business opened (closed businesses are not relevant to use)
- `attributes` - business' attributes (no provided context)
- `categories` - categories related to the business
- `hours` - working hours

In [9]:
# Load the business dataset; lines=True reads the file as one JSON object per line
business_df = pd.read_json(f"{path}/yelp_academic_dataset_business.json", lines=True)
business_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


Reasons for dropping features:
- geographical data isn't useful, since we don't have the same kind of information about the users
- some features don't describe anything relevant to the purpose of the RecSys (e.g. `name` or `hours`)
- the `attributes` feature comes without the context needed to analyse it

Features that remain:
- `business_id` - id of a business
- `review_count` - this feature can potentially participate in the forming of **implicit rating**
- `categories` - this feature can potentially participate in the forming of **implicit rating** + it's necessary to separate it into different entity
- `is_open` - it's necessary to check ratio between open / closed businesses and filter them based on this feature since the recommendation of the closed business doesn't make any sense

In [10]:
# Keep only the business columns retained for further analysis
filtered_business_id = business_df[['business_id', 'review_count', 'is_open', 'categories']]
filtered_business_id

Unnamed: 0,business_id,review_count,is_open,categories
0,Pns2l4eNsfO8kk83dixA6A,7,0,"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,15,1,"Shipping Centers, Local Services, Notaries, Ma..."
2,tUFrWirKiKi_TAnsVWINQQ,22,0,"Department Stores, Shopping, Fashion, Home & G..."
3,MTSW4McQd7CbVtyjqoe9mw,80,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,13,1,"Brewpubs, Breweries, Food"
...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,13,1,"Nail Salons, Beauty & Spas"
150342,c8GjPIOTGVmIemT7j5_SyQ,5,1,"Pets, Nurseries & Gardening, Pet Stores, Hobby..."
150343,_QAMST-NrQobXduilWEqSw,8,1,"Shopping, Jewelry, Piercing, Toy Stores, Beaut..."
150344,mtGm22y5c2UHNXDFAjaPNw,24,1,"Fitness/Exercise Equipment, Eyewear & Optician..."


Reviews' features:
- `review_id` | `user_id` | `business_id` - id of the review and foreign keys (one user can leave several reviews for one item)
- `stars` - **explicit rating** provided by user for the particular item in the particular moment
- `useful` | `funny` | `cool` - users' flags (presumably) about the review. These features are not dropped yet, since evidence is needed that this assumption about their nature is right - check **whether they can be used for implicit rating**
- `text` - the content of the review (can be useful for potential sentiment analysis)
- `date` - the timestamp of review

All the features can be used for the future development and need to be analysed.

In [None]:
# Load the review dataset; lines=True reads the file as one JSON object per line
review_df = pd.read_json(f"{path}/yelp_academic_dataset_review.json", lines=True)
review_df

Tip's features:
- `user_id` | `business_id` - ids of users and businesses with uncertainty in uniqueness of them and their pairs
- `text` - content of the tip (can be useful for potential sentiment analysis)
- `date` - the date of publication
- `compliment_count` - how many users complimented a particular tip - **consider opportunity of usage in implicit rating**

In [None]:
# Load the tip dataset; lines=True reads the file as one JSON object per line
tip_df = pd.read_json(f"{path}/yelp_academic_dataset_tip.json", lines=True)
tip_df

Only one feature was dropped - `compliment_count` - for the following reason: this feature describes **user-to-user** relationships and would be useful if we wanted to recommend *users to other users*. But the purpose of the RecSys under development is recommending **items to users**

In [None]:
# Drop the column by name; `columns=` already identifies the axis
filtered_tip_df = tip_df.drop(columns=['compliment_count'])
filtered_tip_df

# EDA

## Reviews dataset

### Basic descriptive statistic

In [None]:
# Use business_id as the index of the reviews (it is no longer a regular column afterwards)
review_df = review_df.set_index('business_id')
review_df

The conclusions from the descriptive statistics:
- All the features have the same number of values, so **there are no NaNs** (needs to be rechecked)
- At least `cool` and `funny` (probably `useful` as well) have a lot of outliers (judging by the 75th percentile), while their maximum values are dramatically large. This is indirect evidence for the theory that these features are set by other users and describe **user-to-user interaction**
- Following the previous point, these features are candidates for dropping, but the number of outliers needs to be rechecked first, because they may still be useful for implicit ratings.
- The data spans from **2005** to **2022**, which lets us assume that we have enough historical data to create **time series** and **implicit ratings** based on it

In [None]:
# Summary statistics of the review columns
review_df.describe()

All the features that will be explored are **continuous** (**numerical**)

In [None]:
# Data type of every review column
review_df.dtypes

Remove `text` from the analysed features, since sentiment analysis requires a different kind of analysis for this **type of data**

In [None]:
# Review columns examined in the EDA below; `text` is intentionally left out
FEATURES_FOR_ANALYSIS = ['stars', 'useful', 'funny', 'cool', 'date']

### Nan analysis

As can be seen below, the **assumption is confirmed** - there are no NaNs in the dataset at all

In [None]:
# Missing-value analyser working on its own copy of the reviews
nan_analyser = NanAnalyser(review_df)

In [None]:
# Heatmap of the missing-value mask of the reviews
nan_analyser.plot_nan_values_heatmap()

In [None]:
# Count and percentage of missing values for the columns that have any
nan_analyser.get_nan_statistic()

### Distribution and outliers analysis

In [None]:
# Distribution analyser working on its own copy of the reviews
distribution_analyser = DistributionAnalyser(review_df)

In [None]:
# Histograms of the selected review features
distribution_analyser.plot_numerical_distributions(FEATURES_FOR_ANALYSIS)

In [None]:
# Pass only the numeric columns: box plots and their numeric axis limits are
# not defined for the id, text and date columns of the reviews
plot_box_plots(review_df.select_dtypes(include='number'))

### Correlation analysis

Extract numerical features

In [None]:
# Review columns used for the correlation analysis
NUMERICAL_FEATURES = ['stars', 'useful', 'funny', 'cool']

In [None]:
# Pearson correlation between the numerical review features
plot_correlation_matrix(review_df[NUMERICAL_FEATURES], method='pearson')

In [None]:
# Same features with the function's default method (Spearman)
plot_correlation_matrix(review_df[NUMERICAL_FEATURES])

### Time series analysis

In [None]:
# Average `stars` per date and business, plotted as one line per business.
# business_id is the index of review_df at this point and is referenced by
# name as the grouping key.
# NOTE(review): one line and one legend entry are drawn per distinct
# business_id, which may be very slow for a large number of businesses -
# consider filtering or sampling businesses first.
# NOTE(review): grouping uses the raw `date` values; if they carry a time of
# day, resampling to a coarser period may be needed - TODO confirm.
time_series = TimeSeriesGenerator(review_df)

time_series.plot_time_series(date_col='date', target_col='stars', group_col='business_id')

### Conclusions

1. Check out the opportunity of stratifying the **review** dataset