In [None]:
import pandas as pd

# Read the restaurant listings for both cities, then the review dumps.
restaurants_kl, restaurants_rome = (
    pd.read_csv(listing_csv)
    for listing_csv in (
        '/mnt/data/Restaurants_KL.csv',
        '/mnt/data/Restaurants_Rome.csv',
    )
)
reviews_kl, reviews_rome = (
    pd.read_csv(reviews_csv)
    for reviews_csv in (
        '/mnt/data/reviews_all_KL.csv',
        '/mnt/data/reviews_all_Rome.csv',
    )
)

# Cell output: the leading rows of every frame, keyed by name, to inspect the structure.
dict(
    restaurants_kl_head=restaurants_kl.head(),
    restaurants_rome_head=restaurants_rome.head(),
    reviews_kl_head=reviews_kl.head(),
    reviews_rome_head=reviews_rome.head(),
)


In [None]:
import matplotlib.pyplot as plt

def _parse_review_counts(counts):
    """Return the 'no_reviews' values of one city as integers.

    Text values such as '1,234' have their thousands separators removed
    before conversion. A column that is already numeric (for example because
    this cell was run before) is passed through, so the cell can be re-run
    without failing on the `.str` accessor.

    Raises:
        ValueError: if a value cannot be converted to an integer
            (including missing values).
    """
    if pd.api.types.is_numeric_dtype(counts):
        return counts.astype(int)
    return counts.str.replace(',', '', regex=False).astype(int)


# Clean the 'no_reviews' column to ensure it's in numeric format
restaurants_kl['no_reviews'] = _parse_review_counts(restaurants_kl['no_reviews'])
restaurants_rome['no_reviews'] = _parse_review_counts(restaurants_rome['no_reviews'])

# Summary statistics of the listed review totals per restaurant in each city
review_distribution_kl = restaurants_kl['no_reviews'].describe()
review_distribution_rome = restaurants_rome['no_reviews'].describe()

# Histogram of the listed review totals, one panel per city
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
restaurants_kl['no_reviews'].plot(kind='hist', bins=30, ax=ax1, title='KL Restaurant Reviews Distribution')
restaurants_rome['no_reviews'].plot(kind='hist', bins=30, ax=ax2, title='Rome Restaurant Reviews Distribution')
for ax in (ax1, ax2):
    ax.set_xlabel('Number of reviews')
plt.tight_layout()
plt.show()

# Cell output: only the two summaries. plt.show() returns None, so it is called
# on its own line above instead of being embedded in the displayed tuple.
review_distribution_kl, review_distribution_rome


In [None]:
# Reviewer diversity per city: number of distinct authors ...
unique_users_kl, unique_users_rome = (
    city_reviews['Author'].nunique() for city_reviews in (reviews_kl, reviews_rome)
)

# ... against the number of reviews that carry an author value
# (Series.count() leaves out missing entries).
total_reviews_kl, total_reviews_rome = (
    city_reviews['Author'].count() for city_reviews in (reviews_kl, reviews_rome)
)

# Share of reviews written by distinct users: a value close to 1 means most
# authors appear only once in the data.
unique_to_total_ratio_kl = unique_users_kl / total_reviews_kl
unique_to_total_ratio_rome = unique_users_rome / total_reviews_rome

# Cell output: all six figures, KL first, then Rome.
dict(
    unique_users_kl=unique_users_kl,
    total_reviews_kl=total_reviews_kl,
    unique_to_total_ratio_kl=unique_to_total_ratio_kl,
    unique_users_rome=unique_users_rome,
    total_reviews_rome=total_reviews_rome,
    unique_to_total_ratio_rome=unique_to_total_ratio_rome,
)


In [None]:
# Gauge how demanding the analysis is from review volume and spread:
# many reviews with a wide spread may point to a more complex analysis.

# Number of review rows per restaurant, computed once per city and reused below
per_restaurant_kl = reviews_kl.groupby('Restaurant').size()
per_restaurant_rome = reviews_rome.groupby('Restaurant').size()

# Summary statistics of those per-restaurant counts
review_count_stats_kl = per_restaurant_kl.describe()
review_count_stats_rome = per_restaurant_rome.describe()

# Box plots expose the spread, outliers and heavy tails of the counts
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
per_restaurant_kl.plot.box(ax=ax1, title='KL Reviews per Restaurant')
per_restaurant_rome.plot.box(ax=ax2, title='Rome Reviews per Restaurant')
plt.tight_layout()

# Cell output: both summaries; plt.show() renders the figure and contributes None.
(review_count_stats_kl, review_count_stats_rome, plt.show())


In [None]:
def _score_performance(restaurants, reviews):
    """Attach review counts, z-scores and a performance class to one city's restaurants.

    Args:
        restaurants: frame with 'Restaurant' and 'star_rating' columns.
        reviews: frame with one row per review and a 'Restaurant' column.

    Returns:
        A `(review_count, scored)` pair. `review_count` is the number of review
        rows per restaurant, named 'review_count'. `scored` is `restaurants`
        indexed by 'Restaurant' with the added columns 'review_count',
        'star_rating_z', 'review_count_z' and 'performance_class'.
    """
    review_count = reviews.groupby('Restaurant').size().rename('review_count')

    # Left join: restaurants without any review row keep a missing review_count,
    # which propagates to a missing z-score and a missing performance class.
    scored = restaurants.set_index('Restaurant').join(review_count, how='left')

    # z-scores put star ratings and review counts on a comparable scale
    for column in ('star_rating', 'review_count'):
        scored[f'{column}_z'] = (scored[column] - scored[column].mean()) / scored[column].std()

    # Combined score above 0 -> 'Higher'; a score of exactly 0 falls into 'Lower'
    # because pd.cut bins are closed on the right by default.
    scored['performance_class'] = pd.cut(
        scored['star_rating_z'] + scored['review_count_z'],
        bins=[-float('inf'), 0, float('inf')],
        labels=['Lower', 'Higher'],
    )
    return review_count, scored


def _plot_performance(scored, ax, title):
    """Scatter the two z-scores of one city, coloured by performance class."""
    # Rows without a class have a missing coordinate and cannot be drawn, but
    # their category code (-1) would still stretch the colour scale. Dropping
    # them and pinning vmin/vmax keeps 'Lower' and 'Higher' on the same colours
    # in both panels.
    classified = scored.dropna(subset=['performance_class'])
    classified.plot.scatter(
        x='star_rating_z',
        y='review_count_z',
        c=classified['performance_class'].cat.codes,
        colormap='viridis',
        vmin=0,
        vmax=1,
        ax=ax,
        title=title,
    )


# Review counts per restaurant and the scored restaurant table for each city
review_count_kl, kl_data = _score_performance(restaurants_kl, reviews_kl)
review_count_rome, rome_data = _score_performance(restaurants_rome, reviews_rome)

# Plotting the classification
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
_plot_performance(kl_data, ax1, 'KL Performance Classification')
_plot_performance(rome_data, ax2, 'Rome Performance Classification')
plt.tight_layout()

# Proportion of classified restaurants that fall into the 'Higher' class
high_perf_kl = kl_data['performance_class'].value_counts(normalize=True)['Higher']
high_perf_rome = rome_data['performance_class'].value_counts(normalize=True)['Higher']

plt.show()

# Cell output: only the two proportions. plt.show() returns None, so it is
# called on its own line above instead of being embedded in the displayed tuple.
high_perf_kl, high_perf_rome


In [None]:
# Parse the review dates before filtering. read_csv was called without date
# parsing, and resample() only works on a datetime index. Values that cannot
# be parsed become NaT and are dropped together with missing dates.
# NOTE(review): the date format is inferred by pandas -- confirm it parses as expected.
filtered_reviews_kl = (
    reviews_kl
    .assign(Dates=pd.to_datetime(reviews_kl['Dates'], errors='coerce'))
    .dropna(subset=['Dates'])
)
filtered_reviews_rome = (
    reviews_rome
    .assign(Dates=pd.to_datetime(reviews_rome['Dates'], errors='coerce'))
    .dropna(subset=['Dates'])
)

# Time-series analysis: count reviews per calendar month. The offset object is
# the month-end frequency that the 'M' alias stood for; newer pandas versions
# deprecate that alias, the object form works across versions.
month_end = pd.offsets.MonthEnd()
time_series_kl = filtered_reviews_kl.set_index('Dates').resample(month_end).size()
time_series_rome = filtered_reviews_rome.set_index('Dates').resample(month_end).size()

# Plotting time-series of review counts
plt.figure(figsize=(14, 6))
plt.plot(time_series_kl.index, time_series_kl.values, label='KL')
plt.plot(time_series_rome.index, time_series_rome.values, label='Rome')
plt.title('Time-Series Analysis of Reviews')
plt.xlabel('Date')
plt.ylabel('Number of Reviews')
plt.legend()
plt.tight_layout()
plt.show()

# Basic statistics for the restaurant data of KL and Rome, side by side.
# NOTE(review): `restaurant_comparison` was referenced here without being
# defined anywhere in this notebook; it is rebuilt from describe() of both
# restaurant tables -- confirm this matches the intended comparison.
restaurant_comparison = pd.concat(
    {'KL': restaurants_kl.describe(), 'Rome': restaurants_rome.describe()},
    axis=1,
)
restaurant_comparison


In [None]:
# Analyze the cuisine type / cuisine type number differences between KL and Rome

def _cuisine_labels(restaurants):
    """Return one cleaned cuisine label per (restaurant, cuisine) pair.

    The comma-separated 'cuisine' column is split and exploded, surrounding
    whitespace is stripped, and missing or empty labels are removed so they
    are not counted as a cuisine type of their own.
    """
    labels = restaurants['cuisine'].str.split(',').explode().str.strip()
    return labels[labels.notna() & (labels != '')]


# One cuisine label per restaurant/cuisine pair, computed once per city
kl_cuisine_labels = _cuisine_labels(restaurants_kl)
rome_cuisine_labels = _cuisine_labels(restaurants_rome)

# Distinct cuisine types for each city
kl_cuisines = kl_cuisine_labels.unique()
rome_cuisines = rome_cuisine_labels.unique()

# Count the number of different cuisine types
kl_cuisine_count = len(kl_cuisines)
rome_cuisine_count = len(rome_cuisines)

# Occurrences of every cuisine per city; a cuisine absent from one city is
# reported as 0 instead of a missing value.
cuisine_comparison = (
    pd.DataFrame({
        'KL': kl_cuisine_labels.value_counts(),
        'Rome': rome_cuisine_labels.value_counts(),
    })
    .fillna(0)
    .astype(int)
)

# Plot the cuisine type comparison
ax = cuisine_comparison.plot(kind='bar', figsize=(14, 6), title='Cuisine Type Comparison between KL and Rome')
ax.set_xlabel('Cuisine')
ax.set_ylabel('Number of Restaurants')
plt.tight_layout()
plt.show()

# Cell output: only the two counts. plt.show() returns None, so it is called
# on its own line above instead of being embedded in the displayed tuple.
kl_cuisine_count, rome_cuisine_count
