# **Analyzing Brazilian E-Commerce Using Clustering (K-means)**
Dataset: https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Notebook-wide plotting defaults: ggplot theme and a 12 x 8 default figure size.
plt.style.use("ggplot")
plt.rcParams.update({"figure.figsize": (12, 8)})

In [None]:
# Load the working table from disk and preview its first rows.
# NOTE(review): the file is named like the Olist customers table, but the preview
# shows merged order/payment/product/review columns — confirm the source file.
df = pd.read_csv(filepath_or_buffer='olist_customers_dataset.csv')
df.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,payment_sequential,payment_type,...,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english,review_score,review_comment_message
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,1,credit_card,...,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares,4,"Não testei o produto ainda, mas ele veio corre..."
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,3,voucher,...,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares,4,"Não testei o produto ainda, mas ele veio corre..."
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,2,voucher,...,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares,4,"Não testei o produto ainda, mas ele veio corre..."
3,128e10d95713541c87cd1a2e48201934,a20e8105f23924cd00833fd87daa0831,delivered,2017-08-15 18:29:31,2017-08-15 20:05:16,2017-08-17 15:28:33,2017-08-18 14:44:43,2017-08-28 00:00:00,1,credit_card,...,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares,4,Deveriam embalar melhor o produto. A caixa vei...
4,0e7e841ddf8f8f2de2bad69267ecfbcf,26c7ac168e1433912a51b924fbd34d34,delivered,2017-08-02 18:24:47,2017-08-02 18:43:15,2017-08-04 17:35:43,2017-08-07 18:30:01,2017-08-15 00:00:00,1,credit_card,...,40.0,268.0,4.0,500.0,19.0,8.0,13.0,housewares,5,"Só achei ela pequena pra seis xícaras ,mais é ..."


In [None]:
# Print a per-column summary (dtype and non-null count) of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115609 entries, 0 to 115608
Data columns (total 33 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   order_id                       115609 non-null  object 
 1   customer_id                    115609 non-null  object 
 2   order_status                   115609 non-null  object 
 3   order_purchase_timestamp       115609 non-null  object 
 4   order_approved_at              115595 non-null  object 
 5   order_delivered_carrier_date   114414 non-null  object 
 6   order_delivered_customer_date  113209 non-null  object 
 7   order_estimated_delivery_date  115609 non-null  object 
 8   payment_sequential             115609 non-null  int64  
 9   payment_type                   115609 non-null  object 
 10  payment_installments           115609 non-null  int64  
 11  payment_value                  115609 non-null  float64
 12  customer_unique_id            

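**Note: run the analysis sections below one at a time. Every section is commented out; uncomment only the one you want to run, and reload `df` before switching sections, because several of them overwrite it.**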

# **Customer Segmentation**

version 1

In [None]:
# # Select relevant features for segmentation
# features = ['customer_id', 'order_purchase_timestamp', 'payment_value', 'review_score']

# # Filter the dataframe to include only specified features and drop rows with missing values
# df = df[features].dropna()

# # Convert 'order_purchase_timestamp' to datetime
# df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])

# # Calculate recency (days since last purchase)
# snapshot_date = df['order_purchase_timestamp'].max() + pd.DateOffset(days=1)
# df['recency'] = (snapshot_date - df['order_purchase_timestamp']).dt.days

# # Group by 'customer_id' and aggregate to get RFM (Recency, Frequency, Monetary) metrics
# rfm_df = df.groupby('customer_id').agg({
#     'recency': 'min',                   # Recency: days since last purchase (lower is better)
#     'order_purchase_timestamp': 'count', # Frequency: number of orders
#     'payment_value': 'sum',             # Monetary: total amount spent
#     'review_score': 'mean'              # Average review score
# }).reset_index()

# # Standardize the variables before clustering
# scaler = StandardScaler()
# rfm_scaled = scaler.fit_transform(rfm_df[['recency', 'order_purchase_timestamp', 'payment_value', 'review_score']])

In [None]:
# # Apply K-means clustering
# kmeans = KMeans(n_clusters=7, random_state=4200)
# rfm_df['cluster'] = kmeans.fit_predict(rfm_scaled)
# sample_df = rfm_df.sample(n=1000, random_state=42)

# # Visualize the clusters
# plt.figure(figsize=(10, 6))
# sns.scatterplot(x='recency', y='payment_value', hue='cluster', style='cluster', data=sample_df, palette='Set1', s=50, alpha=0.8)
# plt.title('Customer Segments based on RFM')
# plt.xlabel('Recency (days since last purchase)')
# plt.ylabel('Total Payment Value')
# plt.legend(title='Cluster')
# plt.grid(True)
# plt.tight_layout()
# plt.ylim(0, 100)
# # Explore cluster characteristics
# cluster_summary = rfm_df.groupby('cluster').agg({
#     'recency': 'mean',
#     'order_purchase_timestamp': 'mean',
#     'payment_value': 'mean',
#     'review_score': 'mean',
#     'customer_id': 'size'
# }).rename(columns={'customer_id': 'count'}).reset_index()

# print(cluster_summary)

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
# from sklearn.metrics import silhouette_score
# from scipy import stats
# import numpy as np

# # Load your dataset or use sample data
# # Assuming you have already loaded and preprocessed your data similar to the previous examples

# # Select relevant features for segmentation
# features = ['customer_id', 'order_purchase_timestamp', 'payment_value', 'review_score']

# # Filter the dataframe to include only specified features and drop rows with missing values
# df = df[features].dropna()

# # Convert 'order_purchase_timestamp' to datetime
# df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])

# # Calculate recency (days since last purchase)
# snapshot_date = df['order_purchase_timestamp'].max() + pd.DateOffset(days=1)
# df['recency'] = (snapshot_date - df['order_purchase_timestamp']).dt.days

# # Group by 'customer_id' and aggregate to get RFM (Recency, Frequency, Monetary) metrics
# rfm_df = df.groupby('customer_id').agg({
#     'recency': 'min',                   # Recency: days since last purchase (lower is better)
#     'order_purchase_timestamp': 'count', # Frequency: number of orders
#     'payment_value': 'sum',             # Monetary: total amount spent
#     'review_score': 'mean'              # Average review score
# }).reset_index()

# # Outlier removal using z-score
# numerical_cols = ['recency', 'order_purchase_timestamp', 'payment_value', 'review_score']
# z_scores = np.abs(stats.zscore(rfm_df[numerical_cols]))
# rfm_df_clean = rfm_df[(z_scores < 3).all(axis=1)]

# # Standardize the variables before clustering
# scaler = StandardScaler()
# rfm_scaled = scaler.fit_transform(rfm_df_clean[numerical_cols])

# # Apply PCA to reduce dimensionality
# pca = PCA(n_components=2, random_state=42)
# rfm_pca = pca.fit_transform(rfm_scaled)

# # Evaluate silhouette score for different values of k
# silhouette_scores = []
# for k in range(2, 11):
#     kmeans = KMeans(n_clusters=k, random_state=42)
#     cluster_labels = kmeans.fit_predict(rfm_pca)
#     silhouette_avg = silhouette_score(rfm_pca, cluster_labels)
#     silhouette_scores.append(silhouette_avg)
#     print(f"For n_clusters = {k}, the average silhouette score is : {silhouette_avg}")

# # Plot the silhouette scores
# plt.figure(figsize=(10, 6))
# plt.plot(range(2, 11), silhouette_scores, marker='o', linestyle='-', color='b')
# plt.xlabel('Number of Clusters (k)')
# plt.ylabel('Silhouette Score')
# plt.title('Silhouette Score for Optimal k')
# plt.grid(True)
# plt.tight_layout()
# plt.show()


version 2

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
# from scipy import stats

# # Select relevant features for segmentation
# features = ['customer_id', 'order_purchase_timestamp', 'payment_value', 'review_score']

# # Filter the dataframe to include only specified features and drop rows with missing values
# df = df[features].dropna()

# # Convert 'order_purchase_timestamp' to datetime
# df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])

# # Calculate recency (days since last purchase)
# snapshot_date = df['order_purchase_timestamp'].max() + pd.DateOffset(days=1)
# df['recency'] = (snapshot_date - df['order_purchase_timestamp']).dt.days

# # Group by 'customer_id' and aggregate to get RFM (Recency, Frequency, Monetary) metrics
# rfm_df = df.groupby('customer_id').agg({
#     'recency': 'min',                   # Recency: days since last purchase (lower is better)
#     'order_purchase_timestamp': 'count', # Frequency: number of orders
#     'payment_value': 'sum',             # Monetary: total amount spent
#     'review_score': 'mean'              # Average review score
# }).reset_index()

# # Outlier removal using z-score: keep only customers whose every metric
# # lies within 3 standard deviations of the column mean.
# numerical_cols = ['recency', 'order_purchase_timestamp', 'payment_value', 'review_score']
# z_scores = np.abs(stats.zscore(rfm_df[numerical_cols]))
# # .copy() makes the filtered frame independent of rfm_df, so the later
# # rfm_df_clean['cluster'] = ... assignment does not write to a slice
# # (avoids pandas' SettingWithCopyWarning).
# rfm_df_clean = rfm_df[(z_scores < 3).all(axis=1)].copy()

# # Standardize the variables before clustering
# scaler = StandardScaler()
# rfm_scaled = scaler.fit_transform(rfm_df_clean[numerical_cols])

# # Apply PCA to reduce dimensionality
# pca = PCA(n_components=2, random_state=42)
# rfm_pca = pca.fit_transform(rfm_scaled)

# # Determine the optimal number of clusters using the Elbow method or silhouette score
# # Here, we use the Elbow method to find the optimal k
# inertia = []
# for k in range(1, 11):
#     kmeans = KMeans(n_clusters=k, random_state=42)
#     kmeans.fit(rfm_pca)
#     inertia.append(kmeans.inertia_)

# # Plot the Elbow curve to identify the optimal k
# plt.figure(figsize=(8, 5))
# plt.plot(range(1, 11), inertia, marker='o', linestyle='-', color='b')
# plt.xlabel('Number of Clusters (k)')
# plt.ylabel('Inertia')
# plt.title('Elbow Method for Optimal k')
# plt.grid(True)
# plt.tight_layout()
# plt.show()

# # Based on the Elbow curve, choose the optimal number of clusters
# optimal_k = 3  # Example: Based on visual inspection of the elbow plot

# # Apply K-means clustering with the optimal k
# kmeans = KMeans(n_clusters=optimal_k, random_state=42)
# rfm_df_clean['cluster'] = kmeans.fit_predict(rfm_pca)

# # Visualize the clusters
# plt.figure(figsize=(10, 6))
# sns.scatterplot(x=rfm_pca[:, 0], y=rfm_pca[:, 1], hue=rfm_df_clean['cluster'], palette='Set1', s=25, alpha=0.8)
# plt.title('Customer Segments based on RFM (PCA) - Outlier Removed')
# plt.xlabel('PCA Component 1')
# plt.ylabel('PCA Component 2')
# plt.legend(title='Cluster')
# plt.grid(True)
# plt.tight_layout()
# plt.show()

# # Explore cluster characteristics
# cluster_summary = rfm_df_clean.groupby('cluster').agg({
#     'recency': 'mean',
#     'order_purchase_timestamp': 'mean',
#     'payment_value': 'mean',
#     'review_score': 'mean',
#     'customer_id': 'size'
# }).rename(columns={'customer_id': 'count'}).reset_index()

# print(cluster_summary)


# **Order Processing Efficiency Analysis**

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.cluster import KMeans
# import matplotlib.colors as mcolors

# # Example data manipulation and visualization
# # Assuming df contains the dataset with columns like 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_customer_date'

# # Convert relevant columns to datetime if they are not already
# df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
# df['order_approved_at'] = pd.to_datetime(df['order_approved_at'])
# df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# # Calculate time differences in hours
# df['time_to_approval'] = (df['order_approved_at'] - df['order_purchase_timestamp']).dt.total_seconds() / 3600  # in hours
# df['time_to_delivery'] = (df['order_delivered_customer_date'] - df['order_approved_at']).dt.total_seconds() / 3600  # in hours

# # Prepare data for clustering: keep only rows where both durations are known.
# # The original row labels are kept on purpose (no reset_index) so the cluster
# # labels can be written back to exactly the rows they were computed from.
# X = df[['time_to_approval', 'time_to_delivery']].dropna()

# # Determine the number of clusters based on business insights or analysis
# n_clusters = 4  # Example: Choose the number of clusters based on analysis

# # Apply K-means clustering
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# cluster_labels = kmeans.fit_predict(X)

# # Assign cluster labels back to the rows that were clustered. Rows dropped
# # above (missing approval or delivery timestamp) keep NaN in 'cluster'.
# df.loc[X.index, 'cluster'] = cluster_labels

# # Define a dynamic color palette based on n_clusters
# colors = sns.color_palette('husl', n_colors=n_clusters)  # Using seaborn's 'husl' palette

# plt.figure(figsize=(16, 6))

# # Histogram of Time to Approval
# plt.subplot(1, 2, 1)
# for i, cluster_label in enumerate(sorted(df['cluster'].unique())):
#     sns.histplot(df[df['cluster'] == cluster_label]['time_to_approval'], bins=20, kde=False, label=f'Cluster {cluster_label}', color=colors[i % n_clusters])
# plt.xlabel('Time to Approval (hours)')
# plt.ylabel('Frequency')
# plt.title('Distribution of Time to Approval by Cluster')
# plt.legend()
# plt.grid(True)

# # Histogram of Time to Delivery
# plt.subplot(1, 2, 2)
# for i, cluster_label in enumerate(sorted(df['cluster'].unique())):
#     sns.histplot(df[df['cluster'] == cluster_label]['time_to_delivery'], bins=20, kde=False, label=f'Cluster {cluster_label}', color=colors[i % n_clusters])
# plt.xlabel('Time to Delivery (hours)')
# plt.ylabel('Frequency')
# plt.title('Distribution of Time to Delivery by Cluster')
# plt.legend()
# plt.grid(True)

# plt.tight_layout()
# plt.show()

# # Display the clustered groups
# print(df.sort_values('cluster'))


# **Product Performance and Customer Satisfaction Analysis**

In [None]:
# from sklearn.cluster import KMeans
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Calculate average review score by product category
# product_reviews = df.groupby('product_category_name_english')['review_score'].mean().reset_index()

# # Sort products by average review score
# product_reviews = product_reviews.sort_values(by='review_score', ascending=False)

# # Prepare data for clustering
# X = product_reviews[['review_score']]

# # Determine the number of clusters based on business insights or analysis
# n_clusters = 4  # Example: Choose the number of clusters based on analysis

# # Apply K-means clustering
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# product_reviews['cluster'] = kmeans.fit_predict(X)

# # Visualize clusters using scatter plot
# plt.figure(figsize=(12, 8))
# sns.scatterplot(x='product_category_name_english', y='review_score', hue='cluster', data=product_reviews, palette='viridis', s=100)
# plt.xlabel('Product Category')
# plt.ylabel('Average Review Score')
# plt.title('Average Review Score by Product Category with Clusters')
# plt.xticks(rotation=90)
# plt.legend(title='Cluster')
# plt.tight_layout()

# plt.show()

# # Display the clustered groups
# print(product_reviews.sort_values('cluster'))


In [None]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.cluster import KMeans
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# # Assuming df contains the dataset with a column 'review_comment_message'

# # Filter out rows where review_comment_message is null or empty
# df = df.dropna(subset=['review_comment_message']).reset_index(drop=True)

# # Use TF-IDF to convert review comments into numerical features.
# # The review comments are Portuguese, so scikit-learn's built-in 'english'
# # stop-word list does not remove their function words; pass an explicit
# # Portuguese list instead.
# # NOTE(review): short hand-picked list — extend it or swap in a curated one.
# portuguese_stop_words = [
#     'de', 'do', 'da', 'dos', 'das', 'em', 'no', 'na', 'nos', 'nas',
#     'um', 'uma', 'os', 'as', 'ao', 'que', 'se', 'por', 'para', 'pra',
#     'com', 'como', 'mas', 'ou', 'eu', 'ele', 'ela', 'me', 'meu', 'minha',
# ]
# vectorizer = TfidfVectorizer(max_features=1000, max_df=0.8, stop_words=portuguese_stop_words)
# X = vectorizer.fit_transform(df['review_comment_message'])

# # Apply PCA to reduce dimensions for visualization (optional)
# pca = PCA(n_components=2)
# X_pca = pca.fit_transform(X.toarray())

# # Determine the number of clusters based on analysis or exploration
# n_clusters = 4  # Example: Choose the number of clusters

# # Apply K-means clustering
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# df['cluster'] = kmeans.fit_predict(X)

# # Visualize clusters (using PCA components)
# plt.figure(figsize=(10, 6))
# scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df['cluster'], cmap='viridis', alpha=0.7, s=20)
# plt.title('Clustering Review Comments')
# plt.colorbar(scatter, label='Cluster')
# plt.tight_layout()
# plt.show()

# # Display cluster centers (if needed)
# print("Cluster Centers:")
# print(kmeans.cluster_centers_)

# # Display top terms per cluster (if needed)
# terms = vectorizer.get_feature_names_out()
# order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# for i in range(n_clusters):
#     print(f"Cluster {i} top terms:", end='')
#     for ind in order_centroids[i, :10]:
#         print(f' {terms[ind]}', end='')
#     print()

# # Analyze cluster distribution
# cluster_counts = df['cluster'].value_counts().sort_index()
# print("\nCluster Distribution:")
# print(cluster_counts)


# **Payment Methods and Preferences Analysis**

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import StandardScaler

# # Assuming df contains your dataset
# # Example data manipulation and visualization

# # Select relevant features for segmentation
# features = ['customer_id', 'order_purchase_timestamp', 'payment_value', 'review_score', 'product_photos_qty', 'review_comment_message', 'payment_type']

# # Filter the dataframe to include only specified features and drop rows with missing values
# df = df[features].dropna()

# # Convert 'order_purchase_timestamp' to datetime object
# df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])

# # Select numerical columns for scaling and clustering
# numerical_cols = ['payment_value', 'review_score', 'product_photos_qty']

# # Scale the numerical columns
# scaler = StandardScaler()
# df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# # Apply K-means clustering
# n_clusters = 5  # Number of clusters
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# df['cluster'] = kmeans.fit_predict(df[numerical_cols])

# # Example: Analyzing payment methods and preferences
# payment_counts = df['payment_type'].value_counts()

# plt.figure(figsize=(8, 6))
# plt.bar(payment_counts.index, payment_counts.values, color='skyblue')
# plt.xlabel('Payment Method')
# plt.ylabel('Number of Transactions')
# plt.title('Distribution of Payment Methods')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

# # Analyze payment preferences by cluster
# payment_cluster = df.groupby('cluster')['payment_type'].value_counts(normalize=True).unstack()
# payment_cluster.fillna(0, inplace=True)  # Replace NaNs with 0 assuming no transactions means 0 usage

# print("Payment Preferences by Cluster:")
# print(payment_cluster)


# **Product Category Insights**

In [None]:
# # Select relevant features including 'product_category_name_english'
# features = ['customer_id', 'order_purchase_timestamp', 'payment_value', 'review_score', 'product_photos_qty', 'review_comment_message', 'product_category_name_english']

# # Filter the dataframe to include only specified features and drop rows with missing values
# df = df[features].dropna()

# # Standardize the numerical features so that no single column dominates the
# # distances K-means works with. The scaled values go into a separate array,
# # so the original columns in df are left untouched.
# numerical_cols = ['payment_value', 'review_score', 'product_photos_qty']
# scaler = StandardScaler()
# scaled_features = scaler.fit_transform(df[numerical_cols])

# # Apply K-means clustering on the standardized features
# kmeans = KMeans(n_clusters=5, random_state=42)
# df['cluster'] = kmeans.fit_predict(scaled_features)

# # Example: Analyzing product category insights
# category_counts = df['product_category_name_english'].value_counts().nlargest(10)

# plt.figure(figsize=(10, 6))
# plt.bar(category_counts.index, category_counts.values, color='lightgreen')
# plt.xlabel('Product Category')
# plt.ylabel('Number of Orders')
# plt.title('Top 10 Product Categories by Order Volume')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()

# # Analyze product category preferences by cluster
# category_cluster = df.groupby('cluster')['product_category_name_english'].value_counts(normalize=True).unstack()
# print("Product Category Preferences by Cluster:")
# print(category_cluster)

# # Iterate through each cluster and create scatter plots
# for cluster in category_cluster.index:
#     plt.figure(figsize=(14, 6))  # Adjust size as needed
#     plt.scatter(category_cluster.columns, category_cluster.loc[cluster], label=f'Cluster {cluster}', s=20)
#     plt.title(f'Product Category Preferences - Cluster {cluster}')
#     plt.xlabel('Product Category')
#     plt.ylabel('Preference (Normalized)')
#     plt.xticks(rotation=45, ha='right')
#     plt.legend()
#     plt.tight_layout()


# **Predictive Analytics and Forecasting**

In [None]:


# # Select features and target variable
# features = ['product_name_lenght', 'product_description_lenght', 'product_photos_qty',
#             'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm']
# target = 'review_score'

# # Filter the dataframe to include only specified features and drop rows with missing values
# df_filtered = df[features + [target]].dropna()

# # Split data into training and testing sets
# X = df_filtered[features]
# y = df_filtered[target]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Fit linear regression model
# model = LinearRegression()
# model.fit(X_train, y_train)

# # Predict on test set
# y_pred = model.predict(X_test)

# # Evaluate model performance
# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error: {mse}")

# # Example of predicting review_score based on new data
# new_data = pd.DataFrame({
#     'product_name_lenght': [50, 60],
#     'product_description_lenght': [200, 300],
#     'product_photos_qty': [5, 8],
#     'product_weight_g': [1500, 2000],
#     'product_length_cm': [30, 40],
#     'product_height_cm': [20, 25],
#     'product_width_cm': [15, 20]
# })
# predicted_review_scores = model.predict(new_data)
# print("Predicted Review Scores:")
# print(predicted_review_scores)

Mean Squared Error: 1.8928665443421595
Predicted Review Scores:
[4.05324131 4.07414129]


# **Geographical Analysis**

In [None]:
# # Calculate the length of customer city names
# df['city_length'] = df['customer_city'].apply(len)

# # Group by city name lengths
# city_length_groups = df.groupby('city_length').size().reset_index(name='count')

# # Use city name lengths as features for clustering
# X = city_length_groups[['city_length']]

# # Choose a small number of clusters. Using one cluster per row
# # (n_clusters == number of rows) would put every city-name length in its own
# # cluster and group nothing. The cap keeps K-means valid when there are
# # fewer rows than the requested number of clusters.
# n_clusters = min(4, len(city_length_groups))  # Example: tune based on analysis

# # Apply K-means clustering
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# city_length_groups['cluster'] = kmeans.fit_predict(X)

# # Plotting clusters
# plt.figure(figsize=(8, 6))
# plt.scatter(city_length_groups['city_length'], city_length_groups['count'], c=city_length_groups['cluster'], cmap='viridis', s=100)
# plt.xlabel('City Name Length')
# plt.ylabel('Number of Cities')
# plt.title('Clustering City Name Lengths')
# plt.colorbar(label='Cluster')
# plt.tight_layout()
# plt.show()

# # Display the clustered groups
# print(city_length_groups)

In [None]:
# # Calculate total payment value by state
# state_payment = df.groupby('customer_state')['payment_value'].sum().reset_index()

# # Display each state with its total payment value
# print("Total Payment Value by State:")
# print(state_payment)

# # Prepare data for clustering
# X = state_payment[['payment_value']]

# # Determine the number of clusters based on business insights or analysis
# n_clusters = 6  # Example: Choose the number of clusters based on analysis

# # Apply K-means clustering
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# state_payment['cluster'] = kmeans.fit_predict(X)

# # Visualize clusters
# plt.figure(figsize=(10, 6))
# for cluster in state_payment['cluster'].unique():
#     cluster_data = state_payment[state_payment['cluster'] == cluster]
#     plt.scatter(cluster_data['customer_state'], cluster_data['payment_value'], label=f'Cluster {cluster}')

# plt.xlabel('Customer State')
# plt.ylabel('Total Payment Value')
# plt.title('Clustering States by Total Payment Value')
# plt.xticks(rotation=45)
# plt.legend()
# plt.tight_layout()
# plt.show()

# # Display the clustered groups
# print("\nClustered Groups:")
# print(state_payment.sort_values('cluster'))

Analyzed by regani_a