## Import the necessary libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import seaborn as sns

## Load the dataset

In [None]:
# Single place to change when the dataset lives somewhere else; every file
# below is read from this directory.
DATA_DIR = '/content/drive/MyDrive/Colab-Notebooks/Data-Science/baseline/dataset'

# Per-transaction records for the training and test clients.
transactions_train = pd.read_csv(f'{DATA_DIR}/transactions_train.csv')
transactions_test = pd.read_csv(f'{DATA_DIR}/transactions_test.csv')

# Target table for the training clients (client_id plus the `bins` label).
train_target = pd.read_csv(f'{DATA_DIR}/train_target.csv')

# Contents of test.csv; presumably the list of test client ids -- TODO confirm.
test_id = pd.read_csv(f'{DATA_DIR}/test.csv')

Let's look at the data.

In [None]:
# Show the first five transaction rows to check the column layout.
transactions_train.head(n=5)

* **client_id** is a unique identifier of the client

* **trans_date** is the date of the transaction

* **small_group** is the purchase category

* **amount_rur** is the transaction amount


In [None]:
# Show the first five target rows (head() defaults to five).
train_target.head()

* **client_id** is the unique identifier of the client, corresponding to the
client_id field of the transactions
* **bins** is the target variable to predict: the customer's age category

## Calculate simple aggregate features for each client

Here, we aggregate transaction data to create summary statistics for each client. This involves calculating the **sum, mean, standard deviation, minimum, and maximum** of transaction amounts for each client.

In [None]:
# Summary statistics computed over each client's transaction amounts.
AGG_STATS = ['sum', 'mean', 'std', 'min', 'max']


def aggregate_amounts(transactions: pd.DataFrame) -> pd.DataFrame:
    """Return one row per client with summary statistics of `amount_rur`.

    Parameters
    ----------
    transactions : DataFrame with at least `client_id` and `amount_rur` columns.

    Returns
    -------
    DataFrame with columns `client_id`, `sum`, `mean`, `std`, `min`, `max`.
    """
    stats = (
        transactions
        .groupby('client_id')['amount_rur']
        .agg(AGG_STATS)
        .reset_index()
    )
    # The sample standard deviation is undefined (NaN) for a client with a
    # single transaction. StandardScaler lets NaN through and KMeans then
    # refuses the input, so treat "no observable spread" as zero spread.
    stats['std'] = stats['std'].fillna(0.0)
    return stats


agg_features_train = aggregate_amounts(transactions_train)
agg_features_test = aggregate_amounts(transactions_test)

## Calculate the number of transactions for each category for each client

We count the number of transactions each client has in each purchase category (small_group). This helps in understanding the spending patterns across different categories.


In [None]:
def _pivot_category_counts(pair_counts):
    """Turn a (client_id, small_group) -> count Series into a wide table.

    The result is indexed by client_id, has one `small_group_<id>` column per
    category, and holds 0 where a client has no transactions in a category.
    """
    wide = (
        pair_counts
        .reset_index()
        .pivot(index='client_id', columns='small_group', values='amount_rur')
        .fillna(0)
    )
    wide.columns = ['small_group_' + str(group) for group in wide.columns]
    return wide


# Number of non-null amounts per (client, category) pair.
counter_df_train = transactions_train.groupby(['client_id', 'small_group'])['amount_rur'].count()
counter_df_test = transactions_test.groupby(['client_id', 'small_group'])['amount_rur'].count()

# One row per client, one column per purchase category.
cat_counts_train = _pivot_category_counts(counter_df_train)
cat_counts_test = _pivot_category_counts(counter_df_test)

## Merge all the files into a single dataframe
We merge the aggregated features with the transaction counts to create a single DataFrame for training and testing.

In [None]:
# Join the per-client amount statistics with the per-category counts.
# The count tables keep client_id in their index, so reset_index() turns it
# back into a column for the merge key. pd.merge defaults to an inner join,
# so only clients present in both tables are kept.
train = pd.merge(agg_features_train, cat_counts_train.reset_index(), on='client_id')
test = pd.merge(agg_features_test, cat_counts_test.reset_index(), on='client_id')

The test set lacks some of the expense categories that appear in train, so to keep the feature spaces consistent we restrict both to the features they have in common.

In [None]:
# Keep only the columns present in both frames, in train's column order.
# Building the list from a set intersection would make the feature order
# depend on string hashing, which changes between kernel sessions.
test_columns = set(test.columns)
common_features = [
    col for col in train.columns
    # 'cluster' is written to train/test by the clustering cell; skip it so
    # that re-running this cell does not feed old labels back in as a feature.
    if col in test_columns and col != 'cluster'
]
X_train = train[common_features]
X_test = test[common_features]

## Normalize the features

We standardize the features so that they have a mean of 0 and a standard deviation of 1. This helps in improving the performance of clustering algorithms.

In [None]:
# client_id is an identifier rather than a feature, so drop it before scaling.
train_matrix = X_train.drop(columns=['client_id'])
test_matrix = X_test.drop(columns=['client_id'])

# Learn the per-column mean and standard deviation on train only, then apply
# the same transformation to both sets.
scaler = StandardScaler()
X_train_scaled = scaler.fit(train_matrix).transform(train_matrix)
X_test_scaled = scaler.transform(test_matrix)

## Perform K-Means clustering
We apply the K-Means clustering algorithm to group clients based on their transaction features.

In [None]:
# n_init is set explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning in 1.2-1.3), so leaving it implicit
# gives different clusterings depending on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Fit on the scaled training features and label every training client, then
# assign each test client to the nearest learned centroid.
train['cluster'] = kmeans.fit_predict(X_train_scaled)
test['cluster'] = kmeans.predict(X_test_scaled)

## Visualize the clusters
We visualize the clusters to understand how the clients are grouped. This helps in analyzing the characteristics of each cluster.

In [None]:
# Plot only the five amount statistics. Without `vars`, pairplot draws one
# panel for every pair of numeric columns in `train`, which would include
# client_id and every small_group_* count column: an unreadable grid that is
# very slow and memory-hungry to render.
PAIRPLOT_FEATURES = ['sum', 'mean', 'std', 'min', 'max']

sns.pairplot(train, vars=PAIRPLOT_FEATURES, hue='cluster', diag_kind='kde')
plt.show()

## Display the cluster assignments
We assign clusters to both the training and test datasets and prepare the results for submission.
And then we prepare the file to be sent to the system.
This final step includes visualizing the distribution of clusters and saving the results to a CSV file.

In [None]:
# Print the first few (client_id, cluster) pairs for each dataset.
for heading, frame in (
    ("Cluster assignments for training data:", train),
    ("Cluster assignments for test data:", test),
):
    print(heading)
    print(frame[['client_id', 'cluster']].head())

In [None]:
# The submission holds just the client id and its assigned cluster.
submission = test[['client_id', 'cluster']].copy()

# Histogram of the cluster labels across the test clients, with the top and
# right borders hidden.
ax = submission['cluster'].plot.hist(bins=20, title='Cluster Distribution')
ax.spines[['top', 'right']].set_visible(False)
plt.show()

In [None]:
# Write the test-set cluster assignments to a CSV file without the index column.
output_path = 'clustered_clients.csv'
submission.to_csv(output_path, index=False)