# Imports

In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

# Load Dataset 

In [2]:
# Read the cleaned app dataset (CSV in the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='cleaned_dataset.csv')

# Select relevant categorical columns for pattern mining

In [3]:
# Columns whose values are turned into "column=value" items for pattern mining.
columns_of_interest = [
    'Category',
    'Free',
    'Content Rating',
    'Ad Supported',
    'In app purchases',
    'Editor Choice',
]

# Create a list of transactions

In [4]:
# Build one transaction per row of `data`: a list of "column=value" strings,
# one per entry in `columns_of_interest`, in that order.
# itertuples over only the selected columns yields plain tuples, avoiding the
# per-row Series that iterrows constructs (the dominant cost on large frames).
transactions = [
    [f"{col}={value}" for col, value in zip(columns_of_interest, row_values)]
    for row_values in data[columns_of_interest].itertuples(index=False, name=None)
]

# Use TransactionEncoder to transform the list of transactions into a one-hot encoded DataFrame

In [5]:
# Fit the encoder to learn the distinct items, then encode every transaction
# as one row with one column per item.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
df = pd.DataFrame(data=te_ary, columns=te.columns_)

# Preview the encoded frame
print("Transformed Data:")
print(df.head())

Transformed Data:
   Ad Supported=True  Category=Action  Category=Adventure  Category=Arcade  \
0               True            False               False            False   
1               True            False               False            False   
2               True            False               False            False   
3               True            False               False            False   
4               True            False               False            False   

   Category=Art & Design  Category=Auto & Vehicles  Category=Beauty  \
0                  False                     False            False   
1                  False                     False            False   
2                  False                     False            False   
3                  False                     False            False   
4                  False                     False            False   

   Category=Board  Category=Books & Reference  Category=Business  ...  \
0           F

# Apply the Apriori algorithm to find frequent itemsets

In [6]:
# Mine itemsets with support of at least 0.7 in the one-hot frame;
# use_colnames=True reports items by column label instead of column index.
frequent_itemsets = apriori(
    df,
    min_support=0.7,
    use_colnames=True,
)

# Show every itemset that met the support threshold
print("Frequent Itemsets:")
print(frequent_itemsets)

Frequent Itemsets:
     support                                           itemsets
0   1.000000                                (Ad Supported=True)
1   0.818238                          (Content Rating=Everyone)
2   0.998413                              (Editor Choice=False)
3   0.992681                                        (Free=True)
4   0.755447                           (In app purchases=False)
5   0.818238       (Content Rating=Everyone, Ad Supported=True)
6   0.998413           (Editor Choice=False, Ad Supported=True)
7   0.992681                     (Free=True, Ad Supported=True)
8   0.755447        (In app purchases=False, Ad Supported=True)
9   0.817273     (Editor Choice=False, Content Rating=Everyone)
10  0.812683               (Free=True, Content Rating=Everyone)
11  0.991098                   (Free=True, Editor Choice=False)
12  0.755344      (In app purchases=False, Editor Choice=False)
13  0.749911                (Free=True, In app purchases=False)
14  0.817273  (Editor

# Extract association rules from the frequent itemsets


In [7]:
# Derive rules from the frequent itemsets, keeping those with lift >= 1.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

# Print only the columns needed to read each rule
report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print("Association Rules:")
print(rules[report_columns])

Association Rules:
                                    antecedents  \
0                     (Content Rating=Everyone)   
1                           (Ad Supported=True)   
2                         (Editor Choice=False)   
3                           (Ad Supported=True)   
4                                   (Free=True)   
..                                          ...   
65     (Editor Choice=False, Ad Supported=True)   
66  (In app purchases=False, Ad Supported=True)   
67                        (Editor Choice=False)   
68                          (Ad Supported=True)   
69                     (In app purchases=False)   

                                          consequents   support  confidence  \
0                                 (Ad Supported=True)  0.818238    1.000000   
1                           (Content Rating=Everyone)  0.818238    0.818238   
2                                 (Ad Supported=True)  0.998413    1.000000   
3                               (Editor Choice=False

In [8]:
# # Disabled: export the frequent itemsets and the association rules to CSV files
# frequent_itemsets_csv = frequent_itemsets.to_csv(index=False)
# with open('frequent_itemsets.csv', 'w', encoding='utf-8') as f:
#     f.write(frequent_itemsets_csv)

# rules_csv = rules.to_csv(index=False)
# with open('association_rules.csv', 'w', encoding='utf-8') as f:
#     f.write(rules_csv)

In [None]:
# Fit K-Means first: `kmeans` and `y_kmeans` must exist before anything is plotted.
# NOTE(review): `scaled_features` is not defined in any cell visible in this
# notebook; it has to be created (presumably a scaled 2-D feature array —
# confirm) before this cell can run on a fresh kernel.
optimal_k = 3  # Replace this with the K value found from the elbow method
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(scaled_features)

# 5. Plot the clusters using the first two feature columns, one colour per label.
# Colours repeat if optimal_k exceeds the list length; extend the list if needed.
cluster_colors = ['red', 'blue', 'green']
for cluster_id in range(optimal_k):
    in_cluster = y_kmeans == cluster_id
    plt.scatter(
        scaled_features[in_cluster, 0],
        scaled_features[in_cluster, 1],
        s=100,
        c=cluster_colors[cluster_id % len(cluster_colors)],
        label=f'Cluster {cluster_id + 1}',
    )

# Plotting the centroids of the clusters on the same axes, then render once
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of apps')
plt.xlabel('Feature 1 (scaled)')
plt.ylabel('Feature 2 (scaled)')
plt.legend()
plt.show()

In [None]:
# 5. Plot the clusters: one scatter layer per K-Means label (0, 1, 2), using
# the first two columns of the scaled features.
first, second, third = (y_kmeans == label for label in range(3))
plt.scatter(scaled_features[first, 0], scaled_features[first, 1], s=100, c='red', label='Cluster 1')
plt.scatter(scaled_features[second, 0], scaled_features[second, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(scaled_features[third, 0], scaled_features[third, 1], s=100, c='green', label='Cluster 3')

In [None]:
# Overlay the fitted cluster centres (first two coordinates) and finish the figure
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroids')
plt.xlabel('Feature 1 (scaled)')
plt.ylabel('Feature 2 (scaled)')
plt.title('Clusters of apps')
plt.legend()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

# Load the Play Store data used for clustering
df = pd.read_csv(filepath_or_buffer="Playstore_final.csv")
# Summary statistics across all columns, numeric and non-numeric
df.describe(include="all")
def show_types(df):
    """Print each column's name, its first value, and that value's Python type.

    The first value is taken by position (``.iloc[0]``), so the function works
    whatever the index labels are (for example after filtering rows or setting
    a custom index). For a frame with no rows there is no first value; the
    column's dtype is printed instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    """
    for col in df.columns:
        column = df[col]
        if column.empty:
            # No first row to sample; fall back to the declared dtype.
            print(col, "=", "<no rows>", "=", column.dtype)
            continue
        first_value = column.iloc[0]
        print(col, "=", first_value, "=", type(first_value))


show_types(df)
# Keep only the float64 columns. select_dtypes builds the filtered frame in one
# step instead of dropping columns in place while looping over df.columns;
# .copy() gives an independent frame that later cells can add columns to.
df = df.select_dtypes(include="float64").copy()
df.columns

# Replace missing values with each column's mean
imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(df)

# Standardise the imputed features before clustering
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_imputed)
# Within-cluster sum of squares (inertia) for each candidate cluster count 1..10
K = range(1, 11)
wcss = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(df_scaled).inertia_
    for n_clusters in K
]

# Plot the elbow method result
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(K, wcss, "bx-")
ax.set_xlabel("Number of clusters")
ax.set_ylabel("WCSS")
ax.set_title("Elbow Method For Optimal k")
plt.show()
# Fit the final model with the chosen number of clusters and label every row
k = 3
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(df_scaled)
df["Cluster"] = kmeans.labels_

# NOTE(review): silhouette_score compares every pair of samples; if the dataset
# is large, consider its sample_size / random_state parameters — confirm the
# row count before relying on this finishing in reasonable time.
silhouette_avg = silhouette_score(df_scaled, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

# Per-cluster summary statistics: filter to the rows of cluster i and call
# describe() (the method must be invoked, and on that cluster's rows only).
for i in range(k):
    print(f"Cluster{i}")
    print(df[df["Cluster"] == i].describe())
from sklearn.decomposition import PCA

# Project the scaled features onto two principal components and attach the
# cluster label of each row for colouring.
pca = PCA(n_components=2)
components = pca.fit_transform(df_scaled)
df_pca = pd.DataFrame(components, columns=["PCA1", "PCA2"]).assign(Cluster=df["Cluster"])

# Scatter of the two components, coloured by cluster
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(data=df_pca, x="PCA1", y="PCA2", hue="Cluster", palette="viridis", ax=ax)
ax.set_title("Clusters Visualization with PCA")
plt.show()
# Split the frame by cluster label so each cluster can be inspected on its own
clusters = df.groupby('Cluster')

# Map each cluster label to the describe() table of its rows
cluster_summaries = {
    label: members.describe(include='all')
    for label, members in clusters
}

# Print one labelled summary table per cluster
for label, stats in cluster_summaries.items():
    print(f"Cluster {label} Summary:")
    print(stats)
    print("\n")