In [None]:
!pip install pyspark -q

In [None]:
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, desc
from pyspark.sql.functions import col, approxCountDistinct
from pyspark.sql.window import Window

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("AppStoreAnalysis")
    .getOrCreate()
)

In [None]:
# Read the App Store CSV (first row is the header, column types are inferred)
# and preview the first five rows.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("appleAppData.csv")
)
data.show(5)

# Data Cleaning

In [None]:
# Data Exploration
# Print each column's name and the data type Spark assigned to it.
data.printSchema()

In [None]:
# Report the dataset's shape: the row count (runs a Spark job) and the column count.
n_rows = data.count()
n_cols = len(data.columns)
print(f"dataframe dimensions: {n_rows} rows x {n_cols} columns.")

In [None]:
# Descriptive Statistics
# Summary statistics for the columns of `data`, printed as a table (up to 20 rows).
data.describe().show(20)


In [None]:
# Approximate number of distinct values in every column, computed in one
# aggregation over `data`.
#
# The session created at the top of the notebook is reused as-is: building a
# second one with getOrCreate() just returns the session that is already active,
# so the different appName would not rename the running application.
# approx_count_distinct replaces approxCountDistinct, which has been deprecated
# since Spark 2.1.
unique_values_count = data.agg(
    *[F.approx_count_distinct(F.col(c)).alias(c) for c in data.columns]
)

# Show the number of unique values in each column
unique_values_count.show()

In [None]:
# Count the nulls in every column with a single aggregation, then list the
# columns from most to fewest missing values.
null_count_exprs = [F.count(when(col(c).isNull(), c)).alias(c) for c in data.columns]
missing_counts = data.select(null_count_exprs).collect()[0]
sorted_missing_counts = sorted(
    missing_counts.asDict().items(),
    key=lambda item: item[1],
    reverse=True,  # stable, so ties keep their original column order
)

for colName, count in sorted_missing_counts:
    print(f"{colName}: {count}")

In [None]:
# Express each column's null count as a percentage of all rows, highest first.
total_rows = data.count()
missing_percentages = [
    (name, n_missing / total_rows * 100)
    for name, n_missing in missing_counts.asDict().items()
]
sorted_missing_percent = sorted(
    missing_percentages,
    key=lambda pair: pair[1],
    reverse=True,
)

# Print one "<column>: <percentage>%" line per column.
for colName, percentage in sorted_missing_percent:
    print(f"{colName}: {percentage:.3f}%")

In [None]:
# Tabulate the (column, percentage) pairs so they can be plotted.
missing_percent_df = pd.DataFrame(sorted_missing_percent, columns=["Column", "Missing Percentage"])

# Horizontal bar chart of the missing-value percentage per column.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(missing_percent_df["Column"], missing_percent_df["Missing Percentage"], color='blue')
ax.set_title('Missing Percentage of Columns')
ax.set_xlabel('Percentage')
ax.set_ylabel('Column')
ax.invert_yaxis()  # first (highest-percentage) row at the top
plt.show()

In [None]:
# Drop 'Developer_Website' column
# `data` is reassigned, so every later cell works on the frame without it.
data = data.drop("Developer_Website")

# Data Processing

In [None]:
# Keep only the rows whose 'Content_Rating' is one of the recognised values;
# rows with any other value are discarded.
valid_content_ratings = ['4+', '17+', '9+', '12+', 'Not yet rated']
has_valid_rating = col("Content_Rating").isin(valid_content_ratings)
data = data.filter(has_valid_rating)

In [None]:
# Count apps per content rating in Spark, bring the small result to pandas and
# order it from the most to the least common rating.
content_rating_counts = (
    data.groupBy("Content_Rating")
    .count()
    .toPandas()
    .sort_values(by="count", ascending=False)
)

# Bar chart of the number of apps in each content rating.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=list(content_rating_counts.Content_Rating.values),
    y=list(content_rating_counts['count'].values),
    palette='mako',
    ax=ax,
)
ax.set_title('Distribution of Apps by Content Rating')
ax.set_ylabel('Number of Apps')
ax.set_xlabel('Content Rating')
plt.show()

In [None]:
# Display the pandas table of app counts per content rating.
content_rating_counts

In [None]:
# Number of apps in each content (age) rating, computed and printed by Spark.
age_ratings = (
    data
    .groupBy("Content_Rating")
    .count()
)
age_ratings.show()

In [None]:
# Cast 'Average_User_Rating' to double so it can be averaged numerically below.
data = data.withColumn("Average_User_Rating", data["Average_User_Rating"].cast("double"))

In [None]:
# Mean user rating within each content rating.
avg_user_ratings_age = (
    data
    .groupBy("Content_Rating")
    .avg("Average_User_Rating")
)
avg_user_ratings_age.show()

# Data Analysis

In [None]:
# The five genres with the largest number of apps.
top_genres = (
    data.groupBy("Primary_Genre")
    .count()
    .orderBy(F.desc("count"))
    .limit(5)
)
top_genres.show()

# Horizontal bar chart of those five genres.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="count", y="Primary_Genre", data=top_genres.toPandas(), ax=ax)
ax.set_title("Top 5 Most Popular Genres")
ax.set_xlabel("Number of Apps")
ax.set_ylabel("Genre")
plt.show()

In [None]:
# Cast 'Size_Bytes' and 'Price' to double so they can be treated numerically.
data = (
    data
    .withColumn("Size_Bytes", col("Size_Bytes").cast("double"))
    .withColumn("Price", col("Price").cast("double"))
)

# Approximate medians (5% relative error) used as fill values.
median_size_bytes = data.approxQuantile("Size_Bytes", [0.5], 0.05)[0]
median_price = data.approxQuantile("Price", [0.5], 0.05)[0]

# Replace nulls with the column median; non-null values are kept unchanged.
data = (
    data
    .withColumn("Size_Bytes", F.coalesce(col("Size_Bytes"), F.lit(median_size_bytes)))
    .withColumn("Price", F.coalesce(col("Price"), F.lit(median_price)))
)

In [None]:
# Derive two columns: the size in mebibytes and a Free/Paid label that is
# "Free" exactly when the price equals 0.
data = (
    data
    .withColumn("Size_MB", col("Size_Bytes") / (1024 * 1024))
    .withColumn("Type", when(col("Price") == 0, "Free").otherwise("Paid"))
)

In [None]:
# Mean app size in bytes for every primary genre.
avg_size_by_genre = (
    data
    .groupBy("Primary_Genre")
    .avg("Size_Bytes")
)
avg_size_by_genre.show()


In [None]:
# Mean app size in bytes for each value of the 'Free' column.
size_comparison = (
    data
    .groupBy("Free")
    .avg("Size_Bytes")
)
size_comparison.show()

# Bar chart of the two group means.
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(x="Free", y="avg(Size_Bytes)", data=size_comparison.toPandas(), ax=ax)
ax.set_title("App Size Comparison: Free vs. Paid")
ax.set_xlabel("Free (0: No, 1: Yes)")
ax.set_ylabel("Average Size (Bytes)")
plt.show()

In [None]:
# Number of apps for each value of 'Free', plus the mean price over apps whose
# price is greater than zero.
free_vs_paid = data.groupBy("Free").count()
average_price = (
    data.filter(col("Price") > 0)
    .agg(F.avg("Price"))
    .first()[0]
)
free_vs_paid.show()
print("Average Price for Paid Apps:", average_price)


In [None]:
# Mean user rating per genre, ordered from the best-rated genre downwards.
# Without the ordering the table and chart list genres in arbitrary order,
# which does not answer "which genres have the highest ratings".
avg_user_ratings = (
    data.groupBy("Primary_Genre")
    .avg("Average_User_Rating")
    .orderBy(F.desc("avg(Average_User_Rating)"))
)
avg_user_ratings.show()

# Horizontal bar chart; bars follow the row order, so the best-rated genre is on top.
plt.figure(figsize=(7, 6))
sns.barplot(x="avg(Average_User_Rating)", y="Primary_Genre", data=avg_user_ratings.toPandas())
plt.title("Genres with Highest Average User Ratings")
plt.xlabel("Average User Rating")
plt.ylabel("Genre")
plt.show()

In [None]:
# The five developers with the most rows (apps) in the dataset.
top_developers = (
    data.groupBy("Developer")
    .count()
    .orderBy(F.desc("count"))
    .limit(5)
)
top_developers.show()

In [None]:
# Five genres with the largest number of distinct App_Id values.
# NOTE(review): despite the variable name, this counts apps per genre; it does
# not look at update dates or update frequency. Confirm the intended metric.
distinct_apps = F.countDistinct("App_Id").alias("AppsCount")
most_updated_genres = (
    data.groupBy("Primary_Genre")
    .agg(distinct_apps)
    .orderBy(F.desc("AppsCount"))
    .limit(5)
)
most_updated_genres.show()

In [None]:
# Preview the first five rows of `data` in its current state.
data.show(5)

In [None]:
# The ten primary genres with the most apps, collected to pandas for plotting.
top_cat = (
    data.groupBy('Primary_Genre')
    .count()
    .orderBy(F.desc('count'))
    .limit(10)
    .toPandas()
)

# Bar chart of the app count for each of those genres.
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(data=top_cat, x='Primary_Genre', y='count', ax=ax)
ax.set_xlabel('App Categories')
ax.set_ylabel('Number of Apps')
ax.set_title('Top 10 Categories Installed from the Apple Store')
plt.show()

In [None]:
# Rank apps by price inside each content rating, most expensive first.
window_spec = Window.partitionBy("Content_Rating").orderBy(desc("Price"))

# Highest priced app(s) in each content rating; rank() keeps ties, so several
# apps can share rank 1.
highest_paid_apps = data.withColumn("rank", F.rank().over(window_spec)).filter(F.col("rank") == 1)

# Lowest priced *paid* app(s) in each content rating.
# The previous filter compared the rank with the number of distinct app names in
# the whole dataset, which practically never matches a rank inside a partition.
# Ranking ascending and keeping rank 1 selects the cheapest apps directly; free
# apps (Price == 0) are excluded first, otherwise every free app would tie.
lowest_window_spec = Window.partitionBy("Content_Rating").orderBy(F.asc("Price"))
lowest_paid_apps = (
    data.filter(F.col("Price") > 0)
    .withColumn("rank", F.rank().over(lowest_window_spec))
    .filter(F.col("rank") == 1)
)

# Show the highest and lowest paid apps in each content rating
# (show() prints at most 20 rows, so long lists of ties are cut off).
print("Highest Paid Apps:")
highest_paid_apps.select("Content_Rating", "App_Name", "Price").show(truncate=False)

print("Lowest Paid Apps:")
lowest_paid_apps.select("Content_Rating", "App_Name", "Price").show(truncate=False)

# Fitting the data to the models

In [None]:
# Take a 30% random sample of the rows; the fixed seed keeps the sample repeatable.
sampled_data = data.sample(fraction=0.3, seed=123)  # Adjust the fraction as needed

# Convert the sampled DataFrame to pandas
# (this collects the sampled rows into the driver's memory)
sampled_df_pd = sampled_data.toPandas()

In [None]:
# Print column names
print(sampled_df_pd.columns)

# Check DataFrame structure by printing the first five rows
print(sampled_df_pd.head())

Index(['App_Id', 'App_Name', 'AppStore_Url', 'Primary_Genre', 'Content_Rating',
       'Size_Bytes', 'Required_IOS_Version', 'Released', 'Updated', 'Version',
       'Price', 'Currency', 'Free', 'DeveloperId', 'Developer',
       'Developer_Url', 'Average_User_Rating', 'Reviews',
       'Current_Version_Score', 'Current_Version_Reviews', 'Size_MB', 'Type'],
      dtype='object')
                                   App_Id                     App_Name  \
0                     com.hkbu.arc.apaper               A+ Paper Guide   
1        com.imonstersoft.azdictionaryios      A-Z Synonyms Dictionary   
2               com.kazo0.dailyreflection          AA Daily Reflection   
3        com.pitashi.readradio.aaspeakers  AA Speaker Tapes & 12 Steps   
4  com.partnergomobilephoneapps.bandbcafe                     B&B Cafe   

                                        AppStore_Url     Primary_Genre  \
0  https://apps.apple.com/us/app/a-paper-guide/id...         Education   
1  https://apps.apple.com

In [None]:
# Names of the columns pandas stores as float64 or int64. Columns with any other
# numeric width (e.g. int32) are not matched by this filter.
numeric_columns = sampled_df_pd.select_dtypes(include=['float64', 'int64']).columns


In [None]:
# Show which columns were selected as numeric features.
print(numeric_columns)

Index(['Size_Bytes', 'Price', 'Average_User_Rating', 'Current_Version_Score',
       'Current_Version_Reviews', 'Size_MB'],
      dtype='object')


In [None]:
# Target: the content rating label of each sampled app.
y = sampled_df_pd['Content_Rating']

# Features: the selected numeric columns, taken from the frame without the target.
X = sampled_df_pd.drop(columns=['Content_Rating'])[numeric_columns]

In [None]:
# Impute missing values with the mean (each column's mean fills that column's NaNs)
# NOTE(review): X_imputed is not referenced by any later cell visible in this
# notebook; the models below are fitted on the NaN-dropped rows instead.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

In [None]:
# Keep only the rows where every feature is present, and the matching labels.
complete_rows = X.notna().all(axis=1)
X_dropna = X[complete_rows]
y_dropna = y[complete_rows]

## KNN

In [None]:
# Final KNN model, fitted on all complete rows.
knn_model = KNN(n_neighbors=5)
knn_model.fit(X_dropna, y_dropna)

# Out-of-fold predictions for evaluation: with 5-fold cross-validation every row
# is predicted by a model that did not see that row during fitting. Predicting
# the training rows with the fitted model itself gives an optimistic result.
# knn_pred has one entry per row of X_dropna, in the same order as y_dropna.
knn_pred = cross_val_predict(KNN(n_neighbors=5), X_dropna, y_dropna, cv=5)


In [None]:
# Confusion matrix for KNN: rows are the true ratings, columns the predicted ones.
predicted_labels = knn_pred.astype(str)
cm = confusion_matrix(y_true=y_dropna, y_pred=predicted_labels)
print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[  4046   1669  30089     45      0]
 [  1187   7952  49024     63      0]
 [  3105   6561 477518    179      0]
 [   449    833  13816    358      0]
 [     0      0      6      0      0]]


## Random Forest

In [None]:
# Final random forest (100 trees, fixed seed), fitted on all complete rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=123)  # You can adjust hyperparameters as needed
rf_model.fit(X_dropna, y_dropna)

# Out-of-fold predictions for evaluation: with 5-fold cross-validation every row
# is predicted by a forest that did not see that row during fitting. A forest
# asked to predict its own training rows largely reproduces their labels, which
# gives an optimistic confusion matrix.
# rf_pred has one entry per row of X_dropna, in the same order as y_dropna.
rf_pred = cross_val_predict(
    RandomForestClassifier(n_estimators=100, random_state=123),
    X_dropna,
    y_dropna,
    cv=5,
)

In [None]:
# Confusion matrix for the random forest: rows are the true ratings, columns the
# predicted ones.
predicted_labels_rf = rf_pred.astype(str)
cm_rf = confusion_matrix(y_true=y_dropna, y_pred=predicted_labels_rf)
print("Confusion Matrix with Random Forest Classifier:", cm_rf, sep="\n")

Confusion Matrix with Random Forest Classifier:
[[ 24511    517  10774     47      0]
 [   365  34993  22784     84      0]
 [   807   2899 483501    156      0]
 [    56    151   3086  12163      0]
 [     0      0      1      0      5]]


## Decision Tree

In [None]:
# Final decision tree (fixed seed), fitted on all complete rows. This fitted
# model is the one used by the prediction widget further down.
dt_model = DecisionTreeClassifier(random_state=123)  # You can adjust hyperparameters as needed
dt_model.fit(X_dropna, y_dropna)

# Out-of-fold predictions for evaluation: with 5-fold cross-validation every row
# is predicted by a tree that did not see that row during fitting. An unpruned
# tree asked to predict its own training rows largely reproduces their labels,
# which gives an optimistic confusion matrix.
# dt_pred has one entry per row of X_dropna, in the same order as y_dropna.
dt_pred = cross_val_predict(
    DecisionTreeClassifier(random_state=123),
    X_dropna,
    y_dropna,
    cv=5,
)

In [None]:
# Confusion matrix for the decision tree: rows are the true ratings, columns the
# predicted ones.
predicted_labels_dt = dt_pred.astype(str)
cm_dt = confusion_matrix(y_true=y_dropna, y_pred=predicted_labels_dt)
print("Confusion Matrix with Decision Tree Classifier:", cm_dt, sep="\n")

Confusion Matrix with Decision Tree Classifier:
[[ 22802    128   3841      4      0]
 [   851  29162   7025      4      0]
 [  3441   6386 284151     12      0]
 [   147    224   1686   8632      0]
 [     0      0      0      0      1]]


## User Interface

In [None]:
def predict_content_rating(size_bytes, price, average_user_rating, current_version_score, current_version_reviews, size_mb):
    """Predict and print the content rating of one app with the fitted decision tree.

    Parameters
    ----------
    size_bytes, price, average_user_rating, current_version_score,
    current_version_reviews, size_mb : float
        Feature values for a single app, in the order the model's feature
        columns are expected.

    Returns
    -------
    The predicted content rating label (the same value that is printed).

    Notes
    -----
    Reads the notebook-level ``dt_model``. The values are wrapped in a one-row
    DataFrame labelled with the feature names the model recorded when it was
    fitted (``feature_names_in_``), so scikit-learn can match columns by name
    instead of warning about missing feature names. If the model recorded no
    names, the row is passed with default positional column labels.
    """
    values = [size_bytes, price, average_user_rating, current_version_score, current_version_reviews, size_mb]
    feature_names = getattr(dt_model, "feature_names_in_", None)
    features = pd.DataFrame([values], columns=feature_names)

    # Predict content rating
    predicted_content_rating = dt_model.predict(features)
    print("Predicted Content Rating:", predicted_content_rating[0])
    return predicted_content_rating[0]

# Create input widgets for user input.
# The default label width truncates long descriptions such as
# "Current Version Reviews:"; 'initial' lets each label take the width it needs.
input_style = {'description_width': 'initial'}

size_bytes_input = widgets.FloatText(description="Size Bytes:", style=input_style)
price_input = widgets.FloatText(description="Price:", style=input_style)
average_user_rating_input = widgets.FloatText(description="Average User Rating:", style=input_style)
current_version_score_input = widgets.FloatText(description="Current Version Score:", style=input_style)
current_version_reviews_input = widgets.FloatText(description="Current Version Reviews:", style=input_style)
size_mb_input = widgets.FloatText(description="Size MB:", style=input_style)

# Create button to trigger prediction
predict_button = widgets.Button(description="Predict")

# Define function to handle button click event
def on_predict_button_clicked(b):
    """Button callback: read the six input widgets and run a prediction.

    ``b`` is the argument the button passes to its click handler; it is not
    used. The current widget values are forwarded positionally to
    ``predict_content_rating`` in the order size bytes, price, average user
    rating, current version score, current version reviews, size MB.
    """
    input_widgets = (
        size_bytes_input,
        price_input,
        average_user_rating_input,
        current_version_score_input,
        current_version_reviews_input,
        size_mb_input,
    )
    predict_content_rating(*[w.value for w in input_widgets])

# Run on_predict_button_clicked whenever the button is pressed.
predict_button.on_click(on_predict_button_clicked)

# Render the six inputs followed by the button, one widget per display call.
for ui_element in (
    size_bytes_input,
    price_input,
    average_user_rating_input,
    current_version_score_input,
    current_version_reviews_input,
    size_mb_input,
    predict_button,
):
    display(ui_element)

FloatText(value=0.0, description='Size Bytes:')

FloatText(value=0.0, description='Price:')

FloatText(value=0.0, description='Average User Rating:')

FloatText(value=0.0, description='Current Version Score:')

FloatText(value=0.0, description='Current Version Reviews:')

FloatText(value=0.0, description='Size MB:')

Button(description='Predict', style=ButtonStyle())

Predicted Content Rating: 17+




Predicted Content Rating: 17+




In [None]:
# Stop the Spark session
# (Spark DataFrames such as `data` cannot be used after this point)
spark.stop()