In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **SMART DEVICE ANALYSIS** 

In [None]:
# Load the semicolon-delimited export, skipping the first physical line of the file.
# NOTE(review): later cells address columns by labels such as '2025' and 'Google',
# which look like data values -- confirm which line of the file is the real header.
df = pd.read_csv(
    "/kaggle/input/smart-devices-from-99s-to-today/2025-06-21-SmartDevices.csv",
    sep=';',
    skiprows=1,
    engine='python',
)

In [None]:
# Print the column list with non-null counts and dtypes for the freshly loaded frame
df.info()

In [None]:
# Preview the first rows of the raw frame
df.head()

# DATA CLEANING

In [None]:
# Remove columns that carry no information:
#   1. columns in which every value is missing
#   2. columns whose label starts with 'Unnamed'
non_empty = df.dropna(axis=1, how='all')
is_unnamed = non_empty.columns.str.contains('^Unnamed')
df_cleaned = non_empty.loc[:, ~is_unnamed]

# Number of columns that survived the cleanup
df_cleaned.shape[1]

In [None]:
# Preview the first rows of the cleaned frame
df_cleaned.head()

In [None]:
# Bucket every column label into a coarse specification category by
# case-insensitive substring matching against a keyword list.

columns = list(df_cleaned.columns)

# Keywords to guide categorization (checked in this order; first hit wins)
keywords = {
    "Brand & Model Info": ['brand', 'model', 'name'],
    "Release & Manufacturer": ['release', 'year', 'manufacturer', 'maker', 'vendor'],
    "Display": ['display', 'screen', 'inch', 'ppi', 'resolution'],
    "Camera": ['camera', 'photo', 'video', 'pixel', 'zoom', 'autofocus', 'lens'],
    "Battery": ['battery', 'charging', 'mah', 'wireless', 'cell'],
    "Memory & Storage": ['ram', 'rom', 'storage', 'memory'],
    "Connectivity": ['wifi', 'bluetooth', 'usb', 'sim', 'lte', '5g', 'band', 'network', 'nfc', 'gsm'],
    "Operating System & Features": ['android', 'os', 'software', 'voice', 'recognition', 'assistant'],
    "Protection & Sensors": ['sensor', 'ip', 'dust', 'water', 'protection', 'compass', 'accelerometer'],
}

# One empty bucket per keyword group, plus a catch-all for unmatched labels
spec_groups = {group: [] for group in keywords}
spec_groups["Other"] = []

for col in columns:
    label = col.lower()
    # First group with any keyword contained in the label; "Other" when none match
    bucket = next(
        (
            group
            for group, keys in keywords.items()
            if any(key in label for key in keys)
        ),
        "Other",
    )
    spec_groups[bucket].append(col)

spec_groups

# Key Feature Categories for Analysis

**Camera**
* Analyze primary and secondary camera megapixels, resolutions, zoom features, and stabilization.

**Battery**
* Assess battery capacity (mAh), charging type (USB, wireless), and charging power (e.g., 23.0 W).

**Memory & Storage**
* Breakdown of RAM sizes, storage types (UFS), and storage capacity.

**Connectivity**
* Support for 5G, Wi-Fi standards, Bluetooth versions, SIM types, and USB versions.

**Operating System & Features**
* Android versions, voice assistants, sensors.

**Protection & Sensors**
* IP rating (dust/water), biometric and physical sensors.

In [None]:
# Convert numeric-looking columns to proper numeric types where feasible.
# Every column is coerced; values that do not parse become NaN, so a
# text-only column ends up as an all-NaN float column.
df_numeric = df_cleaned.apply(pd.to_numeric, errors='coerce')

# After coercion every column has a numeric dtype, so a dtype filter alone
# cannot separate real numeric fields from text fields. Keep only the columns
# in which at least one value actually parsed as a number.
numeric_df = df_numeric.select_dtypes(include='number').dropna(axis=1, how='all')

# Summary statistics, one row per retained column.
# describe() raises on a frame with no columns, so fall back to an empty table.
numeric_summary = (
    numeric_df.describe().T if numeric_df.shape[1] else pd.DataFrame()
)

In [None]:
# Display the transposed describe() table (one row per column)
numeric_summary

# 1. '2025' Column:
* Likely represents the release year; the mean is ~2016, min is 1989, max is 2025.
* Insight: The dataset covers a wide range of years, with a concentration in the 2010s and early 2020s.

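# 2. 'Pixel 9a 5G TD-LTE US 256GB GXQ96':
* Despite its name, this column has numeric values (range: 310–9600).
* Insight: This might be internal storage in MiB or something mislabeled. Note that the label itself looks like a device model name, so the first data record may have been read as the header row — worth verifying. Needs renaming for clarity.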

# 3. Most other fields are non-numeric or have insufficient numeric conversion rates.
* Action: Focus next on parsing structured string fields like 'RAM', 'battery', 'PPI', and 'camera MP'.


In [None]:
# Helper function to get most frequent values in key columns
def top_frequent_values(column, top_n=10, data=None):
    """Return the most frequent values of one column together with their counts.

    Parameters
    ----------
    column : hashable
        Label of the column to tally.
    top_n : int, default 10
        Maximum number of distinct values to return.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df_cleaned`` frame is used, which is the original behaviour.

    Returns
    -------
    pandas.Series
        Counts indexed by value, most frequent first, truncated to ``top_n``.

    Raises
    ------
    KeyError
        If ``column`` is not present in the frame.
    """
    frame = df_cleaned if data is None else data
    return frame[column].value_counts().head(top_n)

# Tally the most frequent entries for four spec columns in one pass:
# RAM, battery, operating system and primary camera.
top_ram, top_battery, top_os, top_camera = (
    top_frequent_values(label)
    for label in (
        '8192 MiB RAM',
        '5100 mAh battery',
        'Google Android 15 (Vanilla Ice Cream)',
        '48.0 MP camera',
    )
)

top_ram, top_battery, top_os, top_camera

# Key Insights from Categorical Specifications

# 📦 RAM Trends
* The most common sizes are 1024 MiB (1 GB), 2048 MiB (2 GB), 4096 MiB (4 GB), and 8192 MiB (8 GB).
* Insight: Entry-level to mid-tier smartphones dominate, but 8GB and higher options are increasingly present.

# 🔋 Battery Capacity
* 5000 mAh is the most frequent, followed by 3000 mAh and 4000 mAh.
* Insight: 5000 mAh is the current standard for modern smartphones, supporting longer screen-on times.

# 📱 Operating Systems
* Android 10, 9.0, 11, and 12 are the most deployed.
* Newest version (Android 15 - Vanilla Ice Cream) is not yet dominant.
* Insight: Market has significant device longevity and delayed OS adoption cycles.

# 📷 Primary Camera Resolution
* Most common: 8 MP, 13 MP, 5 MP
* Recent high-end models: 48 MP, 50 MP
* Insight: A clear evolution toward high-megapixel cameras, but budget models still use lower resolutions.


In [None]:
# One bar chart per frequency table: RAM, battery capacity, Android version, camera MP.
# Each entry is (counts, chart title, x-axis label); the y-axis label is shared.
for counts, chart_title, x_label in (
    (top_ram, "Top 10 Most Common RAM Sizes", "RAM Size"),
    (top_battery, "Top 10 Most Common Battery Capacities", "Battery Size (mAh)"),
    (top_os, "Top 10 Most Common Android Versions", "Android Version"),
    (top_camera, "Top 10 Most Common Primary Camera Resolutions", "Camera MP"),
):
    counts.plot(kind='bar')
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel("Number of Devices")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
def _top10(label):
    """Ten most frequent values of the `label` column of df_cleaned."""
    return df_cleaned[label].value_counts().head(10)


# Column labels used for steps 2-5
region_column = 'North America (NA)'
display_type_col = 'Color AM-OLED display'
chipset_col = 'Samsung Google Tensor G4 GS401 S5P9875 (Zuma Pro), 2024, 64 bit, octa-core, 4 nm, ARM Mali-G715 GPU'
sensor_col = 'Barometer , In-screen fingerprint sensor , Light intensity sensor , Proximity sensor , Step counter'

# Step 1: Brand/Manufacturer Analysis
brand_counts = _top10('Google')
manufacturer_counts = _top10('Foxconn')

# Step 2: Region/Market Analysis
region_counts = _top10(region_column)

# Step 3: Display Technology
display_type_counts = _top10(display_type_col)

# Step 4: Processor (Chipset)
chipset_counts = _top10(chipset_col)

# Step 5: Sensors
sensor_counts = _top10(sensor_col)

# Plot each category sequentially
def plot_bar(data, title, xlabel, ylabel):
    """Draw one labelled bar chart on its own 20x12 inch figure and show it.

    Parameters
    ----------
    data : pandas.Series
        Values to plot; the index supplies the bar labels.
    title, xlabel, ylabel : str
        Chart title and axis labels.

    The figure is created before plotting so that the requested size applies
    to the chart itself. Calling ``plt.figure`` after drawing would open a
    second, empty figure and leave the chart at the default size.
    """
    fig, ax = plt.subplots(figsize=(20, 12))
    data.plot(kind='bar', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', labelrotation=45)
    # Keep long, rotated tick labels inside the figure area
    fig.tight_layout()
    plt.show()

# Render every category chart in turn: (counts, title, x-axis label)
for counts, heading, axis_label in (
    (brand_counts, "Top 10 Device Brands", "Brand"),
    (manufacturer_counts, "Top 10 Manufacturers", "Manufacturer"),
    (region_counts, "Top 10 Target Regions", "Region"),
    (display_type_counts, "Top Display Types", "Display Type"),
    (chipset_counts, "Top 10 Chipsets", "Chipset"),
    (sensor_counts, "Top Sensor Combinations", "Sensor Setup"),
):
    plot_bar(counts, heading, axis_label, "Count")


# 🏷️ 1. Device Brands
* Google was the brand in focus (as expected, given file content).

* You may be working with a filtered subset of Google Pixel devices only.

# 🏭 2. Manufacturers
* Foxconn is listed as the primary manufacturer across most records.

* Suggests all devices may be assembled by Foxconn, a common OEM partner for many brands.

# 🌍 3. Target Regions
* North America dominates, with device targeting strongly skewed toward US-based networks (e.g., Verizon, T-Mobile).

* Other regions are underrepresented, indicating this dataset is US-focused.

# 📺 4. Display Technologies
* AMOLED displays are prevalent, confirming widespread use in modern smartphones.

* Self-illuminating and high-bit color depth (10-bit, 24-bit) also appear frequently.

# 🧠 5. Processors (Chipsets)
* Most records reference a specific Google Tensor G4 SoC, again pointing to a single-family (Pixel 9a) dataset.

* This limits diversity in chipset analysis — good for focused studies, not for comparative market analysis.

# 📡 6. Sensors
* A typical sensor setup includes:

    * Barometer

    * In-screen fingerprint sensor

    * Proximity sensor

    * Light sensor

    * Step counter

* This reflects flagship device features, including fitness and biometric security.

In [None]:
# Derive numeric battery-capacity and weight columns from the raw spec text

df_analysis = df_cleaned.copy()

# Battery: first run of digits in the text, stored as float
battery_text = df_analysis['5100 mAh battery']
df_analysis['battery_mAh'] = battery_text.str.extract(r'(\d+)', expand=False).astype(float)

# Weight: first run of digits/dots in the text, stored as float
weight_text = df_analysis['185.9 g']
df_analysis['weight_g'] = weight_text.str.extract(r'([\d.]+)', expand=False).astype(float)

# Keep only the rows where both values are present
battery_vs_weight = df_analysis.loc[:, ['battery_mAh', 'weight_g']].dropna(how='any')

# Scatter: battery capacity (x) against device weight (y)
plt.scatter(
    x=battery_vs_weight['battery_mAh'],
    y=battery_vs_weight['weight_g'],
    alpha=0.5,
)
plt.title("Battery Capacity vs Device Weight")
plt.xlabel("Battery Capacity (mAh)")
plt.ylabel("Device Weight (g)")
plt.grid(True)
plt.tight_layout()
plt.show()

# 🔋 Battery vs Weight Insights
* 📈 Positive Correlation Observed
    * As expected, heavier phones tend to include larger batteries.
    * Most phones with 5000 mAh batteries weigh between 180–210g.

# 🔍 Outliers
  * A few phones offer high capacity (~5000 mAh) at weights <170g.
  * These may use polymer batteries or lighter chassis materials (e.g., plastic instead of glass/metal).


In [None]:
# RAM: first run of digits in the text, stored as float
ram_text = df_analysis['8192 MiB RAM']
df_analysis['ram_mib'] = ram_text.str.extract(r'(\d+)', expand=False).astype(float)

# Android version: the number following the word "Android" in the OS string;
# anything that does not parse as a number becomes NaN
os_text = df_analysis['Google Android 15 (Vanilla Ice Cream)']
df_analysis['android_version'] = pd.to_numeric(
    os_text.str.extract(r'Android (\d+\.?\d*)', expand=False),
    errors='coerce',
)

# Keep only the rows where both values are present
ram_vs_os = df_analysis.loc[:, ['ram_mib', 'android_version']].dropna(how='any')

# Scatter: Android version (x) against RAM (y)
plt.scatter(
    x=ram_vs_os['android_version'],
    y=ram_vs_os['ram_mib'],
    alpha=0.5,
)
plt.title("RAM vs Android Version")
plt.xlabel("Android Version")
plt.ylabel("RAM (MiB)")
plt.grid(True)
plt.tight_layout()
plt.show()


# 🧠 RAM vs Android Version Insights
* 🔼 Trend: Devices running newer Android versions are paired with more RAM
  * Devices running Android 12+ tend to have at least 4–6 GB of RAM (4096–6144 MiB).
  * Android 13–15 often appear with 8 GB+ RAM, especially in newer models.

# 🔍 Outliers:
* A few devices run modern Android versions with only 2GB RAM — possibly Android Go editions or optimized builds.


In [None]:
# Primary camera resolution: first run of digits/dots in the text, as float
camera_text = df_analysis['48.0 MP camera']
df_analysis['camera_mp'] = camera_text.str.extract(r'([\d.]+)', expand=False).astype(float)

# Device thickness: first run of digits/dots in the text, as float
thickness_text = df_analysis['8.9 mm']
df_analysis['thickness_mm'] = thickness_text.str.extract(r'([\d.]+)', expand=False).astype(float)

# Keep only the rows where both values are present
camera_vs_thickness = df_analysis.loc[:, ['camera_mp', 'thickness_mm']].dropna(how='any')

# Scatter: camera megapixels (x) against thickness (y)
plt.scatter(
    x=camera_vs_thickness['camera_mp'],
    y=camera_vs_thickness['thickness_mm'],
    alpha=0.5,
)
plt.title("Camera Resolution vs Device Thickness")
plt.xlabel("Camera Megapixels (MP)")
plt.ylabel("Device Thickness (mm)")
plt.grid(True)
plt.tight_layout()
plt.show()


# 📉 No strong linear correlation found
 * Higher megapixel counts do not consistently require thicker devices.
 * Many 48–50 MP phones maintain a thickness under 9 mm.

# 🤔 Interpretation:
  * Modern engineering enables high-res sensors in slim chassis using stacked or periscope lenses.
  * Thickness may depend more on battery or cooling than just camera specs.

In [None]:
# Release year: numeric values of the '2025' column; unparseable entries become NaN
year_text = df_analysis['2025']
df_analysis['release_year'] = pd.to_numeric(year_text, errors='coerce')

# Pair the battery capacity column (added to df_analysis in an earlier cell)
# with the year, keeping only the rows where both values are present
battery_vs_year = df_analysis.loc[:, ['battery_mAh', 'release_year']].dropna(how='any')

# Scatter: release year (x) against battery capacity (y)
plt.scatter(
    x=battery_vs_year['release_year'],
    y=battery_vs_year['battery_mAh'],
    alpha=0.4,
)
plt.title("Battery Capacity Over Time")
plt.xlabel("Release Year")
plt.ylabel("Battery Capacity (mAh)")
plt.grid(True)
plt.tight_layout()
plt.show()

# 📅 Battery Capacity Over Time
* 📈 Trend: Clear upward movement over time
  * Early 2010s: Phones mostly under 3000 mAh.
  * 2017–2020: Rise in 4000+ mAh batteries.
  * 2021+: 5000 mAh has become standard, especially for mid and high-end devices.

# 📍 Insight:
* Growing screen sizes, 5G connectivity, and always-on features drove the push for higher capacity batteries.
