In [1]:
import pandas as pd
import numpy as np
import random

# Set a seed for reproducibility
np.random.seed(42)
random.seed(42)

# --- Feature Definitions and Probabilities ---
manufacturers = ['Samsung', 'Apple', 'Google', 'Xiaomi', 'OnePlus', 'Motorola', 'Oppo', 'Vivo']
os_types = ['Android', 'iOS']
network_types = ['4G LTE', '5G']
screen_sizes = np.arange(6.0, 7.0, 0.1) # 6.0 to 6.9 inches

# Define common configuration buckets to link price and specs
spec_tiers = {
    'low': {'storage': [64, 128], 'ram': [4, 6], 'price_range': (200, 400), 'score_range': (40, 60), 'network_prob': 0.8}, # 80% 4G
    'mid': {'storage': [128, 256], 'ram': [8, 12], 'price_range': (401, 750), 'score_range': (61, 80), 'network_prob': 0.5}, # 50% 5G
    'high': {'storage': [256, 512, 1024], 'ram': [12, 16], 'price_range': (751, 1500), 'score_range': (81, 95), 'network_prob': 0.2} # 80% 5G
}

# --- Data Generation ---
data = []
for i in range(200):
    # 1. Randomly assign a tier, weighted towards mid-range
    tier = np.random.choice(list(spec_tiers.keys()), p=[0.2, 0.6, 0.2])
    
    # 2. Get specs based on the chosen tier
    config = spec_tiers[tier]
    
    # Manufacturer and OS
    manufacturer = random.choice(manufacturers)
    # Ensure Apple only gets iOS
    if manufacturer == 'Apple':
        os = 'iOS'
        tier = 'high' # Force Apple to be high-tier for price realism
    else:
        os = 'Android'

    # Numerical Features (linked to tier)
    storage_gb = random.choice(config['storage'])
    ram_gb = random.choice(config['ram'])
    price = random.randint(config['price_range'][0], config['price_range'][1])
    
    # Performance score (your target/dependent variable)
    performance_score = random.randint(config['score_range'][0], config['score_range'][1])

    # Other features
    screen = round(random.choice(screen_sizes), 1)
    
    # Network (weighted based on tier, high tier favors 5G)
    if tier == 'high':
        network = '5G' if random.random() > config['network_prob'] else '4G LTE' # Low prob of 4G
    elif tier == 'low':
        network = '4G LTE' if random.random() < config['network_prob'] else '5G' # High prob of 4G
    else: # mid-range
        network = random.choice(network_types)

    # 3. Create a unique, descriptive phone name
    phone_name = f"{manufacturer} {os.replace('iOS', 'Pro Max').replace('Android', 'Galaxy')}_{i}"
    
    data.append([phone_name, manufacturer, os, storage_gb, ram_gb, screen, network, price, performance_score])

# --- Create DataFrame ---
columns = [
    'Phone Name', 'Manufacturer', 'OS', 'Storage (GB)', 'RAM (GB)', 
    'Screen Size (in)', 'Internet Network', 'Price (USD)', 'Performance Score'
]
df_phones = pd.DataFrame(data, columns=columns)

# --- Output Summary ---
print("✅ Phone Dataset Generated Successfully!")
print("-" * 50)
print(f"Total Rows: {len(df_phones)}")
print("\nFirst 5 Rows:")
print(df_phones.head())
print("-" * 50)
print("Data Types (Required for Sklearn):")
print(df_phones.dtypes)

# Save to a CSV file (optional, but good practice for training)
# df_phones.to_csv('synthetic_phones_data.csv', index=False)

✅ Phone Dataset Generated Successfully!
--------------------------------------------------
Total Rows: 200

First 5 Rows:
          Phone Name Manufacturer       OS  Storage (GB)  RAM (GB)  \
0    Apple Pro Max_0        Apple      iOS           128        12   
1    Apple Pro Max_1        Apple      iOS          1024        16   
2   Samsung Galaxy_2      Samsung  Android           128        12   
3   Samsung Galaxy_3      Samsung  Android           128        12   
4  Motorola Galaxy_4     Motorola  Android            64         4   

   Screen Size (in) Internet Network  Price (USD)  Performance Score  
0               6.2               5G          526                 68  
1               6.1               5G          783                 81  
2               6.9               5G          513                 75  
3               6.2           4G LTE          575                 69  
4               6.5               5G          297                 43  
-------------------------------

In [2]:
df_phones.to_csv("phones.csv",index=False)