In [125]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit, regexp_replace, regexp_extract

# Reuse the active Spark session if there is one, otherwise start a new one
# under this application name.
spark = (
    SparkSession.builder
    .appName("Predict Phone Cost")
    .getOrCreate()
)
# Eager evaluation lets a bare DataFrame expression at the end of a cell be
# rendered as a table without an explicit .show().
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [126]:
# All four source files are read the same way: comma-separated CSV whose first
# row holds the column names. No schema is supplied or inferred here, so every
# column is loaded as a string.
csv_reader = spark.read.format("csv").options(sep=",", header=True)

dataset_1 = csv_reader.load("datasets/Dataset_1.csv")
dataset_2 = csv_reader.load("datasets/Dataset_2.csv")
dataset_3 = csv_reader.load("datasets/Dataset_3.csv")
dataset_4 = csv_reader.load("datasets/Dataset_4.csv")

## Подготовка датасетов

In [127]:
# Merge brand and model into a single "Phone Name" column, then remove the
# columns that will not be used.
# NOTE(review): the schema printed further down still lists "Screen Size", so
# dropping "Screen Size (inches)" looks like a silent no-op (DataFrame.drop
# ignores names that do not exist) — confirm the real CSV header before relying
# on this column being gone.
dataset_1 = (
    dataset_1
    .withColumn("Phone Name", concat(dataset_1["Brand"], lit(" "), dataset_1["Model"]))
    .drop("Brand", "Model", "Screen Size (inches)")
)
# Put "Phone Name" first and keep the remaining columns in their current order.
columns = ["Phone Name"] + [name for name in dataset_1.columns if name != "Phone Name"]
dataset_1 = dataset_1.select(columns)

In [129]:
# Clean the text columns and convert them to integers in a single pass:
#   * Storage / RAM: remove the " GB" suffix before casting;
#   * Camera: keep only the first run of digits found in the value
#     (no match yields an empty string, which casts to null);
#   * Battery Capacity / Price: cast as they are.
# Values that cannot be parsed as integers become null after the cast.
dataset_1 = (
    dataset_1
    .withColumn("Storage", regexp_replace(dataset_1["Storage"], " GB", "").cast("integer"))
    .withColumn("RAM", regexp_replace(dataset_1["RAM"], " GB", "").cast("integer"))
    .withColumn("Camera", regexp_extract(dataset_1["Camera"], r"\d+", 0).cast("integer"))
    .withColumn("Battery Capacity", dataset_1["Battery Capacity"].cast("integer"))
    .withColumn("Price", dataset_1["Price"].cast("integer"))
)

# Show the resulting column types to verify the conversion.
dataset_1.printSchema()

root
 |-- Phone Name: string (nullable = true)
 |-- Storage: integer (nullable = true)
 |-- RAM: integer (nullable = true)
 |-- Screen Size: string (nullable = true)
 |-- Camera: integer (nullable = true)
 |-- Battery Capacity: integer (nullable = true)
 |-- Price: integer (nullable = true)



In [128]:
# Rename the phone-name column to "Phone Name", the same header dataset_1 uses,
# and remove the "Unnamed: 0" column (presumably a leftover row index from an
# earlier export — verify against the CSV).
dataset_2 = (
    dataset_2
    .withColumnRenamed("Brand me", "Phone Name")
    .drop("Unnamed: 0")
)
# Bare expression: rendered as a table because eager evaluation is enabled.
dataset_2

Phone Name,Ratings,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price
"LG V30+ (Black, 1...",4.3,4.0,128.0,6.0,48,13.0,4000,24999
I Kall K11,3.4,6.0,64.0,4.5,48,12.0,4000,15999
Nokia 105 ss,4.3,4.0,4.0,4.5,64,16.0,4000,15000
Samsung Galaxy A5...,4.4,6.0,64.0,6.4,48,15.0,3800,18999
POCO F1 (Steel Bl...,4.5,6.0,128.0,6.18,35,15.0,3800,18999
Apple iPhone 11 P...,4.7,8.0,128.0,5.8,35,12.0,5000,140300
Samsung Galaxy A7...,4.4,8.0,128.0,6.7,64,5.0,4700,29999
Samsung Galaxy S1...,4.5,8.0,128.0,6.7,48,12.0,4700,47999
OPPO A9 (Marble G...,4.4,4.0,128.0,6.53,48,2.0,4020,16490
POCO F1 (Graphite...,4.5,8.0,256.0,6.18,35,5.0,3800,22999
