In [67]:
from pyspark.sql import SparkSession

In [68]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('Desafio Modulo 2 - ML')
    .getOrCreate()
)

# Last expression of the cell: displays the Spark version string.
spark.version

'3.5.1'

In [69]:
# Load the stroke dataset from CSV, treating the first line as a header and
# letting Spark infer the column types.
# NOTE(review): this is an absolute path on one machine; consider moving it to
# a configurable location so the notebook can run elsewhere.
clidata_df = (
    spark.read
    .option('header', 'True')
    .option('inferSchema', 'True')
    .csv('/home/pericles/Spark-Projects/IGTI-DESM2/stroke_data.csv')
)

# Print the inferred schema to check column names and types.
clidata_df.printSchema()

root
 |-- 0: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [70]:
# Column means used by the next cell to fill missing values.
# All three averages are computed in a single aggregation (one pass over the
# data) instead of three independent agg(...).collect() jobs.
# `first()` returns the single result Row, which unpacks positionally in the
# order the expressions are listed.
# (The name `mean_avg_glucose_leve` is kept as-is because the following cell
# reads it.)
mean_age, mean_avg_glucose_leve, mean_bmi = clidata_df.selectExpr(
    'avg(age)',
    'avg(avg_glucose_level)',
    'avg(bmi)',
).first()

In [71]:
# Replace nulls in each numeric column with that column's mean.
# A dict passed to fillna maps column name -> replacement value, so the three
# columns are handled in one call.
clidata_df = clidata_df.fillna({
    'age': mean_age,
    'avg_glucose_level': mean_avg_glucose_leve,
    'bmi': mean_bmi,
})

In [72]:
# Show summary statistics (count, mean, ...) for every column of clidata_df.
(
    clidata_df
    .describe()
    .show()
)

[Stage 152:>                                                        (0 + 1) / 1]

+-------+------------------+------+------------------+-------------------+-------------------+------------+---------+--------------+------------------+------------------+---------------+-------------------+
|summary|                 0|gender|               age|       hypertension|      heart_disease|ever_married|work_type|Residence_type| avg_glucose_level|               bmi| smoking_status|             stroke|
+-------+------------------+------+------------------+-------------------+-------------------+------------+---------+--------------+------------------+------------------+---------------+-------------------+
|  count|             67135| 67135|             67135|              67135|              67135|       67135|    67135|         67135|             67135|             67135|          67135|              67135|
|   mean|           33568.0|  NULL| 51.95950845311693|0.16410218217025396|0.10142250688910405|        NULL|     NULL|          NULL|113.41439606762462| 29.16154047813857|  

                                                                                

In [73]:
# List the distinct value combinations of these four columns to see which
# categories each one contains.
(
    clidata_df
    .select('gender', 'hypertension', 'heart_disease', 'ever_married')
    .distinct()
    .show()
)

+------+------------+-------------+------------+
|gender|hypertension|heart_disease|ever_married|
+------+------------+-------------+------------+
|  Male|           0|            0|          No|
|  Male|           0|            1|         Yes|
| Other|           0|            0|         Yes|
|  Male|           1|            0|         Yes|
|Female|           0|            1|         Yes|
|  Male|           1|            1|          No|
|Female|           0|            0|         Yes|
|Female|           0|            1|          No|
|  Male|           1|            1|         Yes|
|Female|           0|            0|          No|
|Female|           1|            1|         Yes|
|  Male|           0|            0|         Yes|
|Female|           1|            0|         Yes|
|  Male|           0|            1|          No|
|Female|           1|            0|          No|
|Female|           1|            1|          No|
|  Male|           1|            0|          No|
| Other|           0

In [74]:
# List the distinct (work_type, Residence_type) combinations present in the data.
(
    clidata_df
    .select('work_type', 'Residence_type')
    .distinct()
    .show()
)

+-------------+--------------+
|    work_type|Residence_type|
+-------------+--------------+
|      Private|         Urban|
|     children|         Urban|
|     Govt_job|         Rural|
|Self-employed|         Rural|
|     Govt_job|         Urban|
|      Private|         Rural|
| Never_worked|         Rural|
|     children|         Rural|
| Never_worked|         Urban|
|Self-employed|         Urban|
+-------------+--------------+



In [75]:
# List the distinct (smoking_status, stroke) combinations present in the data.
(
    clidata_df
    .select('smoking_status', 'stroke')
    .distinct()
    .show()
)

+---------------+------+
| smoking_status|stroke|
+---------------+------+
|   never_smoked|     0|
|formerly smoked|     1|
|         smokes|     1|
|formerly smoked|     0|
|   never_smoked|     1|
|         smokes|     0|
+---------------+------+



In [4]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [76]:
# Print the schema again: clidata_df was reassigned by the fillna cell, so this
# shows the columns and nullability of the current DataFrame.
clidata_df.printSchema()

root
 |-- 0: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = false)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = false)
 |-- bmi: double (nullable = false)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [80]:
# Encode `gender`: map each string to a numeric index, then one-hot encode
# that index into a vector column.
gender_indexer = StringIndexer(inputCol='gender', outputCol='gender_indexer')
gender_encoder = OneHotEncoder(inputCol='gender_indexer', outputCol='gender_vector')

# The indexer is fitted exactly once; an earlier duplicate fit/transform whose
# result was thrown away has been removed.
# NOTE: despite its name, `gender_indexer_model` holds the transformed
# DataFrame (clidata_df plus the `gender_indexer` column), not a fitted model.
gender_indexer_model = gender_indexer.fit(clidata_df).transform(clidata_df)

# Last expression of the cell: the encoded DataFrame is displayed only; it is
# not assigned to a variable and clidata_df itself is left unchanged.
gender_encoder.fit(gender_indexer_model).transform(gender_indexer_model)

DataFrame[0: int, gender: string, age: double, hypertension: int, heart_disease: int, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: double, bmi: double, smoking_status: string, stroke: int, gender_indexer: double, gender_vector: vector]

In [85]:
# Encode `ever_married`: map each string to a numeric index, then one-hot
# encode that index into a vector column.
emarried_indexer = StringIndexer(inputCol='ever_married', outputCol='emarried_indexer')
emarried_encoder = OneHotEncoder(inputCol='emarried_indexer', outputCol='emarried_vector')

# The indexer is fitted exactly once; an earlier duplicate fit/transform whose
# result was thrown away has been removed.
# NOTE: despite its name, `emarried_indexer_model` holds the transformed
# DataFrame (clidata_df plus the `emarried_indexer` column), not a fitted model.
emarried_indexer_model = emarried_indexer.fit(clidata_df).transform(clidata_df)

# Last expression of the cell: the encoded DataFrame is displayed only; it is
# not assigned to a variable and clidata_df itself is left unchanged.
emarried_encoder.fit(emarried_indexer_model).transform(emarried_indexer_model)

DataFrame[0: int, gender: string, age: double, hypertension: int, heart_disease: int, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: double, bmi: double, smoking_status: string, stroke: int, emarried_indexer: double, emarried_vector: vector]

In [86]:
# Encode `work_type`: map each string to a numeric index, then one-hot encode
# that index into a vector column.
wtype_indexer = StringIndexer(inputCol='work_type', outputCol='wtype_indexer')
wtype_encoder = OneHotEncoder(inputCol='wtype_indexer', outputCol='wtype_vector')

# The indexer is fitted exactly once; an earlier duplicate fit/transform whose
# result was thrown away has been removed.
# NOTE: despite its name, `wtype_indexer_model` holds the transformed
# DataFrame (clidata_df plus the `wtype_indexer` column), not a fitted model.
wtype_indexer_model = wtype_indexer.fit(clidata_df).transform(clidata_df)

# Last expression of the cell: the encoded DataFrame is displayed only; it is
# not assigned to a variable and clidata_df itself is left unchanged.
wtype_encoder.fit(wtype_indexer_model).transform(wtype_indexer_model)

DataFrame[0: int, gender: string, age: double, hypertension: int, heart_disease: int, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: double, bmi: double, smoking_status: string, stroke: int, wtype_indexer: double, wtype_vector: vector]

In [87]:
# Encode `Residence_type`: map each string to a numeric index, then one-hot
# encode that index into a vector column.
rtype_indexer = StringIndexer(inputCol='Residence_type', outputCol='rtype_indexer')
rtype_encoder = OneHotEncoder(inputCol='rtype_indexer', outputCol='rtype_vector')

# The indexer is fitted exactly once; an earlier duplicate fit/transform whose
# result was thrown away has been removed.
# NOTE: despite its name, `rtype_indexer_model` holds the transformed
# DataFrame (clidata_df plus the `rtype_indexer` column), not a fitted model.
rtype_indexer_model = rtype_indexer.fit(clidata_df).transform(clidata_df)

# Last expression of the cell: the encoded DataFrame is displayed only; it is
# not assigned to a variable and clidata_df itself is left unchanged.
rtype_encoder.fit(rtype_indexer_model).transform(rtype_indexer_model)

DataFrame[0: int, gender: string, age: double, hypertension: int, heart_disease: int, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: double, bmi: double, smoking_status: string, stroke: int, rtype_indexer: double, rtype_vector: vector]

In [88]:
# Encode `smoking_status`: map each string to a numeric index, then one-hot
# encode that index into a vector column.
sstatus_indexer = StringIndexer(inputCol='smoking_status', outputCol='sstatus_indexer')
sstatus_encoder = OneHotEncoder(inputCol='sstatus_indexer', outputCol='sstatus_vector')

# The indexer is fitted exactly once; an earlier duplicate fit/transform whose
# result was thrown away has been removed.
# NOTE: despite its name, `sstatus_indexer_model` holds the transformed
# DataFrame (clidata_df plus the `sstatus_indexer` column), not a fitted model.
sstatus_indexer_model = sstatus_indexer.fit(clidata_df).transform(clidata_df)

# Last expression of the cell: the encoded DataFrame is displayed only; it is
# not assigned to a variable and clidata_df itself is left unchanged.
sstatus_encoder.fit(sstatus_indexer_model).transform(sstatus_indexer_model)

DataFrame[0: int, gender: string, age: double, hypertension: int, heart_disease: int, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: double, bmi: double, smoking_status: string, stroke: int, sstatus_indexer: double, sstatus_vector: vector]