The Chi-Square test is a statistical procedure for determining whether observed data differ significantly from the data that would be expected. It can also be used to decide whether two categorical variables in our data are associated with each other. In other words, it helps to determine whether an observed difference is due to chance or to a genuine relationship between the variables.

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import ChiSquareTest

In [2]:
# Obtain a SparkSession: reuse the active one if it exists, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Read the CSV file, taking column names from its header row and letting
# Spark infer each column's type from the data.
df = spark.read.csv(
    "sample_data.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the loaded data (show() prints the first 20 rows by default).
df.show()

+---+---------+------------+-----+
|age|city_code|country_code|label|
+---+---------+------------+-----+
| 51|       35|          14|    0|
| 49|        3|          14|    0|
| 47|       32|          13|    0|
| 46|       31|          15|    0|
|  5|       36|          14|    0|
| 54|       39|          17|    0|
| 46|       34|          14|    0|
|  5|       34|          15|    0|
| 44|       29|          14|    0|
| 49|       31|          15|    0|
| 54|       37|          15|    0|
| 48|       34|          16|    0|
| 48|        3|          14|    0|
| 43|        3|          11|    0|
| 58|        4|          12|    0|
| 57|       44|          15|    0|
| 54|       39|          13|    0|
| 51|       35|          14|    0|
| 57|       38|          17|    0|
| 51|       38|          15|    0|
+---+---------+------------+-----+
only showing top 20 rows



In [5]:
# Total number of rows in the dataset.
df.count()

150

In [6]:
# Column names: three candidate feature columns plus the target column 'label'.
df.columns

['age', 'city_code', 'country_code', 'label']

In [7]:
# Check the column types that inferSchema assigned.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- city_code: integer (nullable = true)
 |-- country_code: integer (nullable = true)
 |-- label: integer (nullable = true)



In [8]:
# ChiSquareTest takes its features as a single vector column, so configure a
# VectorAssembler that packs the feature columns into "vector_features".
feature_columns = ['age', 'city_code', 'country_code']
assembler = VectorAssembler(inputCols=feature_columns,
                            outputCol="vector_features")

In [9]:
# Add the assembled vector column, then keep only the two columns the
# chi-square test needs: the label and the feature vector.
assembled_df = assembler.transform(df)
vectorized_df = assembled_df.select('label', 'vector_features')

In [10]:
# Inspect the assembled data: each row now holds the label and one feature vector.
vectorized_df.show()

+-----+----------------+
|label| vector_features|
+-----+----------------+
|    0|[51.0,35.0,14.0]|
|    0| [49.0,3.0,14.0]|
|    0|[47.0,32.0,13.0]|
|    0|[46.0,31.0,15.0]|
|    0| [5.0,36.0,14.0]|
|    0|[54.0,39.0,17.0]|
|    0|[46.0,34.0,14.0]|
|    0| [5.0,34.0,15.0]|
|    0|[44.0,29.0,14.0]|
|    0|[49.0,31.0,15.0]|
|    0|[54.0,37.0,15.0]|
|    0|[48.0,34.0,16.0]|
|    0| [48.0,3.0,14.0]|
|    0| [43.0,3.0,11.0]|
|    0| [58.0,4.0,12.0]|
|    0|[57.0,44.0,15.0]|
|    0|[54.0,39.0,13.0]|
|    0|[51.0,35.0,14.0]|
|    0|[57.0,38.0,17.0]|
|    0|[51.0,38.0,15.0]|
+-----+----------------+
only showing top 20 rows



In [11]:
# Test every feature in "vector_features" for independence from "label".
# The test output is a DataFrame; head() pulls its first row, whose
# pValues, degreesOfFreedom and statistics fields are read below.
chi_square_df = ChiSquareTest.test(
    dataset=vectorized_df,
    featuresCol="vector_features",
    labelCol="label",
)
result = chi_square_df.head()

In [14]:
# One p-value per feature, in the order the features were assembled
# (age, city_code, country_code); a small p-value is evidence against
# independence between that feature and the label.
print(f"pvalues: {result.pValues}")

pvalues: [0.053548382417787366,0.007494454436545395,9.393896771570098e-11]


In [15]:
# Degrees of freedom of each feature's test, in the same feature order.
print(f"degreesOfFreedom: {result.degreesOfFreedom}")

degreesOfFreedom: [34, 22, 42]


In [16]:
# Chi-square test statistic for each feature, in the same feature order.
print(f"test statistics: {result.statistics}")

test statistics: [48.25357142857142,41.345554445554455,128.85000000000002]
