# Univariate Analysis

## 1. Numerical Variables

In [None]:
# Same as pandas, PySpark also uses .describe() to show summary information
num_cols = ['Account Balance', 'No of dependents']
numeric_summary = df.select(*num_cols).describe()
numeric_summary.show()

In [1]:
# PySpark's describe() does not include the quartiles, so create a function that returns the same results as pandas
def describe_pd(df_in, columns, deciles=False):
    """
    Build a pandas-style ``describe`` table for numerical Spark columns.

    Combines the basic statistics from Spark's ``describe()`` with
    percentiles computed by NumPy on the collected column values.

    :param df_in: the input dataframe (must support ``select``, ``collect``,
                  ``describe`` and ``toPandas`` like a PySpark DataFrame)
    :param columns: the column name list of the numerical variables
    :param deciles: if True output the deciles (0%, 10%, ..., 100%),
                    otherwise the quartiles (25%, 50%, 75%)
    :return: pandas DataFrame with a ``summary`` column followed by one
             column per requested variable, all values rounded to 2 decimals
    """
    import numpy as np
    import pandas as pd

    columns = list(columns)
    if deciles:
        percentiles = list(range(0, 110, 10))
    else:
        percentiles = [25, 50, 75]

    # Restrict to the requested columns once, so Spark neither scans nor
    # summarises columns that are thrown away at the end.
    numeric = df_in.select(columns)

    # A single collect for all columns instead of one Spark job per column.
    rows = numeric.collect()
    # dtype=float turns SQL nulls (None) into NaN; the explicit reshape keeps
    # the array 2-D even when there are no rows.
    values = np.array(rows, dtype=float).reshape(len(rows), len(columns))
    # nanpercentile skips missing values instead of failing on them.
    percs = pd.DataFrame(
        np.nanpercentile(values, percentiles, axis=0), columns=columns
    )
    percs['summary'] = [str(p) + '%' for p in percentiles]

    spark_describe = numeric.describe().toPandas()
    # Spark returns every statistic as a string; without this cast the
    # concatenated columns are object dtype and round() silently does nothing.
    spark_describe[columns] = spark_describe[columns].apply(pd.to_numeric)

    new_df = pd.concat([spark_describe, percs], ignore_index=True)
    new_df = new_df.round(2)
    return new_df[['summary'] + columns]

## 2. Categorical Variables

### 2.1 Frequency Table

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.functions import rank,sum,col
from pyspark.sql import Window
# A single frame spanning every row of the aggregated result, used to attach
# the grand total of Credit_num to each age_class row.
window = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
# Alternative text-formatted percentage column:
# withColumn('Percent %',F.format_string("%5.0f%%\n",col('Credit_num')*100/col('total'))).
tab = (
    df.select('age_class', 'Credit Amount')
    .groupBy('age_class')
    .agg(
        F.count('Credit Amount').alias('Credit_num'),
        F.mean('Credit Amount').alias('Credit_avg'),
        F.min('Credit Amount').alias('Credit_min'),
        F.max('Credit Amount').alias('Credit_max'),
    )
    # 'total' is only a helper for the percentage and is dropped afterwards.
    .withColumn('total', F.sum(F.col('Credit_num')).over(window))
    .withColumn('Percent', F.col('Credit_num') * 100 / F.col('total'))
    .drop('total')
)

---

# Multivariate Analysis

## 1. Numerical vs. Numerical

### 1.1 Correlation matrix

In [None]:
from pyspark.mllib.stat import Statistics
import pandas as pd

# Pearson correlation between the numerical columns, shown as a labelled table.
corr_data = df.select(num_cols)
col_names = corr_data.columns

# Statistics.corr works on an RDD of plain value sequences, one per row.
features = corr_data.rdd.map(lambda row: row[:])
corr_mat = Statistics.corr(features, method="pearson")

# Label both axes with the column names directly in the constructor.
corr_df = pd.DataFrame(corr_mat, index=col_names, columns=col_names)

print(corr_df.to_string())

## 2. Categorical vs. Categorical

### 2.1 Pearson's Chi-squared test

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest
# Toy (label, features) rows used to demonstrate Pearson's chi-squared test.
data = [
    (0.0, Vectors.dense(0.5, 10.0)),
    (0.0, Vectors.dense(1.5, 20.0)),
    (1.0, Vectors.dense(1.5, 30.0)),
    (0.0, Vectors.dense(3.5, 30.0)),
    (0.0, Vectors.dense(3.5, 40.0)),
    (1.0, Vectors.dense(3.5, 40.0)),
]
# Use a dedicated name: rebinding `df` here would replace the dataset that the
# other cells (e.g. the cross table on age_class / Occupation) still rely on.
chi_df = spark.createDataFrame(data, ["label", "features"])
# The test yields a single result row; head() fetches it.
r = ChiSquareTest.test(chi_df, "features", "label").head()
print("pValues: " + str(r.pValues))
print("degreesOfFreedom: " + str(r.degreesOfFreedom))
print("statistics: " + str(r.statistics))

### 2.2 Cross table

In [None]:
# Contingency table of age_class (rows) against Occupation (columns).
age_by_occupation = df.crosstab("age_class", "Occupation")
age_by_occupation.show()

## 3. Numerical vs. Categorical