In [122]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, IntegerType, DateType #tipos de datos para elaboracion del schema
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import lit
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [89]:
spark = SparkSession.builder.appName("session").getOrCreate() #creando la sesion de spark

In [90]:
#funcion que retorna PySpark Schema (lista de schemas de las columnas)
def schema(cols_schema: dict) -> StructType:
    """Define the schema of the dataframe

    Args:
        cols_schema (dict): Dictionary with the columns and its types

    Returns:
        StructType: Schema of the dataframe
    """
    return StructType([StructField(key,
                                cols_schema[key][0],
                                cols_schema[key][1]) for key in cols_schema.keys()])

In [91]:
#definiendo el esquema del set de datos
schema = StructType(
    [
        StructField("person_age",DoubleType(),True),
        StructField("person_income",DoubleType(),True),
        StructField("person_home_ownership",StringType(),True),
        StructField("person_emp_length",DoubleType(),True),
        StructField("loan_intent",StringType(),True),
        StructField("loan_grade",StringType(),True),
        StructField("loan_amnt",DoubleType(), True),
        StructField("loan_int_rate",DoubleType(), True),
        StructField("loan_status",DoubleType(), True),
        StructField("loan_percent_income",DoubleType(), True),
        StructField("cb_person_default_on_file",StringType(), True),
        StructField("cb_person_cred_hist_length",DoubleType(),True),
    ]
)

In [92]:
df = spark.read.csv("./creditInfo.csv", header=True, sep = ",", schema = schema)

In [93]:
cat_columns = [column for column, type in df.dtypes if type in ['string']]

In [94]:
num_data = df
for cat in cat_columns:
    encoder = StringIndexer(inputCol=cat, outputCol=f"num_{cat}")
    num_data = encoder.fit(num_data).transform(num_data)

In [95]:
num_data = num_data.dropna()

In [96]:
num_cols = [column for column, type in num_data.dtypes if type in ['double'] and column != "loan_grade"]
df_assembled = VectorAssembler(inputCols = num_cols, outputCol="features").transform(num_data)

In [97]:
scaler = StandardScaler(inputCol="features",
                        outputCol="features_scaled",
                        withMean=True,
                        withStd=True)

In [98]:
normalized = scaler.fit(df_assembled).transform(df_assembled)
normalized.show(5)

+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-----------+-------------------+-------------------------+--------------------------+-------------------------+---------------+--------------+-----------------------------+--------------------+--------------------+
|person_age|person_income|person_home_ownership|person_emp_length|loan_intent|loan_grade|loan_amnt|loan_int_rate|loan_status|loan_percent_income|cb_person_default_on_file|cb_person_cred_hist_length|num_person_home_ownership|num_loan_intent|num_loan_grade|num_cb_person_default_on_file|            features|     features_scaled|
+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-----------+-------------------+-------------------------+--------------------------+-------------------------+---------------+--------------+-----------------------------+--------------------+--------------------+
|      22.0|    

In [99]:
train, test = normalized.randomSplit([0.8, 0.2])

In [103]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, featuresCol = "features_scaled", labelCol = "num_loan_grade")
lr_model = lr.fit(train)

In [117]:
#predicciones del modelo
predict = lr_model.transform(test)

In [125]:
predict.select("prediction")

DataFrame[prediction: double]

In [127]:
#evaluacion del modelo
eval = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="num_loan_grade")  
predict.select("num_loan_grade", "prediction")  
AUC = eval.evaluate(predict)
print(f"AUC score is: {AUC}")

AUC score is: 0.8889600409836065
