In [1]:
## usar python 3.8.8
import findspark
findspark.init()

In [2]:
# Importando SparkSession para criar uma sessão do Spark
from pyspark.sql import SparkSession

# Importando funções e tipos de dados SparkSQL
from pyspark.sql import functions as f
from pyspark.sql.types import *

# Importando módulos Spark MLlib
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Importando SparkContext e SparkConf
from pyspark import SparkContext, SparkConf


In [3]:

# Criando uma nova sessão do Spark

# Spark entry point
spark = SparkSession \
    .builder \
    .appName("JOINS-pkdd99-xpeMBA") \
    .getOrCreate()

spark.version


'3.0.0'

Ver o diagrama entidade relacional em: ''<''link do git''>''

In [11]:

def read_df_csv(tabela=str):
    """
    Função para as bases de dados onde retorna no print o 'shape', um breve 'show' e o Scheema das variáveis.
    :param entidade_name: string que referencie o nome da tabela que complete o caminho './dados_originais/{tabela}.csv'. 
    tabela pode ser => account, card, client, disp, district, loan, order 
    :return: DataFrame em pyspark
    """
    path ="C:\\Users\\renat\\Documents\\00_MBA\\PROJETO_APLICADO\\ML-predict-loan-MBA-applied-project\\dados_tratados"
    df = spark.read.csv(path = f'{path}/{tabela}.csv', header='True',inferSchema='False', sep=';')
    print('\n','A base de dados possui:',df.count(), 'linhas', 'e', len(df.columns), 'colunas', '\n')
    print(df.show(5))
    print(df.printSchema())
    return(df)

In [12]:
account_df = read_df_csv('account')


 A base de dados possui: 4500 linhas e 4 colunas 

+----------+-----------+--------+----------+
|account_id|district_id|stmt_frq|      date|
+----------+-----------+--------+----------+
|       576|         55| monthly|1993-01-01|
|      3818|         74| monthly|1993-01-01|
|       704|         55| monthly|1993-01-01|
|      2378|         16| monthly|1993-01-01|
|      2632|         24| monthly|1993-01-02|
+----------+-----------+--------+----------+
only showing top 5 rows

None
root
 |-- account_id: string (nullable = true)
 |-- district_id: string (nullable = true)
 |-- stmt_frq: string (nullable = true)
 |-- date: string (nullable = true)

None


In [13]:
card_df = read_df_csv('card')


 A base de dados possui: 892 linhas e 4 colunas 

+-------+-------+-------+----------+
|card_id|disp_id|   type|      date|
+-------+-------+-------+----------+
|   1005|   9285|classic|1993-11-07|
|    104|    588|classic|1994-01-19|
|    747|   4915|classic|1994-02-05|
|     70|    439|classic|1994-02-08|
|    577|   3687|classic|1994-02-15|
+-------+-------+-------+----------+
only showing top 5 rows

None
root
 |-- card_id: string (nullable = true)
 |-- disp_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- date: string (nullable = true)

None


In [14]:
client_df = read_df_csv('client')


 A base de dados possui: 5369 linhas e 4 colunas 

+---------+----------+-----------+------+
|client_id|date_birth|district_id|gender|
+---------+----------+-----------+------+
|        1|1970-12-13|         18|     F|
|        2|1945-02-04|          1|     M|
|        3|1940-10-09|          1|     F|
|        4|1956-12-01|          5|     M|
|        5|1960-07-03|          5|     F|
+---------+----------+-----------+------+
only showing top 5 rows

None
root
 |-- client_id: string (nullable = true)
 |-- date_birth: string (nullable = true)
 |-- district_id: string (nullable = true)
 |-- gender: string (nullable = true)

None


In [18]:
order_df = read_df_csv('order') 


 A base de dados possui: 6471 linhas e 6 colunas 

+--------+----------+-------+----------+------+---------+
|order_id|account_id|bank_to|account_to|amount| category|
+--------+----------+-------+----------+------+---------+
|   29401|         1|     YZ|  87144583|2452.0|household|
|   29402|         2|     ST|  89597016|3372.7|loan_payt|
|   29403|         2|     QR|  13943797|7266.0|household|
|   29404|         3|     WX|  83084338|1135.0|household|
|   29405|         3|     CD|  24485939| 327.0|         |
+--------+----------+-------+----------+------+---------+
only showing top 5 rows

None
root
 |-- order_id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- bank_to: string (nullable = true)
 |-- account_to: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- category: string (nullable = true)

None


In [17]:
loan_df = read_df_csv('loan')


 A base de dados possui: 682 linhas e 7 colunas 

+-------+----------+----------+------+--------+--------+------+
|loan_id|account_id|      date|amount|duration|payments|status|
+-------+----------+----------+------+--------+--------+------+
|   5314|      1787|1993-07-05| 96396|      12|  8033.0|     B|
|   5316|      1801|1993-07-11|165960|      36|  4610.0|     A|
|   6863|      9188|1993-07-28|127080|      60|  2118.0|     A|
|   5325|      1843|1993-08-03|105804|      36|  2939.0|     A|
|   7240|     11013|1993-09-06|274740|      60|  4579.0|     A|
+-------+----------+----------+------+--------+--------+------+
only showing top 5 rows

None
root
 |-- loan_id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- payments: string (nullable = true)
 |-- status: string (nullable = true)

None


In [16]:
district_df = read_df_csv('district')


 A base de dados possui: 77 linhas e 16 colunas 

+-----------+-----------+---------------+-------+------+-----+------+------+----+-----+------+------+------+-------+------+------+
|district_id|      dname|         region|    pop|nmu500|nmu2k|nmu10k|nmuinf|ncit|rurba|avgsal|urat95|urat96|ent_ppt|ncri95|ncri96|
+-----------+-----------+---------------+-------+------+-----+------+------+----+-----+------+------+------+-------+------+------+
|          1|Hl.m. Praha|         Prague|1204953|     0|    0|     0|     1|   1|100.0| 12541|  0.29|  0.43|    167| 85677| 99107|
|          2|    Benesov|central Bohemia|  88884|    80|   26|     6|     2|   5| 46.7|  8507|  1.67|  1.85|    132|  2159|  2674|
|          3|     Beroun|central Bohemia|  75232|    55|   26|     4|     1|   5| 41.7|  8980|  1.95|  2.21|    111|  2824|  2813|
|          4|     Kladno|central Bohemia| 149893|    63|   29|     6|     2|   6| 67.4|  9753|  4.64|  5.05|    109|  5244|  5892|
|          5|      Kolin|central

In [15]:
disp_df = read_df_csv('disp')


 A base de dados possui: 5369 linhas e 4 colunas 

+-------+---------+----------+---------+
|disp_id|client_id|account_id|     type|
+-------+---------+----------+---------+
|      1|        1|         1|    owner|
|      2|        2|         2|    owner|
|      3|        3|         2|disponent|
|      4|        4|         3|    owner|
|      5|        5|         3|disponent|
+-------+---------+----------+---------+
only showing top 5 rows

None
root
 |-- disp_id: string (nullable = true)
 |-- client_id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- type: string (nullable = true)

None


## Join das tabelas 

In [35]:
features = account_df.join(disp_df, on='account_id', how='outer').withColumnRenamed('account_id', 'account_id_acct')
features.count()

5369

In [36]:
features.columns

['account_id_acct',
 'district_id',
 'stmt_frq',
 'date',
 'disp_id',
 'client_id',
 'type']

In [37]:
cond = (loan_df.account_id == features.account_id_acct)
features = (
    features
    .join(loan_df, on = cond, how='left')
)

features.count()

5369

In [38]:
features.columns

['account_id_acct',
 'district_id',
 'stmt_frq',
 'date',
 'disp_id',
 'client_id',
 'type',
 'loan_id',
 'account_id',
 'date',
 'amount',
 'duration',
 'payments',
 'status']

In [None]:
# Do SQL-style merge of data from different dataframes to create features dataframe for classification
features = pd.merge(account,disp,on='account_id',how='outer')
features = pd.merge(features,loan,on='account_id',how='left',suffixes=('_acct','_loan')) # sufficies for date
features = pd.merge(features,client,on='client_id',how='outer',suffixes=('_bank','_client')) # suffices for district_id
features = pd.merge(features,card,on='disp_id',how='outer',suffixes=('_disp','_card')) # suffices for type
features.rename(columns = {'date':'date_card'}, inplace=True) # for clarity
print(len(features),'total feature records, ie one for each client')  # should be 5369
features = features[pd.notnull(features['loan_id'])]
print(len(features),'feature records with a loan; some accts repeated due to multiple clients on same acct')  # should be 827
print(len(features['account_id'].unique()),'feature records with a loan and unique account_id')  # should be 682