In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyspark

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Create (or reuse) a Spark session with master "local[2]" for this notebook.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Bank subscription")
    .getOrCreate()
)

In [None]:
# Build a SQLContext on top of the session's SparkContext and display it.
# None of the cells visible in this notebook use `sqlContext` afterwards.
# NOTE(review): SQLContext appears to be deprecated in Spark 3.x in favour of
# SparkSession — confirm against the installed PySpark version.
sqlContext = SQLContext(spark.sparkContext)
sqlContext

In [None]:
# Both files are ';'-separated with a header row; let Spark infer column types.
csv_options = dict(sep=";", header=True, inferSchema=True)

train = spark.read.csv(
    '/kaggle/input/banking-dataset-marketing-targets/train.csv', **csv_options
)
test = spark.read.csv(
    '/kaggle/input/banking-dataset-marketing-targets/test.csv', **csv_options
)

In [None]:
# Print the first rows of the training DataFrame (show() with its default arguments).
train.show()

In [None]:
from pyspark.ml.feature import StringIndexer

# Categorical source columns and the names of their index-encoded counterparts.
# The target "y" is encoded too and becomes the label column "Y".
categorical_cols = ["job", "marital", "education", 'default', 'housing', 'loan', 'contact', 'poutcome', "y"]
encoded_cols = ["job_E", "marital_E", "education_E", 'default_E', 'housing_E', 'loan_E', 'contact_E', 'poutcome_E', "Y"]

# Fit the indexer on the training data only; `indexer` ends up holding the fitted model,
# which is then applied to both splits so they share the same category-to-index mapping.
indexer = StringIndexer(inputCols=categorical_cols, outputCols=encoded_cols).fit(train)
indexed = indexer.transform(train)
test = indexer.transform(test)
indexed.show()

In [None]:
# Raw string columns to discard once their encoded versions exist.
# Note that 'month' is discarded here although the StringIndexer cell never encoded it.
drop_list = ["job", "marital", "education", 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', "y"]

# The filtering is done on Python strings (case-sensitive) rather than with DataFrame.drop.
# NOTE(review): Spark may resolve column names case-insensitively, in which case
# drop("y") could also remove the label column "Y" — confirm before changing this.
kept_train_cols = [name for name in indexed.columns if name not in drop_list]
train = indexed.select(*kept_train_cols)

kept_test_cols = [name for name in test.columns if name not in drop_list]
test = test.select(*kept_test_cols)

In [None]:
# Summary statistics of the training columns, converted to pandas so the cell renders them as a table.
train.describe().toPandas()

In [None]:
# Summary statistics of the test columns, converted to pandas so the cell renders them as a table.
test.describe().toPandas()

In [None]:
# Print the column names and data types of the training DataFrame.
train.printSchema()

In [None]:
def count_missings(spark_df, sort=True):
    """
    Count the missing values in every column of a Spark DataFrame.

    A value counts as missing when it is null; for 'float' and 'double'
    columns NaN counts as missing as well. Columns of every type are
    inspected, so nulls in string, date and timestamp columns are reported
    too (the NaN test is simply not applied to them).

    Parameters
    ----------
    spark_df : Spark DataFrame
        Only its ``dtypes`` attribute and ``select`` method are used.
    sort : bool, default True
        If True, return one row per column with a single "count" column,
        sorted by count in descending order. If False, return the raw
        one-row frame with one column per input column.

    Returns
    -------
    pandas.DataFrame or None
        The counts, or None (after printing a message) when ``spark_df``
        has no columns to inspect.
    """
    nan_capable_types = ('float', 'double')

    count_exprs = []
    for name, dtype in spark_df.dtypes:
        column = F.col(name)
        is_missing = column.isNull()
        if dtype in nan_capable_types:
            # NaN only exists for floating-point data; skipping the test for
            # other types avoids applying isnan to dates/timestamps.
            is_missing = is_missing | F.isnan(column)
        # count() ignores nulls, so only rows where the condition holds are counted.
        count_exprs.append(F.count(F.when(is_missing, name)).alias(name))

    # An aggregate select always yields exactly one row, so checking the row
    # count can never detect "nothing to report"; guard on the column list instead.
    if not count_exprs:
        print("There are no any missing values!")
        return None

    df = spark_df.select(count_exprs).toPandas()

    if sort:
        return df.rename(index={0: 'count'}).T.sort_values("count", ascending=False)

    return df

In [None]:
# Missing-value counts per column for the training set.
count_missings(train)

In [None]:
# Missing-value counts per column for the test set.
count_missings(test)

We can clearly observe that neither the training set nor the test set has any missing values.

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, RandomForestClassifier, DecisionTreeClassifier,  LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [None]:
# Assemble the model inputs from every column EXCEPT the label "Y".
# Passing all of train.columns would put the target inside the feature vector,
# letting every downstream classifier read the answer directly (label leakage).
label_col = "Y"
feature_cols = [name for name in train.columns if name != label_col]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train = assembler.transform(train)
test = assembler.transform(test)

In [None]:
# Display the label column "Y" (the index-encoded version of the original "y" column).
train.select("Y").show()

In [None]:
from pyspark.ml.feature import PCA as PCAml

# Project the assembled "features" vector onto 4 principal components, stored in "pca".
# The projection is fitted on the training split and applied unchanged to both splits.
# The classifier cells visible in this notebook read "features_scaled", not "pca".
pca = PCAml(k=4, inputCol="features", outputCol="pca")
model = pca.fit(train)
train, test = model.transform(train), model.transform(test)

In [None]:
# Scaler for the assembled feature vector; the library defaults are spelled out:
# scale to unit standard deviation without centering on the mean.
standardScaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="features_scaled",
)

In [None]:
# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = standardScaler.fit(train)
train, test = scaler.transform(train), scaler.transform(test)

In [None]:
# Empty container for evaluation results; nothing is stored in it by this cell.
metrics = dict()

In [None]:
# Random forest on the scaled feature vector, predicting the indexed label "Y".
# NOTE(review): no seed is passed, so the fitted forest may differ between runs —
# consider setting one if reproducibility matters.
rain_forest = RandomForestClassifier(
    featuresCol="features_scaled",
    labelCol="Y",
)
rain_forest_model = rain_forest.fit(train)

In [None]:
# Linear support-vector classifier on the scaled feature vector, predicting "Y".
svc = LinearSVC(
    featuresCol="features_scaled",
    labelCol="Y",
)
svc_model = svc.fit(train)


In [None]:
# Single decision tree on the scaled feature vector, predicting "Y".
tree = DecisionTreeClassifier(
    featuresCol="features_scaled",
    labelCol="Y",
)
tree_model = tree.fit(train)


In [None]:
# Logistic regression on the scaled feature vector, predicting "Y".
lr = LogisticRegression(
    featuresCol="features_scaled",
    labelCol="Y",
)
lr_model = lr.fit(train)
