In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType, DoubleType
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.sql.types import *

In [2]:
# Build (or reuse) the notebook's SparkSession on a local master and expose
# its SparkContext as `sc`.
# NOTE(review): the master is local[*], so the executor/cores.max settings
# below presumably have no effect in this mode -- confirm before relying on them.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("xor")
    .config("spark.executor.memory", '2g')
    .config('spark.executor.cores', '1')
    .config('spark.cores.max', '1')
    .config("spark.driver.memory", '1g')
    .getOrCreate()
)

sc = spark.sparkContext

In [3]:
# Explicit schema for the CSV load: one (column name, Spark type) pair per
# field, in file column order. Every field is declared non-nullable.
data_schema = StructType([
    StructField(field_name, field_type(), False)
    for field_name, field_type in (
        ('Date', StringType),
        ('Customer_Code', DoubleType),
        ('Gender', IntegerType),
        ('Foreigner_Index', IntegerType),
        ('Channel', StringType),
        ('Province_Name', StringType),
        ('Active', IntegerType),
        ('Segmentation', IntegerType),
        ('Savings_Account', IntegerType),
        ('Guarantees', IntegerType),
        ('Current_Accounts', IntegerType),
        ('Derivative', IntegerType),
        ('Payroll_Account', IntegerType),
        ('Junior_Account', IntegerType),
        ('More_Particular_Account', IntegerType),
        ('Particular_Account', IntegerType),
        ('Particular_Plus_Account', IntegerType),
        ('Short_Term_Deposits', IntegerType),
        ('Medium_Term_Deposits', IntegerType),
        ('Long_Term_Deposits', IntegerType),
        ('e-Account', IntegerType),
        ('Funds', IntegerType),
        ('Mortgage', IntegerType),
        ('Pensions', IntegerType),
        ('Loans', IntegerType),
        ('Taxes', IntegerType),
        ('Credit_Card', IntegerType),
        ('Securities', IntegerType),
        ('Home_Account', IntegerType),
        ('Payroll', IntegerType),
        ('Pensions_two', IntegerType),
        ('Direct_Debit', IntegerType),
        ('Age_Range', IntegerType),
        ('Months_Range', IntegerType),
        ('Income_Range', IntegerType),
    )
])

In [4]:
# Read the cleaned CSV using the explicit schema (the file's first line is a
# header row) and mark the resulting DataFrame for caching.
df = (
    spark.read
    .schema(data_schema)
    .option("header", True)
    .csv('santander_df_clean.csv')
    .cache()
)

## Preprocessing

## Modeling

In [5]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [15]:
# List the column names of the loaded DataFrame.
# NOTE(review): the recorded output below lists 'Months_at_Bank' and has no
# 'Months_Range', which does not match data_schema; this cell's execution
# count is also out of sequence, so the output is likely stale -- re-run.
df.columns

['Date',
 'Customer_Code',
 'Gender',
 'Months_at_Bank',
 'Foreigner_Index',
 'Channel',
 'Province_Name',
 'Active',
 'Segmentation',
 'Savings_Account',
 'Guarantees',
 'Current_Accounts',
 'Derivative',
 'Payroll_Account',
 'Junior_Account',
 'More_Particular_Account',
 'Particular_Account',
 'Particular_Plus_Account',
 'Short_Term_Deposits',
 'Medium_Term_Deposits',
 'Long_Term_Deposits',
 'e-Account',
 'Funds',
 'Mortgage',
 'Pensions',
 'Loans',
 'Taxes',
 'Credit_Card',
 'Securities',
 'Home_Account',
 'Payroll',
 'Pensions_two',
 'Direct_Debit',
 'Age_Range',
 'Income_Range']

In [6]:
# Wide table: the customer key plus the 24 account/product columns
# (Savings_Account through Direct_Debit), one column per product.
ratings = df.select([
    'Customer_Code',
    'Savings_Account',
    'Guarantees',
    'Current_Accounts',
    'Derivative',
    'Payroll_Account',
    'Junior_Account',
    'More_Particular_Account',
    'Particular_Account',
    'Particular_Plus_Account',
    'Short_Term_Deposits',
    'Medium_Term_Deposits',
    'Long_Term_Deposits',
    'e-Account',
    'Funds',
    'Mortgage',
    'Pensions',
    'Loans',
    'Taxes',
    'Credit_Card',
    'Securities',
    'Home_Account',
    'Payroll',
    'Pensions_two',
    'Direct_Debit',
])

In [5]:
# Exploratory stacking of two product columns under the customer key.
# union() matches columns by position and keeps the first frame's names, so
# the Payroll_Account values end up under the 'Current_Accounts' header and
# the stacked rows no longer record which product they came from.
currentdf = df.select(['Customer_Code', 'Current_Accounts'])
payrolldf = df.select(['Customer_Code', 'Payroll_Account'])
nextdf = currentdf.union(payrolldf)
nextdf.show(n=3)

+-------------+----------------+
|Customer_Code|Current_Accounts|
+-------------+----------------+
|      15930.0|               1|
|      15930.0|               1|
|      15930.0|               1|
+-------------+----------------+
only showing top 3 rows



In [None]:
# Reshape the wide per-product columns into the long
# (Customer_Code, Item, Rating) layout that the ALS cell below is configured
# for (userCol="Customer_Code", itemCol="Item", ratingCol="Rating").
# Built from `df` rather than from the earlier wide `ratings`, so re-running
# this cell always gives the same result.
from pyspark.sql import functions as F

# The product columns are the contiguous run Savings_Account..Direct_Debit in
# data_schema (the same 24 columns selected for the wide table). The position
# of a name in `product_cols` is its integer Item id, so keep this list to map
# recommendations back to product names.
_all_cols = df.columns
product_cols = _all_cols[
    _all_cols.index('Savings_Account'): _all_cols.index('Direct_Debit') + 1
]

ratings = (
    df.select(
        'Customer_Code',
        # posexplode emits one (position, value) row per array element:
        # position -> integer Item id, value -> that product's column value.
        F.posexplode(F.array(*[df[c] for c in product_cols])).alias('Item', 'Rating'),
    )
    # The same customer appears on several rows of the source data, so
    # collapse to one rating per (customer, product) pair.
    # NOTE(review): max() keeps the largest value seen for the pair
    # (presumably "ever held" for 0/1 flags) -- confirm this is the intended
    # aggregation.
    .groupBy('Customer_Code', 'Item')
    .agg(F.max('Rating').alias('Rating'))
)

In [None]:
# 80/20 train/test split. A fixed seed keeps the split, and therefore the
# reported RMSE, reproducible across Restart & Run All.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Fit ALS on the training split. coldStartStrategy="drop" removes rows whose
# prediction would be NaN (users/items unseen in training), so the evaluation
# metric stays finite. The seed fixes the random factor initialisation, making
# the fit reproducible.
als = ALS(maxIter=5, regParam=0.01, userCol="Customer_Code", itemCol="Item", ratingCol="Rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [None]:
# Score the held-out split with the fitted model and report RMSE between the
# "Rating" label and the model's "prediction" column.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = ", rmse, sep="")

In [None]:
# Top 10 recommended items (values of the "Item" column) for every customer.
userRecs = model.recommendForAllUsers(numItems=10)
# Top 10 recommended customers for every item. The variable keeps its
# original name; "movie" here just means an item of the ALS model.
movieRecs = model.recommendForAllItems(numUsers=10)