<a href="https://colab.research.google.com/github/pratikesh3232/Pyspark_intro/blob/main/pyspark_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

In [19]:
# Create the Spark session used by every later cell; getOrCreate() reuses an
# already-running session instead of starting a second one.
spk = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

In [20]:
# Location of the credit dataset CSV.
credit_csv_path = '/content/sample_data/credit_data.csv'

# The first row of the file holds the column names; column types are inferred
# from the data rather than all being read as strings.
df = spk.read.csv(credit_csv_path, header=True, inferSchema=True)

In [21]:
# Preview the first rows of the loaded dataset to sanity-check the parse.
df.show()

+---+------+------+------------+---------------------+------------------------+-----------+-----------+-----------------+-------------+------------------+------------------+------------------+-------------+-----------------+
|Age|Gender|Income|Credit Score|Credit History Length|Number of Existing Loans|Loan Amount|Loan Tenure|Existing Customer|        State|              City|         LTV Ratio|Employment Profile|Profile Score|       Occupation|
+---+------+------+------------+---------------------+------------------------+-----------+-----------+-----------------+-------------+------------------+------------------+------------------+-------------+-----------------+
| 31|  Male| 36000|         604|                  487|                       5|     109373|        221|               No|    Karnataka|            Mysuru| 90.94342996168837|          Salaried|           77|           Doctor|
| 25|  Male| 50000|         447|                  386|                       2|     150000|         

In [22]:
# List the column names; the feature and label columns are picked from these.
df.columns

['Age',
 'Gender',
 'Income',
 'Credit Score',
 'Credit History Length',
 'Number of Existing Loans',
 'Loan Amount',
 'Loan Tenure',
 'Existing Customer',
 'State',
 'City',
 'LTV Ratio',
 'Employment Profile',
 'Profile Score',
 'Occupation']

In [23]:
# Predictor columns packed into the feature vector. 'Credit Score' is left out
# because it is the regression label, and the text-valued columns (Gender,
# State, City, ...) are not included.
feature_cols = [
    'Age',
    'Income',
    'Credit History Length',
    'Number of Existing Loans',
    'Loan Amount',
    'Loan Tenure',
    'LTV Ratio',
    'Profile Score',
]

# Assembler that combines the columns above into a single vector column "X".
x = VectorAssembler(inputCols=feature_cols, outputCol="X")

In [24]:
# Apply the assembler: df1 carries all of df's columns plus the new vector column "X".
df1 = x.transform(df)

In [25]:
# Confirm that the assembled "X" column was appended.
df1.show()

+---+------+------+------------+---------------------+------------------------+-----------+-----------+-----------------+-------------+------------------+------------------+------------------+-------------+-----------------+--------------------+
|Age|Gender|Income|Credit Score|Credit History Length|Number of Existing Loans|Loan Amount|Loan Tenure|Existing Customer|        State|              City|         LTV Ratio|Employment Profile|Profile Score|       Occupation|                   X|
+---+------+------+------------+---------------------+------------------------+-----------+-----------+-----------------+-------------+------------------+------------------+------------------+-------------+-----------------+--------------------+
| 31|  Male| 36000|         604|                  487|                       5|     109373|        221|               No|    Karnataka|            Mysuru| 90.94342996168837|          Salaried|           77|           Doctor|[31.0,36000.0,487...|
| 25|  Male| 500

In [26]:
# Keep only what the model needs: the feature vector and the label column.
df_final = df1.select('X', 'Credit Score')

In [27]:
# Preview the modelling frame (feature vector + label).
df_final.show()

+--------------------+------------+
|                   X|Credit Score|
+--------------------+------------+
|[31.0,36000.0,487...|         604|
|[25.0,50000.0,386...|         447|
|[62.0,178000.0,50...|         850|
|[69.0,46000.0,349...|         668|
|[52.0,132000.0,55...|         601|
|[64.0,127000.0,15...|         850|
|[29.0,15000.0,89....|         378|
|[30.0,82000.0,610...|         424|
|[52.0,119000.0,27...|         753|
|[39.0,101000.0,42...|         575|
|[69.0,166000.0,38...|         384|
|[64.0,167000.0,43...|         850|
|[68.0,88000.0,186...|         554|
|[19.0,25000.0,227...|         350|
|[59.0,60000.0,179...|         562|
|[26.0,53000.0,18....|         547|
|[57.0,36000.0,409...|         821|
|[42.0,115000.0,30...|         502|
|[34.0,44000.0,441...|         300|
|[20.0,57000.0,11....|         733|
+--------------------+------------+
only showing top 20 rows



In [28]:
from pyspark.ml.regression import LinearRegression

In [29]:
# Train/test split: roughly 70% of rows for training, 30% held out for testing.
# A fixed seed keeps the split repeatable, so the fitted model and the error
# metrics reported below do not change from one run to the next on the same
# input data.
train_data, test_data = df_final.randomSplit([0.7, 0.3], seed=42)

In [30]:
# Configure (but do not yet fit) a linear regression that predicts
# 'Credit Score' from the assembled feature vector column 'X'.
rg = LinearRegression(
    labelCol='Credit Score',
    featuresCol='X',
)

In [31]:
# Fit the regression on the training split.
# NOTE(review): this rebinds `rg` from the unfitted estimator to the fitted
# model, so re-running this cell alone would call .fit() on the fitted model
# rather than on the estimator — re-run the cell that constructs `rg` first.
rg = rg.fit(train_data)

In [32]:
# Evaluate the fitted model on the held-out test split.
# `pred` is an evaluation summary rather than a bare predictions frame: the
# cells below read its .predictions DataFrame and its error-metric attributes.
pred = rg.evaluate(test_data)

In [33]:
# Show the test rows side by side with the model's predicted credit score.
pred.predictions.show()

+--------------------+------------+------------------+
|                   X|Credit Score|        prediction|
+--------------------+------------+------------------+
|[18.0,9000.0,18.0...|         515| 495.0072725082209|
|[18.0,9000.0,90.0...|         760| 762.5185989824898|
|[18.0,9000.0,91.0...|         533| 542.0439161517799|
|[18.0,9000.0,106....|         763| 761.7194319193636|
|[18.0,9000.0,119....|         541| 535.4990994094455|
|[18.0,9000.0,213....|         375| 376.1405615099224|
|[18.0,9000.0,222....|         747| 765.3921792308938|
|[18.0,9000.0,235....|         328| 331.5784352226746|
|[18.0,9000.0,340....|         610| 598.1745734989352|
|[18.0,9000.0,350....|         341| 330.3912854482922|
|[18.0,9000.0,350....|         341| 330.3912854482922|
|[18.0,9000.0,400....|         658| 657.7567791483715|
|[18.0,9000.0,470....|         364|378.90969855508627|
|[18.0,9000.0,498....|         583| 602.9500346640893|
|[18.0,9000.0,500....|         300| 321.1141358721219|
|[18.0,900

In [36]:
# Test-set error metrics, displayed as a tuple: (mean absolute error, mean squared error).
pred.meanAbsoluteError, pred.meanSquaredError

(14.114069209488255, 268.6080259418669)