<a href="https://colab.research.google.com/github/stevejj4/Insurance-data-lifecycle/blob/main/Machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install google-cloud-bigquery
!pip install pyspark==3.1.2

from google.colab import auth
auth.authenticate_user()

from google.cloud import bigquery
import pandas as pd
from pyspark.sql import SparkSession




In [26]:
# Authenticating and initializing BigQuery client
project_id = 'river-messenger-430112-e1'
client = bigquery.Client(project=project_id)

In [27]:
# Querying to get interactions table, specifying the location
query_ML_Team = """
SELECT * FROM `river-messenger-430112-e1.Insurance_data.ML_team`
"""


In [28]:
# Executing the query and converting to a pandas DataFrame
df_ml = client.query(query_ML_Team).to_dataframe()

In [29]:
# Initializing SparkSession
spark = SparkSession.builder \
    .appName('BigQuerySparkApp') \
    .getOrCreate()


In [30]:
# Combined query to join customers, policies, claims, and ML_team tables
query = """
SELECT
    c.CustomerID,
    c.Age,
    p.PolicyType,
    cl.ClaimAmount,
    cl.ClaimStatus,
    mt.DurationIndex,
    mt.PolicyTypeIndex,
    mt.PremiumAmount
FROM
    `river-messenger-430112-e1.Insurance_data.customers` c
JOIN
    `river-messenger-430112-e1.Insurance_data.policies` p ON c.CustomerID = p.CustomerID
JOIN
    `river-messenger-430112-e1.Insurance_data.claims` cl ON c.CustomerID = cl.CustomerID
JOIN
    `river-messenger-430112-e1.Insurance_data.ML_team` mt ON c.CustomerID = mt.CustomerID;
"""

# Execute the query and convert to a pandas DataFrame
df = client.query(query).to_dataframe()

# Display the first few rows
df.head()


Unnamed: 0,CustomerID,Age,PolicyType,ClaimAmount,ClaimStatus,DurationIndex,PolicyTypeIndex,PremiumAmount
0,3,53,Auto,10903.28,Pending,0,3,3644.16
1,9,43,Auto,8817.71,Rejected,1,3,4010.11
2,9,43,Auto,18213.56,Rejected,1,3,4010.11
3,9,43,Auto,1700.61,Rejected,1,3,4010.11
4,24,39,Auto,4513.44,Pending,0,3,3700.61


In [31]:
# dropping policyType and Customer ID
df = df.drop(columns=['PolicyType', 'CustomerID'])

In [32]:
# If using PySpark, convert the pandas DataFrame to a Spark DataFrame
# Converting pandas DataFrame to Spark DataFrame (Should be row wise)
# Using iterrows() instead of iteritems() to iterate over DataFrame rows
# iteritems() used for column wise data
# orient parameters for flexibility
spark_df = spark.createDataFrame(df.to_dict('records'))
# Show the schema and first few rows
spark_df.printSchema()
spark_df.show(5)

root
 |-- Age: long (nullable = true)
 |-- ClaimAmount: double (nullable = true)
 |-- ClaimStatus: string (nullable = true)
 |-- DurationIndex: long (nullable = true)
 |-- PolicyTypeIndex: long (nullable = true)
 |-- PremiumAmount: double (nullable = true)

+---+-----------+-----------+-------------+---------------+-------------+
|Age|ClaimAmount|ClaimStatus|DurationIndex|PolicyTypeIndex|PremiumAmount|
+---+-----------+-----------+-------------+---------------+-------------+
| 53|   10903.28|    Pending|            0|              3|      3644.16|
| 43|    8817.71|   Rejected|            1|              3|      4010.11|
| 43|   18213.56|   Rejected|            1|              3|      4010.11|
| 43|    1700.61|   Rejected|            1|              3|      4010.11|
| 39|    4513.44|    Pending|            0|              3|      3700.61|
+---+-----------+-----------+-------------+---------------+-------------+
only showing top 5 rows



In [33]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Prepare the features and label for ML
feature_columns = ['Age', 'ClaimAmount', 'DurationIndex', 'PolicyTypeIndex', 'PremiumAmount']
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Ensure ClaimAmount is treated as a label (if needed)
data_ml = vector_assembler.transform(spark_df)

# Split the data into training and test sets
train_data, test_data = data_ml.randomSplit([0.7, 0.3], seed=42)

# Train a Linear Regression model
lr = LinearRegression(featuresCol='features', labelCol='ClaimAmount')
lr_model = lr.fit(train_data)

# Make predictions on the test data
predictions = lr_model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol='ClaimAmount', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse}")

# Show some predictions
predictions.select('ClaimAmount', 'prediction').show(5)


Root Mean Squared Error (RMSE) on test data: 5.4880280285650015e-12
+-----------+------------------+
|ClaimAmount|        prediction|
+-----------+------------------+
|    2339.63|2339.6299999999915|
|    5098.49| 5098.489999999989|
|    6370.89| 6370.889999999989|
|    6785.17|  6785.16999999999|
|    9174.48| 9174.479999999987|
+-----------+------------------+
only showing top 5 rows



In [34]:
from pyspark.sql.functions import abs
from pyspark.sql.functions import abs, when, col # Impor

# Define a tolerance for correctness
tolerance = 100  # This is an example value; adjust as needed

# Add a column that represents the absolute error
predictions = predictions.withColumn('absolute_error', abs(col('ClaimAmount') - col('prediction')))

# Add a column that indicates whether the prediction is within the tolerance
predictions = predictions.withColumn('is_correct', when(col('absolute_error') <= tolerance, 1).otherwise(0))

# Count the number of correct predictions
correct_predictions_count = predictions.agg({'is_correct': 'sum'}).collect()[0][0]
total_predictions_count = predictions.count()

print(f"Number of correct predictions: {correct_predictions_count}")
print(f"Total number of predictions: {total_predictions_count}")
print(f"Accuracy (percentage): {100 * correct_predictions_count / total_predictions_count:.2f}%")


Number of correct predictions: 860
Total number of predictions: 860
Accuracy (percentage): 100.00%
