<a href="https://colab.research.google.com/github/stevejj4/Insurance-data-lifecycle/blob/main/Interactions_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install google-cloud-bigquery
!pip install pyspark==3.1.2

from google.colab import auth
auth.authenticate_user()

from google.cloud import bigquery
import pandas as pd
from pyspark.sql import SparkSession




In [4]:
# Authenticating and initializing BigQuery client
project_id = 'river-messenger-430112-e1'
client = bigquery.Client(project=project_id)

In [5]:
# Querying to get interactions table
query_interactions = """
SELECT * FROM `river-messenger-430112-e1.Insurance_data.interactions`;
"""

In [6]:
# Executing the query and converting to a pandas DataFrame
df_interactions = client.query(query_interactions).to_dataframe()

In [7]:
# Displaying the first few rows of the pandas DataFrame
#print(df_interactions.head())

In [8]:
# Initializing SparkSession
spark = SparkSession.builder \
    .appName('BigQuerySparkApp') \
    .getOrCreate()

# Converting pandas DataFrame to Spark DataFrame
# Using iterrows() instead of iteritems() to iterate over DataFrame rows
spark_df_interactions = spark.createDataFrame(df_interactions.to_dict('records'))

# schema and first few rows
spark_df_interactions.printSchema()
#spark_df_interactions.show(5)

root
 |-- CustomerID: long (nullable = true)
 |-- InteractionDate: string (nullable = true)
 |-- InteractionID: long (nullable = true)
 |-- InteractionOutcome: string (nullable = true)
 |-- InteractionType: string (nullable = true)



In [9]:
from pyspark.sql.functions import col, when

# Handling missing values
spark_df_interactions = spark_df_interactions.fillna({'InteractionOutcome': 'not resolved'})

# Display the updated DataFrame
spark_df_interactions.show(5)


+----------+---------------+-------------+------------------+---------------+
|CustomerID|InteractionDate|InteractionID|InteractionOutcome|InteractionType|
+----------+---------------+-------------+------------------+---------------+
|      1599|     2023-07-19|          518|        Unresolved|           Call|
|       488|     2023-07-19|          678|          Resolved|           Call|
|      1942|     2023-07-19|          717|          Resolved|           Call|
|      1065|     2023-07-19|          822|        Unresolved|           Call|
|       988|     2023-07-19|         3839|        Unresolved|           Call|
+----------+---------------+-------------+------------------+---------------+
only showing top 5 rows



In [15]:
# Creating a satisfaction score feature
spark_df_interactions = spark_df_interactions.withColumn(
    'SatisfactionScore', when(col('InteractionOutcome') == 'Resolved', 1).otherwise(0)
)
# Display the updated DataFrame with new features
spark_df_interactions.show(5)


+----------+---------------+-------------+------------------+---------------+-----------------+----+-----+---+
|CustomerID|InteractionDate|InteractionID|InteractionOutcome|InteractionType|SatisfactionScore|Year|Month|Day|
+----------+---------------+-------------+------------------+---------------+-----------------+----+-----+---+
|      1599|     2023-07-19|          518|        Unresolved|           Call|                0|2023|    7| 19|
|       488|     2023-07-19|          678|          Resolved|           Call|                1|2023|    7| 19|
|      1942|     2023-07-19|          717|          Resolved|           Call|                1|2023|    7| 19|
|      1065|     2023-07-19|          822|        Unresolved|           Call|                0|2023|    7| 19|
|       988|     2023-07-19|         3839|        Unresolved|           Call|                0|2023|    7| 19|
+----------+---------------+-------------+------------------+---------------+-----------------+----+-----+---+
o

In [17]:
satisfaction_distribution = spark_df_interactions.groupBy('SatisfactionScore').count()
satisfaction_distribution.show()
#Distribution of satisfaction score within each interaction type
satisfaction_by_type = spark_df_interactions.groupBy('InteractionType', 'SatisfactionScore').count()
satisfaction_by_type.show()

+-----------------+-----+
|SatisfactionScore|count|
+-----------------+-----+
|                1| 2503|
|                0| 2497|
+-----------------+-----+

+---------------+-----------------+-----+
|InteractionType|SatisfactionScore|count|
+---------------+-----------------+-----+
|          Email|                1|  848|
|           Call|                0|  854|
|          Visit|                0|  814|
|          Visit|                1|  867|
|           Call|                1|  788|
|          Email|                0|  829|
+---------------+-----------------+-----+



In [None]:
# Extract year, month, day from InteractionDate
from pyspark.sql.functions import year, month, dayofmonth

spark_df_interactions = spark_df_interactions.withColumn('Year', year(col('InteractionDate')))
spark_df_interactions = spark_df_interactions.withColumn('Month', month(col('InteractionDate')))
spark_df_interactions = spark_df_interactions.withColumn('Day', dayofmonth(col('InteractionDate')))

In [11]:
# Number of interactions per day
interactions_per_day = spark_df_interactions.groupBy('Year', 'Month', 'Day').count()
interactions_per_day.show(5)

# Number of resolved and not resolved interactions
interaction_outcomes = spark_df_interactions.groupBy('InteractionOutcome').count()
interaction_outcomes.show(5)

# Number of interactions per interaction type
interactions_per_type = spark_df_interactions.groupBy('InteractionType').count()
interactions_per_type.show(5)

# Number of interactions per customer
interactions_per_customer = spark_df_interactions.groupBy('CustomerID').count()
interactions_per_customer.show(5)


+----+-----+---+-----+
|Year|Month|Day|count|
+----+-----+---+-----+
|2024|    3| 19|   13|
|2024|    2| 10|   11|
|2024|    1| 13|   14|
|2023|    9|  4|   20|
|2023|    8| 10|   14|
+----+-----+---+-----+
only showing top 5 rows

+------------------+-----+
|InteractionOutcome|count|
+------------------+-----+
|        Unresolved| 2497|
|          Resolved| 2503|
+------------------+-----+

+---------------+-----+
|InteractionType|count|
+---------------+-----+
|           Call| 1642|
|          Email| 1677|
|          Visit| 1681|
+---------------+-----+

+----------+-----+
|CustomerID|count|
+----------+-----+
|        26|    3|
|        29|    2|
|      1950|    3|
|      1806|    4|
|       964|    1|
+----------+-----+
only showing top 5 rows



In [12]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot interaction outcomes
plt.figure(figsize=(10, 6))
sns.barplot(x='InteractionOutcome', y='count', data=pdf_interaction_outcomes)
plt.title('Number of Interactions by Outcome')
plt.xlabel('Interaction Outcome')
plt.ylabel('Count')
plt.show()


NameError: name 'pdf_interaction_outcomes' is not defined

<Figure size 1000x600 with 0 Axes>

In [None]:
import torch
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Converting Spark DataFrame to Pandas DataFrame for processing with PyTorch
df_interactions = spark_df_interactions.toPandas()

# Selecting features and target
features = df_interactions[['InteractionType', 'Year', 'Month', 'Day']]
target = df_interactions['SatisfactionScore']

# One-hot encode categorical features and standardize numerical features
features = pd.get_dummies(features, columns=['InteractionType'])
scaler = StandardScaler()
features = scaler.fit_transform(features)

# Spliting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Converting data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

# Defining the model architecture
class InteractionModel(nn.Module):
    def __init__(self):
        super(InteractionModel, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 50)
        self.fc2 = nn.Linear(50, 10)
        self.fc3 = nn.Linear(10, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

# Initializing the model, loss function, and optimizer
model = InteractionModel()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train.unsqueeze(1))
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

# Evaluate the model
model.eval()
with torch.no_grad():
    predictions = model(X_test)
    accuracy = ((predictions.squeeze() > 0.5) == y_test).float().mean()
    print(f'Accuracy: {accuracy.item() * 100:.2f}%')
