<a href="https://colab.research.google.com/github/stevejj4/Insurance-data-lifecycle/blob/main/Interactions_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
# Install required packages
!pip install pyspark==3.1.2

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, year, month, dayofmonth, udf
from pyspark.sql.types import StringType
from calendar import month_name
import pandas as pd

# Start (or reuse) the Spark session used for the processing below.
spark = SparkSession.builder \
    .appName('DataProcessing') \
    .getOrCreate()

# Load the raw interactions with pandas; the rows are handed to Spark as
# plain Python dicts.
df_interactions = pd.read_csv('interactions.csv')

# pandas marks a missing text cell with float NaN. Left as-is, that float sits
# in the same column as strings when Spark infers the schema, and it is not a
# null that fillna() below could replace. Convert those cells to None so they
# reach Spark as real nulls. Only object-dtype (text) columns are touched;
# numeric columns are passed through exactly as pandas produced them.
text_columns = set(df_interactions.select_dtypes(include='object').columns)
interaction_records = [
    {
        name: None if name in text_columns and pd.isna(value) else value
        for name, value in record.items()
    }
    for record in df_interactions.to_dict('records')
]
spark_df_interactions = spark.createDataFrame(interaction_records)

# Handle missing values: an interaction without an outcome counts as not resolved.
spark_df_interactions = spark_df_interactions.fillna({'InteractionOutcome': 'not resolved'})

# Create a satisfaction score feature: 1 only when the outcome is exactly
# 'Resolved', 0 for everything else (including the filled-in 'not resolved').
spark_df_interactions = spark_df_interactions.withColumn(
    'SatisfactionScore', when(col('InteractionOutcome') == 'Resolved', 1).otherwise(0)
)

# Extract year, month, day from InteractionDate.
# NOTE(review): InteractionDate is read by pandas without date parsing, so this
# relies on Spark casting the text to a date — confirm the CSV's date format.
spark_df_interactions = spark_df_interactions.withColumn('Year', year(col('InteractionDate')))
spark_df_interactions = spark_df_interactions.withColumn('Month', month(col('InteractionDate')))
spark_df_interactions = spark_df_interactions.withColumn('Day', dayofmonth(col('InteractionDate')))


def _month_number_to_name(month_number):
    """Return the calendar month name for a 1-12 month number, or None for a null month."""
    # A missing/unparseable InteractionDate yields a null Month; indexing
    # month_name with None would raise inside the UDF and fail the whole job.
    if month_number is None:
        return None
    return month_name[month_number]


# UDF that converts the month number to its month name.
month_to_name_udf = udf(_month_number_to_name, StringType())
spark_df_interactions = spark_df_interactions.withColumn('MonthName', month_to_name_udf(col('Month')))

# Drop the source columns that have been replaced by derived features (the
# numeric Month is superseded by MonthName), and rename InteractionType to
# InteractionChannel.
spark_df_interactions = spark_df_interactions.drop('InteractionDate', 'InteractionOutcome', 'Month')
spark_df_interactions = spark_df_interactions.withColumnRenamed('InteractionType', 'InteractionChannel')




In [69]:
# Install required packages
!pip install google-cloud-bigquery google-cloud-bigquery-storage

from google.colab import auth
from google.cloud import bigquery
import pandas as pd

# Google Cloud project that the BigQuery client below is bound to.
project_id = 'river-messenger-430112-e1'

# Authenticate the Colab user before the client is created.
auth.authenticate_user()

# BigQuery client for the project above; used by the upload helper.
client = bigquery.Client(project=project_id)

def upload_to_bigquery(df, table_name, dataset_id='Insurance_data'):
    """Load a pandas DataFrame into a BigQuery table, replacing its contents.

    Relies on the notebook-level ``client`` and ``project_id`` globals, so the
    authentication/client cell must have been run first.

    Args:
        df: DataFrame to upload (passed to ``client.load_table_from_dataframe``).
        table_name: Name of the destination table inside the dataset.
        dataset_id: Name of the destination dataset. Defaults to
            'Insurance_data', the dataset this notebook has always written to.

    Returns:
        None. A one-line summary with the loaded row count is printed.

    Errors raised by the BigQuery client are not caught here and propagate to
    the caller.
    """
    table_ref = bigquery.TableReference(
        bigquery.DatasetReference(project_id, dataset_id), table_name
    )

    # WRITE_TRUNCATE: existing rows in the destination table are replaced,
    # not appended to.
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )

    job = client.load_table_from_dataframe(
        df, table_ref, job_config=job_config
    )

    job.result()  # Block until the load job has finished.

    print(f"Loaded {job.output_rows} rows into {table_ref}.")

# 'spark_df_interactions' is the processed Spark DataFrame built in the earlier
# cell. Convert it to a pandas DataFrame first, then upload that.
processed_interactions_pdf = spark_df_interactions.toPandas()
upload_to_bigquery(processed_interactions_pdf, 'Processed_interactions')

Loaded 5000 rows into river-messenger-430112-e1.Insurance_data.Processed_interactions.
