In [0]:
%sh
# List the contents of the data directory that the notebook reads its CSV from.
ls /team5/data

In [1]:
%spark
// sc.defaultParallelism is the default number of partitions Spark uses for
// distributed operations when none is given; it is not the driver's core
// count, so it is reported under its own name.
println(s"Default parallelism: ${sc.defaultParallelism}")

// Cores configured for the driver process (falls back to 1 when the setting is absent)
val driverCores = spark.conf.get("spark.driver.cores", "1").toInt
println(s"Driver cores: $driverCores")

// Number of executors and their cores.
// NOTE(review): both values fall back to "1" when the setting is absent
// (e.g. presumably under dynamic allocation), in which case the total below
// is only a guess - confirm against the cluster configuration.
val executorCores = spark.conf.get("spark.executor.cores", "1").toInt
val numExecutors = spark.conf.get("spark.executor.instances", "1").toInt
println(s"Executors: $numExecutors")
println(s"Executor cores: $executorCores")
println(s"Total available cores: ${numExecutors * executorCores}")


In [2]:
%python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pandasql as ps
import sqlite3


# Load the raw dataset. low_memory=False makes pandas parse the file in one pass
# rather than in internal chunks, which avoids mixed-dtype inference on columns.
df = pd.read_csv('/team5/data/w_df.csv', delimiter=',', low_memory=False)  

In [3]:
%python
import os
import multiprocessing

# os.cpu_count() reports logical CPUs and may return None when the count
# cannot be determined, so fall back to 1 instead of failing on `None // 2`.
logical_cpu_count = os.cpu_count() or 1

# Rough estimate of physical cores: assumes two hardware threads per core,
# clamped so that a single-CPU machine never reports 0.
physical_cores = max(1, logical_cpu_count // 2)
print(f"Number of Physical Cores: {physical_cores}")

# Logical cores as seen by the OS (this count already includes hardware threads).
total_cores = multiprocessing.cpu_count()
print(f"Number of Logical Cores: {total_cores}")


In [4]:
%python
# sc.defaultParallelism is Spark's default number of partitions for distributed
# operations, not the number of driver cores, so it is labelled accordingly.
print(f"Default parallelism: {sc.defaultParallelism}")


In [5]:
%python
# (rows, columns) of the raw frame as loaded from the CSV
df.shape

In [6]:
# Descriptive statistics for the Prime and Sinistre columns
print(df[['Prime', 'Sinistre']].describe())

# Side-by-side histograms (50 bins, KDE overlay), one panel per column
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = [('Prime', 'blue'), ('Sinistre', 'green')]
for ax, (column, colour) in zip(axes, panels):
    sns.histplot(df[column], bins=50, kde=True, color=colour, ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)

fig.tight_layout()
plt.show()


In [7]:
# Denominator for every share below: all rows of the raw frame
total_count = len(df)

# Row counts by sign of Prime
negative_prime = df['Prime'].lt(0).sum()
zero_prime = df['Prime'].eq(0).sum()
positive_prime = df['Prime'].gt(0).sum()

# Row counts for Sinistre. "Non-zero" is counted as strictly positive, so any
# negative or missing value would show up as the two shares not adding to 100.
zero_sinistre = df['Sinistre'].eq(0).sum()
nonzero_sinistre = df['Sinistre'].gt(0).sum()

# Turn the counts into percentages of all rows
prime_percentages = {
    label: count / total_count * 100
    for label, count in [
        ('Negative Prime', negative_prime),
        ('Zero Prime', zero_prime),
        ('Positive Prime', positive_prime),
    ]
}

sinistre_percentages = {
    label: count / total_count * 100
    for label, count in [
        ('Zero Sinistre', zero_sinistre),
        ('Non-Zero Sinistre', nonzero_sinistre),
    ]
}

print("Prime Percentages:", prime_percentages)
print("Sinistre Percentages:", sinistre_percentages)
# Takeaway: rows with a negative or zero Prime can be dropped (their meaning is
# unclear); the Sinistre split looks fine.

In [8]:
# Sanity check: total of each percentage breakdown (100 when every row falls into one of the categories)
sum(prime_percentages.values()), sum(sinistre_percentages.values())


In [9]:
# Keep only rows with a strictly positive Prime. Rows with a missing Prime are
# dropped as well, because NaN > 0 evaluates to False.
# The explicit .copy() makes df_cleaned an independent frame, so adding columns
# to it later does not trigger pandas' SettingWithCopyWarning.
df_cleaned = df[df['Prime'] > 0].copy()

# Check the new shape of the DataFrame
print(f"Original shape: {df.shape}")
print(f"New shape after dropping negative/zero Prime: {df_cleaned.shape}")

# Share of rows removed by the filter
dropped_percentage = (1 - (df_cleaned.shape[0] / df.shape[0])) * 100
print(f"Percentage of rows dropped: {dropped_percentage:.2f}%")
# The percentage is consistent with the breakdown computed earlier.
# Output recorded from a previous run:
#   Original shape: (1283819, 30)
#   New shape after dropping negative/zero Prime: (1243222, 30)
#   Percentage of rows dropped: 3.16%

In [10]:
# Summary statistics of Sinistre on the frame filtered to Prime > 0
print(df_cleaned['Sinistre'].describe())


In [11]:
# IQR rule: flag Sinistre values more than 1.5 * IQR below Q1 or above Q3
sinistre = df_cleaned['Sinistre']
Q1 = sinistre.quantile(0.25)
Q3 = sinistre.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(f"Outlier bounds: {lower_bound}, {upper_bound}")

is_outlier = sinistre.lt(lower_bound) | sinistre.gt(upper_bound)
outliers = df_cleaned[is_outlier]
print(f"Number of outliers in Sinistre: {len(outliers)}")
# Recorded result: 69736 outliers in Sinistre. In theory these should be
# investigated to decide whether to drop, keep or cap them; without enough
# domain knowledge to judge whether they are erroneous, they are kept and
# assumed to reflect high- and low-risk profiles.

In [12]:
# Quick look at the 10 largest Sinistre values among the flagged outliers.
print(outliers.sort_values(by='Sinistre', ascending=False).head(10))
# Observation (from these 10 rows only): a good share of the highest Sinistre values also
# come with a high Prime, which might make sense (if my understanding of Prime is correct)
# and may indeed reflect high-risk clients. Open question: Prime presumably should not be
# discretized - to be confirmed.
# Caveat: if the data is synthetic, all of this is meaningless and the model will perform poorly;
# we can only hope the relationships here make sense.
# TODO: keep in mind the "fract vs n fract" issue and the values that make no sense; handle them later.


In [13]:
# Reminder: the data still needs further cleaning.

In [14]:
# Per-row Sinistre-to-Prime ratio, kept as a standalone Series so that
# df_cleaned itself is left untouched
sinistre_prime_ratio = df_cleaned['Sinistre'].div(df_cleaned['Prime'])

# Centre and spread of the ratio
mean_ratio, median_ratio, std_ratio = (
    sinistre_prime_ratio.mean(),
    sinistre_prime_ratio.median(),
    sinistre_prime_ratio.std(),
)

mean_ratio, median_ratio, std_ratio


In [15]:
# Add the Sinistre/Prime ratio as a column. assign() returns a new frame rather
# than writing into df_cleaned in place; df_cleaned was produced by filtering
# df, so an in-place column assignment can raise SettingWithCopyWarning.
df_cleaned = df_cleaned.assign(
    sinistre_prime_ratio=df_cleaned['Sinistre'] / df_cleaned['Prime']
)

# Basic statistics for Sinistre / Prime ratio
mean_ratio = df_cleaned['sinistre_prime_ratio'].mean()
median_ratio = df_cleaned['sinistre_prime_ratio'].median()
std_ratio = df_cleaned['sinistre_prime_ratio'].std()

# Plot the distribution as a violin plot, with mean and median reference lines
plt.figure(figsize=(12, 6))

# Violin plot to show the distribution (inner=None hides the inner annotations)
sns.violinplot(data=df_cleaned, x='sinistre_prime_ratio', color='purple', inner=None)

# Plot the mean and median lines
plt.axvline(mean_ratio, color='red', linestyle='--', label=f'Mean: {mean_ratio:.2f}')
plt.axvline(median_ratio, color='green', linestyle='--', label=f'Median: {median_ratio:.2f}')


# Customize plot
plt.title('Distribution of Sinistre/Prime Ratio')
plt.xlabel('Sinistre / Prime')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

# Show the calculated statistics
mean_ratio, median_ratio, std_ratio


In [16]:
# Horizontal boxplot of the Sinistre/Prime ratio
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x=df_cleaned['sinistre_prime_ratio'], color='purple', ax=ax)

# Dashed reference lines for the mean and the median
reference_lines = [(mean_ratio, 'red', 'Mean'), (median_ratio, 'green', 'Median')]
for value, colour, stat_label in reference_lines:
    ax.axvline(value, color=colour, linestyle='--', label=f'{stat_label}: {value:.2f}')

# Titles, legend and grid
ax.set_title('Boxplot of Sinistre/Prime Ratio')
ax.set_xlabel('Sinistre / Prime')
ax.legend()
ax.grid(True)
plt.show()


In [17]:
# Prime against Sinistre, with each point coloured by its Sinistre/Prime ratio
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    x=df_cleaned['Prime'],
    y=df_cleaned['Sinistre'],
    hue=sinistre_prime_ratio,
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Prime vs Sinistre (Colored by Sinistre/Prime Ratio)')
ax.set_xlabel('Prime')
ax.set_ylabel('Sinistre')
plt.show()
