In [0]:
%sh
# List the shared data directory (presumably to confirm the input file is present).
ls /team5/data

In [1]:
%sh
# NOTE(review): the version is unpinned, and no visible cell calls pandasql
# (it is only imported as `ps`) - confirm it is still needed.
pip install pandasql


# File 
flatFile.csv


In [3]:
%python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pandasql as ps
import sqlite3


# Load the labeled dataset; the file is tab-separated despite its .csv extension.
df = pd.read_csv(
    '/team5/data/LabeledFile.csv',
    sep='\t',
    low_memory=False,
)

# Basic overview
---


In [5]:
%python
# Basic dataset information: size and column names
row_count, column_count = df.shape
print(" DATASET OVERVIEW")
print(f"Total Rows: {row_count}")
print(f"Total Columns: {column_count}")
print("\nColumn Names:")
column_names = ", ".join(df.columns)
print(column_names)

# Dataset info :
- Shape: 4105377 rows, 31 columns

# High priority labels : 
- Sinistre
- age_objet_assuree
- age_client
- usage
- anciennete
- classe
- IsToutRisque
- Type_renouvellement_police
# To consider: 
- puissance
- energie
- valeur_venale
- valeur_neuve
- Charge_utile
- demographic Labels

In [7]:
%python
# Preview the first 20 rows of the raw frame.
df.head(20)

In [8]:
%python
# Duplicate analysis: flag every row that exactly repeats an earlier row
duplicates = df.duplicated()
duplicate_count = duplicates.sum()
print("\n DUPLICATE RECORDS")
print(f"Total Duplicate Rows: {duplicate_count}")
if duplicate_count > 0:
    print("\nDuplicate Rows Sample:")
    print(df.loc[duplicates].head())



# Duplicates Analysis / Removal
---

In [10]:
%python
# Drop exact duplicate rows and report how many were removed.
rows_before = df.shape[0]
print(f"Initial Rows: {rows_before}")
df = df.drop_duplicates()
# Print the row count (not the whole shape tuple) so the label matches the value.
print(f"Rows After Removing Duplicates: {df.shape[0]}")
print(f"Rows Dropped: {rows_before - df.shape[0]}")
# Recorded run: 4105377 rows before, 1631846 rows (31 columns) after

- Total Duplicate Rows: 2473531


- Rows before removing duplicates: 4105377, after: 1631846
- 4105377 - 1631846 = 2473531 rows dropped, which matches the total number of duplicate rows

## Now checking for duplicates in the PK 

In [14]:
%python
# Rows whose N_SOUSCRIP value already appeared in an earlier row
duplicates = df['N_SOUSCRIP'].duplicated()
num_duplicates = duplicates.sum()
print(num_duplicates)


- We've got 1262440 dups of N_SOUSCRIP

In [16]:
%python
# Distinct N_SOUSCRIP values, in order of first appearance
unique_values = pd.unique(df['N_SOUSCRIP'])
print(unique_values)



In [17]:
%python
# Printing the full Python list dumps every distinct id into the cell output;
# report the count and a short preview instead.
unique_values = df['N_SOUSCRIP'].unique().tolist()
print(f"{len(unique_values)} unique N_SOUSCRIP values, first 20: {unique_values[:20]}")


In [18]:
%python
# Rows repeating an earlier (N_SOUSCRIP, year) pair
pk_year_cols = ['N_SOUSCRIP', 'year']
duplicates = df.duplicated(subset=pk_year_cols, keep='first')
num_duplicates = duplicates.sum()
print(num_duplicates)


- We've got 1262440 duplicated PKs overall
- But only 402584 PKs are duplicated within the same year

In [20]:
%python
# Changing display settings: lift pandas' row and column truncation limits.
# NOTE(review): with max_rows unset, displaying a large frame renders every row.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)


In [21]:
%python
# Example: here the same subscriber appears to have insured a second object,
# so these repeated rows are at least meaningful.
# What has to go are rows where the same object repeats within the same year.
same_subscriber = df['N_SOUSCRIP'].eq(100681)
same_year = df['year'].eq(2018)
df.loc[same_subscriber & same_year]

In [22]:
%python
# Rows repeating an earlier (N_SOUSCRIP, year, N_OBJET_ASS) triple
object_key_cols = ['N_SOUSCRIP', 'year', 'N_OBJET_ASS']
duplicates = df.duplicated(subset=object_key_cols, keep='first')
num_duplicates = duplicates.sum()
print(num_duplicates)


- Okay we've got 238982 in this case

In [24]:
%python
# Sort IsToutRisque in descending order inside each (N_SOUSCRIP, year) group so
# that 'Yes' sorts ahead of 'No' (assuming those are the stored values), then keep
# the first row per (N_SOUSCRIP, year, N_OBJET_ASS): a 'Yes' row survives when one exists.
df = (
    df.sort_values(
        by=['N_SOUSCRIP', 'year', 'IsToutRisque'],
        ascending=[True, True, False],
    )
    .drop_duplicates(subset=['N_SOUSCRIP', 'year', 'N_OBJET_ASS'], keep='first')
)

In [25]:
%python
# Re-count (N_SOUSCRIP, year, N_OBJET_ASS) repeats on the current frame
checked_cols = ['N_SOUSCRIP', 'year', 'N_OBJET_ASS']
duplicates = df.duplicated(subset=checked_cols, keep='first')
num_duplicates = duplicates.sum()
print(num_duplicates)


In [26]:
%python
# Re-count (N_SOUSCRIP, year) repeats on the current frame
remaining_key_cols = ['N_SOUSCRIP', 'year']
duplicates = df.duplicated(subset=remaining_key_cols, keep='first')
num_duplicates = duplicates.sum()
print(num_duplicates)


In [27]:
%python
# (rows, columns) of the current frame
df.shape

Summary : 
We only dropped the duplicates where the same N_SOUSCRIP repeats for the same year and the same N_OBJET_ASS, with a single attribute changing seemingly at random between the repeated rows. Maybe the same N_SOUSCRIP updated their insurance for that object, but I don't buy it, so they are dropped. 
Now we're left with (1392864, 31) in our df.


In [29]:
%python
# Spot-check one subscriber / insured-object pair
df.loc[df['N_SOUSCRIP'].eq(642214) & df['N_OBJET_ASS'].eq('MOTOLRYXCBL0')]

Okay looks like it is less harmful, at least it makes sense now


# Missing Values Analysis / Handling
---

In [32]:
%python
# Missing value analysis: per-column null counts and percentages

missing_values = df.isnull().sum()
missing_percentages = 100 * missing_values / len(df)
missing_table = pd.concat(
    [missing_values, missing_percentages],
    axis=1,
    keys=['Missing Values', 'Percentage Missing'],
)

print("\n3. MISSING VALUES")
missing_data = missing_table[missing_table['Missing Values'] > 0]
if missing_data.empty:
    print("No missing values found.")
else:
    print(missing_data)

# carroserie 58.042996 % of missing values, highly doubt it will be of much use



In [33]:
%python
# Cardinality and five most frequent values of every object/category column
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print("\n5. CATEGORICAL COLUMN UNIQUE VALUES")
for col in categorical_cols:
    column = df[col]
    print(f"\nColumn: {col}")
    print(f"Total Unique Values: {column.nunique()}")
    print("Top 5 Values:\n", column.value_counts().head(5))



In [34]:
%python
# ===== Step 2: Handle Missing Values =====
# Share of missing values per column in percent, largest first
missing_info = df.isna().mean().mul(100)
print("Missing Values Percentage:")
print(missing_info.loc[missing_info > 0].sort_values(ascending=False))

In [35]:
%python

# Keep only columns whose missing share is strictly below the threshold
# (a column at exactly 50% missing is dropped as well).
threshold = 50
missing_share = df.isnull().mean() * 100
df = df.loc[:, missing_share < threshold]
print(f"Columns After Dropping >{threshold}% Missing Values: {df.shape[1]}")

In [36]:
%python
# (rows, columns) after dropping the sparse columns
df.shape

In [37]:
%python
# Replace missing values in object-dtype columns with the literal 'Unknown'
categorical_cols = df.select_dtypes(include=['object']).columns
filled_categoricals = df[categorical_cols].fillna('Unknown')
df[categorical_cols] = filled_categoricals

In [38]:
%python
# Impute missing values in every numeric-dtype column with that column's median
numerical_cols = df.select_dtypes(include=['number']).columns
column_medians = df[numerical_cols].median()
df[numerical_cols] = df[numerical_cols].fillna(column_medians)


---


In [40]:
%python
# Percentage share of each Type_renouvellement_police value
df['Type_renouvellement_police'].value_counts(normalize=True).mul(100)


In [41]:
%python
# Percentage share of each energie value
df['energie'].value_counts(normalize=True).mul(100)


In [42]:
%python
# Percentage share of each activite value
df['activite'].value_counts(normalize=True).mul(100)



In [43]:
%python
# Percentage share of each civilite value
df['civilite'].value_counts(normalize=True).mul(100)


In [44]:
%python
# Percentage share of each sexe value
df['sexe'].value_counts(normalize=True).mul(100)


In [45]:
%python
# Percentage share of each IsToutRisque value
df['IsToutRisque'].value_counts(normalize=True).mul(100)


In [46]:
%python
# Percentage share of each classe value
df['classe'].value_counts(normalize=True).mul(100)


In [47]:
%python
# Percentage share of each puissance value
df['puissance'].value_counts(normalize=True).mul(100)


In [48]:
%python
# Percentage share of each usage value
df['usage'].value_counts(normalize=True).mul(100)


# Summary 
- gender :  M     76.364128 | F     21.944554
- dominant professions : 
    - EDUCATION_FORMATION                   20.024374
    - EMPLOYE                               17.437389
    - PROFESSIONS_MEDICALES                 15.899573
    - RETRAITE                              14.348228
- top two energie types : ES       70.417260 |  DI       29.582309
- Type_renouvellement_police  T      92.145447 | P       7.854409
- IsToutRisque : No     84.413984 | Yes    15.586016
- classe : 
    - 1.0     33.107196
    - 3.0     17.578502
    - 8.0     14.827522
    - 4.0     13.420365
    - 2.0     12.188269
    - 5.0      6.103828 
- puissance : 
    - 3.0    45.446719
    - 2.0    24.251113
    - 4.0    18.693140
    - 5.0     7.032842
    - 6.0     4.200002 
- usage  : 
    - moto                3.138715
    - u1                 15.997450
    - VP                 78.635458


# Statistics
---

In [51]:
%python
# Summary statistics of the numeric columns, with the 90th and 99th percentiles added
numerical_cols = df.select_dtypes(include=[np.number]).columns
print("\n6. NUMERICAL COLUMN STATISTICS")
summary_percentiles = [.25, .5, .75, .90, .99]
numerical_stats = df[numerical_cols].describe(percentiles=summary_percentiles)
print(numerical_stats)



In [52]:
%python
# Distribution summary of classe
df['classe'].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99])

In [53]:
%python
# Percentage share of each classe value
df['classe'].value_counts(normalize=True).mul(100)


In [54]:
%python
# Distribution summary of Sinistre
df['Sinistre'].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99])

In [55]:
%python
# Percentage share of each Sinistre value; only values above 1% are printed
value_counts = df['Sinistre'].value_counts(normalize=True).mul(100)
print(value_counts.loc[value_counts > 1])


In [56]:
%python
# Distribution summary of age_objet_assuree
df['age_objet_assuree'].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99])


# Summary : 
    sinistre : most values are 0.00

In [58]:
%python
print("\n7. POTENTIAL OUTLIER INDICATORS")
for col in numerical_cols:
    values = df[col]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1

    # 1.5 * IQR fences, clamped so they never fall outside the observed range.
    # When IQR is 0 both fences equal the quartile, so every other value is flagged.
    lower_bound = max(Q1 - (1.5 * IQR), values.min())
    upper_bound = min(Q3 + (1.5 * IQR), values.max())

    outside_fences = (values < lower_bound) | (values > upper_bound)
    outlier_count = int(outside_fences.sum())

    print(f"\nColumn: {col}")
    print(f"Potential Outliers: {outlier_count} ({outlier_count/len(df)*100:.2f}%)")
    print(f"Lower Bound: {lower_bound}")
    print(f"Upper Bound: {upper_bound}")


Column: year
Potential Outliers: 0 (0.00%)
Lower Bound: 2017
Upper Bound: 2022

Column: Prime
Potential Outliers: 49077 (3.52%)
Lower Bound: -232.22470350274736
Upper Bound: 857.6763373397437

Column: Sinistre
Potential Outliers: 85869 (6.16%)
Lower Bound: 0.0
Upper Bound: 240.29225091568549

Column: puissance
Potential Outliers: 63731 (4.58%)
Lower Bound: 1.5
Upper Bound: 5.5

Column: Charge_utile
Potential Outliers: 107031 (7.68%)
Lower Bound: 1.0
Upper Bound: 1.0

Column: place
Potential Outliers: 24512 (1.76%)
Lower Bound: 2.0
Upper Bound: 2.0

Column: classe
Potential Outliers: 16243 (1.17%)
Lower Bound: 0.0
Upper Bound: 8.5



# SQL queries
The cells below explore the data with SQL queries.

In [61]:
%python
# Open an in-memory SQLite database and store the frame in it as table FLAT,
# replacing the table if it already exists.
conn = sqlite3.connect(":memory:")
df.to_sql(name="FLAT", con=conn, index=False, if_exists="replace")



In [62]:
%python

# Ten most frequent N_SOUSCRIP values with their share of all rows
query = ''' SELECT N_SOUSCRIP, COUNT(*) AS count, (COUNT(*) * 100.0 / (SELECT COUNT(*) FROM FLAT)) AS percentage FROM FLAT GROUP BY N_SOUSCRIP ORDER BY count DESC LIMIT 10;'''
result = pd.read_sql_query(sql=query, con=conn)
print(result)

In [63]:
%python

# Up to 20 rows for one subscriber
query = "SELECT * FROM FLAT WHERE N_SOUSCRIP = 642214 LIMIT 20"
result = pd.read_sql_query(sql=query, con=conn)

print(result)



In [64]:
%python

# All 2021 rows for the same subscriber
query = '''SELECT * FROM FLAT WHERE year = 2021 AND N_SOUSCRIP = 642214 
           '''

result = pd.read_sql_query(sql=query, con=conn)

print(result)

In [65]:
%python
# Distinct values stored in the sexe column
query = '''SELECT DISTINCT sexe FROM FLAT 
           '''

result = pd.read_sql_query(sql=query, con=conn)

print(result)

# Z-score:
A value whose Z-score is greater than 3 or less than -3 is considered an outlier.

In [67]:
%python

# =====  Handle Outliers =====
from scipy.stats import zscore

# Define a function to remove outliers using Z-score
def remove_outliers_zscore(data, threshold=3, columns=None):
    """Drop rows that have an extreme value in any screened numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; non-numeric columns are carried through untouched.
    threshold : float, default 3
        A row is dropped when |z| >= threshold in at least one screened column.
    columns : iterable of str, optional
        Restrict the screening to these columns (non-numeric ones are ignored).
        By default every numeric column of ``data`` is screened.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that pass the screen, with all original columns.

    Notes
    -----
    Z-scores use the population standard deviation (ddof=0).
    A z-score that cannot be computed (missing value, or a zero-variance column
    where the score is 0/0) is treated as "not an outlier". Previously one such
    column made every comparison False and silently removed all rows.
    """
    candidates = data if columns is None else data[list(columns)]
    numeric = candidates.select_dtypes(include=['number'])
    z_scores = (numeric - numeric.mean()) / numeric.std(ddof=0)
    # NaN >= threshold evaluates to False, so undefined scores never flag a row.
    is_outlier = (z_scores.abs() >= threshold).any(axis=1)
    return data[~is_outlier]

rows_before_outliers = df.shape[0]
print(f"Rows Before Removing Outliers: {rows_before_outliers}")
df = remove_outliers_zscore(df)
rows_after_outliers = df.shape[0]
print(f"Rows After Removing Outliers: {rows_after_outliers}")
# Recorded run: 339887 rows left after removing outliers

In [68]:
%python
# (rows, columns) after outlier removal
df.shape

In [69]:
%python
# Write the current frame to the shared folder, without the index column.
# NOTE(review): the author saved this file without re-running every cell, so it
# may not match a fresh top-to-bottom run - TODO confirm with a full re-run.
shared_path = '/team5/data/w_df.csv'
df.to_csv(path_or_buf=shared_path, index=False)


# Exploratory plots

In [71]:
%python

# ===== Step 4: Univariate Analysis =====
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram with a KDE overlay for each numerical column, one figure per column
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[col], kde=True, bins=30, color='blue', ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()


In [72]:
%python

# Bar chart of the ten most frequent values of each categorical column
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    top_categories = df[col].value_counts().head(10)
    top_categories.plot.bar(color='orange', ax=ax)
    ax.set_title(f"Top Categories in {col}")
    plt.show()



In [73]:
%python

# Pairwise correlation of the numerical columns, annotated to two decimals
fig, ax = plt.subplots(figsize=(12, 8))
corr_matrix = df[numerical_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [74]:
%python
#  Visualization: Violin Plots
def create_violin_plots(df, numerical_cols):
    """Draw one violin plot per column on a two-column grid and show the figure.

    Parameters
    ----------
    df : object indexable by column name; ``df[col]`` is handed to seaborn.
    numerical_cols : sized iterable of the column names to plot.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    plot_count = len(numerical_cols)
    # Ceiling division: `plot_count // 2 + 1` reserved an empty extra row whenever
    # the number of columns was even, which shrank every real plot.
    row_count = max(1, (plot_count + 1) // 2)

    plt.figure(figsize=(15, 10))
    for position, col in enumerate(numerical_cols, 1):
        plt.subplot(row_count, 2, position)
        sns.violinplot(x=df[col])
        plt.title(f'Violin Plot of {col}')
    plt.tight_layout()
    plt.show()

# Violin plots for every column currently listed in numerical_cols
create_violin_plots(df, numerical_cols)

# Creating a Risky label 
---


In [76]:
%python
# This scoring is purely the author's hypothesis; it was not validated by a domain expert.

# Weight given to a value that a table below does not list (any code outside 1-6, NaN, ...).
_DEFAULT_RISK_WEIGHT = 1.5

# Per-column weight tables, applied in this order. Keys are category codes.
# NOTE(review): what each code stands for (e.g. whether 1 is the lowest age band
# or the oldest vehicle) is the author's assumption - confirm against the data dictionary.
_ORDINAL_RISK_WEIGHTS = (
    # client age: highest weight at both ends, lowest in the middle
    ('age_client', {1: 3, 2: 2, 3: 1, 4: 1.5, 5: 2, 6: 3}),
    # engine power: weight grows with the code
    ('puissance', {1: 0.5, 2: 1, 3: 1.5, 4: 2, 5: 2.5, 6: 3}),
    # market value: weight shrinks as the code grows
    ('valeur_venale', {1: 3, 2: 2, 3: 1.5, 4: 1, 5: 0.5, 6: 0.25}),
    # age of the insured object: weight shrinks as the code grows
    ('age_objet_assuree', {1: 3, 2: 2.5, 3: 2, 4: 1.5, 5: 1, 6: 0.5}),
    # client seniority: weight shrinks as the code grows
    ('anciennete', {1: 3, 2: 2.5, 3: 2, 4: 1.5, 5: 1, 6: 0.5}),
    # vehicle classification: weight shrinks as the code grows
    ('classe', {1: 3, 2: 2.5, 3: 2, 4: 1.5, 5: 1, 6: 0.5}),
)

# (inclusive lower score bound, label), checked from the highest tier down.
# NOTE(review): the six tables contribute at most 3 each (18) and the four flags
# at most 1 each, so the score never exceeds 22 and 'Extremely High' cannot be
# returned with the current weights.
_RISK_TIERS = (
    (24, 'Extremely High'),
    (19, 'High'),
    (14, 'Medium-High'),
    (11, 'Medium'),
    (8, 'Low-Medium'),
)


def classify_risk(row):
    """Map one record to a risk label from a hand-weighted additive score.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row passed by ``apply(axis=1)``)
        Must provide 'age_client', 'puissance', 'valeur_venale',
        'age_objet_assuree', 'anciennete', 'classe', 'IsToutRisque', 'usage',
        'civilite' and 'sexe'.

    Returns
    -------
    str
        One of 'Extremely High', 'High', 'Medium-High', 'Medium',
        'Low-Medium', 'Low'.

    TODO (from the author's notes): Type_renouvellement_police, energie, marque,
    Sinistre and Charge_utile are not scored yet.
    """
    risk_score = sum(
        weights.get(row[column], _DEFAULT_RISK_WEIGHT)
        for column, weights in _ORDINAL_RISK_WEIGHTS
    )

    # Each flag that holds adds one point.
    surcharge_flags = (
        row['IsToutRisque'] == 'Yes',
        # author's assumption: 'u1' means the vehicle is not for personal use
        row['usage'] == 'u1',
        # author's assumption: companies and organisations are riskier
        row['civilite'] in ('Entreprise', 'Etablissement', 'Org'),
        # author's assumption: male policy holders are riskier
        row['sexe'] == 'M',
    )
    risk_score += sum(1 for flag in surcharge_flags if flag)

    for lower_bound, label in _RISK_TIERS:
        if risk_score >= lower_bound:
            return label
    return 'Low'

# Work on an independent copy: `df_test = df` only aliased the frame, so adding
# Risk_Category also modified `df` (which is itself a filtered selection and can
# raise SettingWithCopyWarning on assignment).
df_test = df.copy()
df_test['Risk_Category'] = df_test.apply(classify_risk, axis=1)




In [77]:
%python
# Preview the first rows, including the new Risk_Category column
df_test.head()

In [78]:
%python
# Store the labelled frame as SQLite table "test", replacing it if it exists
df_test.to_sql(name="test", con=conn, index=False, if_exists="replace")

In [79]:
%python

# Three sample rows from the upper risk tiers
query = '''SELECT * FROM test WHERE Risk_Category IN ('Medium-High','High','Extremely High')
          LIMIT 3 '''

result = pd.read_sql_query(sql=query, con=conn)


In [80]:
%python
# Show the result of the most recent SQL query
print(result)

In [81]:
%sh
# List the filesystem root.
ls /

