In [6]:
# CELL 1 – Clone + enter repo
!git clone https://github.com/redecon/insurance-analytics-challenge.git
%cd insurance-analytics-challenge

Cloning into 'insurance-analytics-challenge'...
remote: Enumerating objects: 26, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 26 (delta 3), reused 25 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (26/26), 217.34 KiB | 2.31 MiB/s, done.
Resolving deltas: 100% (3/3), done.
/content/insurance-analytics-challenge


In [7]:
# CELL 2 – Set identity + create task-3 branch
!git config --global user.email "redietbekele02@outlook.com"   # ← CHANGE THIS
!git config --global user.name "Rediet"
!git checkout main
!git pull origin main
!git checkout -b task-3

Branch 'main' set up to track remote branch 'main' from 'origin'.
Switched to a new branch 'main'
From https://github.com/redecon/insurance-analytics-challenge
 * branch            main       -> FETCH_HEAD
Already up to date.
Switched to a new branch 'task-3'


In [10]:
# CELL 3 – Install DVC + RE-INITIALIZE + pull data
!pip install dvc==2.58.2 --quiet
!rm -rf .dvc  # removes broken DVC state (safe)
!dvc init --force   # re-initializes cleanly
!dvc remote add -d localstorage dvc_storage   # re-add your local remote
!dvc pull -f
!ls -lh /content/MachineLearningRating_v3.txt

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0mSetting 'localstorage' as a default remote.
Everything is up to date

In [3]:
#
from google.colab import files
uploaded = files.upload()

Saving MachineLearningRating_v3.txt to MachineLearningRating_v3.txt


In [12]:
# 4
import pandas as pd, numpy as np
from scipy import stats
import warnings, seaborn as sns, matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

df = pd.read_csv('/content/MachineLearningRating_v3.txt', sep='|', low_memory=False, on_bad_lines='skip')
df['TotalPremium'] = df['TotalPremium'].fillna(df['TotalPremium'].median())
df['TotalClaims']   = df['TotalClaims'].fillna(0)
df['Province']      = df['Province'].fillna('Unknown')
df['PostalCode']    = df['PostalCode'].fillna(0).astype(int)
df['Gender']        = df['Gender'].str.strip().fillna('Unknown')

df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)
df['Margin']   = df['TotalPremium'] - df['TotalClaims']

# QUICK HYPOTHESIS TESTS (all 4)
print("TASK 3 – HYPOTHESIS TESTING RESULTS\n" + "="*60)

# 1. Province frequency
chi2, p1, _, _ = stats.chi2_contingency(pd.crosstab(df['Province'], df['HasClaim']))
print(f"1. Province frequency → p = {p1:.2e} → {'REJECT H0' if p1<0.05 else 'Fail'}")

# 2. Province severity
groups = [g['TotalClaims'].values for _,g in df[df['TotalClaims']>0].groupby('Province') if len(g)>10]
f, p2 = stats.f_oneway(*groups)
print(f"2. Province severity   → p = {p2:.2e} → {'REJECT H0' if p2<0.05 else 'Fail'}")

# 3. Top 2 postal codes
top2 = df['PostalCode'].value_counts().head(2).index
chi2z, p3, _, _ = stats.chi2_contingency(pd.crosstab(df['PostalCode'].isin(top2), df['HasClaim']))
print(f"3. Top 2 Zip frequency → p = {p3:.2e} → {'REJECT H0' if p3<0.05 else 'Fail'}")

# 4. Gender
chi2g, p4, _, _ = stats.chi2_contingency(pd.crosstab(df['Gender'].isin(['Male','Female']), df['HasClaim']).iloc[:2,:])
print(f"4. Gender              → p = {p4:.2e} → {'REJECT H0' if p4<0.05 else 'Fail'}")

# Final table
results = pd.DataFrame({
    'Hypothesis': ['Province (freq)','Province (sev)','Top2 Zip (freq)','Gender'],
    'p-value': [f"{p1:.2e}",f"{p2:.2e}",f"{p3:.2e}",f"{p4:.2e}"],
    'Decision': ['REJECT H0' if p<0.05 else 'Fail' for p in [p1,p2,p3,p4]]
})
display(results)
results.to_csv('task3_results.csv', index=False)

TASK 3 – HYPOTHESIS TESTING RESULTS
1. Province frequency → p = 5.93e-19 → REJECT H0
2. Province severity   → p = 3.70e-06 → REJECT H0
3. Top 2 Zip frequency → p = 6.08e-20 → REJECT H0
4. Gender              → p = 9.47e-03 → REJECT H0


Unnamed: 0,Hypothesis,p-value,Decision
0,Province (freq),5.93e-19,REJECT H0
1,Province (sev),3.7e-06,REJECT H0
2,Top2 Zip (freq),6.08e-20,REJECT H0
3,Gender,0.00947,REJECT H0


In [13]:
# CELL 5 – Final commit & push
!git remote set-url origin https://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@github.com/redecon/insurance-analytics-challenge.git

!git add .
!git commit -m "feat(task-3): complete 4 hypothesis tests + results table + business recommendations"
!git push origin task-3 --force
print("TASK 3 FINISHED & PUSHED!")

[task-3 582c3a7] feat(task-3): complete 4 hypothesis tests + results table + business recommendations
 4 files changed, 15 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 task3_results.csv
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 2 threads
Compressing objects: 100% (6/6), done.
Writing objects: 100% (7/7), 823 bytes | 823.00 KiB/s, done.
Total 7 (delta 0), reused 4 (delta 0), pack-reused 0
remote: 
remote: Create a pull request for 'task-3' on GitHub by visiting:[K
remote:      https://github.com/redecon/insurance-analytics-challenge/pull/new/task-3[K
remote: 
To https://github.com/redecon/insurance-analytics-challenge.git
 * [new branch]      task-3 -> task-3
TASK 3 FINISHED & PUSHED!


In [14]:
# CELL 2 – Save & copy your Task 2 and Task 3 notebooks into the repo
# Run this cell → it will ask you to upload both notebooks
from google.colab import files
print("Please upload your Task-2 DVC notebook (.ipynb)")
task2 = files.upload()
print("Please upload your Task-3 Hypothesis Testing notebook (.ipynb)")
task3 = files.upload()

# Move them into the repo with clean names
!mv *.ipynb ./ 2>/dev/null || true
!mv *task*2*.ipynb task2_dvc_setup.ipynb 2>/dev/null || true
!mv *task*3*.ipynb task3_hypothesis_testing.ipynb 2>/dev/null || true
!ls -la *.ipynb

Please upload your Task-2 DVC notebook (.ipynb)


Saving Task2.ipynb to Task2.ipynb
Please upload your Task-3 Hypothesis Testing notebook (.ipynb)


-rw-r--r-- 1 root root    68 Dec  8 15:31 task1_eda_complete.ipynb
-rw-r--r-- 1 root root 20005 Dec  8 15:51 Task2.ipynb


In [18]:
#
!git add *.ipynb *.csv 2>/dev/null || true
!git status
!git commit -m "docs: add Task 2 (DVC) and Task 3 (Hypothesis Testing) notebooks + results"

On branch task-3
Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mnew file:   Task2.ipynb[m

[task-3 7e107d3] docs: add Task 2 (DVC) and Task 3 (Hypothesis Testing) notebooks + results
 1 file changed, 424 insertions(+)
 create mode 100644 Task2.ipynb


In [19]:
#
!git push origin task-3 --force
print("TASK 2 & TASK 3 NOTEBOOKS + RESULTS SUCCESSFULLY PUSHED!")

Enumerating objects: 4, done.
Counting objects:  25% (1/4)Counting objects:  50% (2/4)Counting objects:  75% (3/4)Counting objects: 100% (4/4)Counting objects: 100% (4/4), done.
Delta compression using up to 2 threads
Compressing objects:  33% (1/3)Compressing objects:  66% (2/3)Compressing objects: 100% (3/3)Compressing objects: 100% (3/3), done.
Writing objects:  33% (1/3)Writing objects:  66% (2/3)Writing objects: 100% (3/3)Writing objects: 100% (3/3), 3.94 KiB | 3.94 MiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas:   0% (0/1)[Kremote: Resolving deltas: 100% (1/1)[Kremote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
remote: [1;31merror[m: GH013: Repository rule violations found for refs/heads/task-3.[K
remote: 
remote: - GITHUB PUSH PROTECTION[K
remote:   —————————————————————————————————————————[K
remote:     Resolve the following violations before pushing again[K
remote: 
remote:     - Push cannot c

In [20]:
# CELL 2 – Remove the dangerous cell from Task2.ipynb (automatically)
import json
with open('Task2.ipynb', 'r') as f:
    nb = json.load(f)

# Loop through all cells and delete any cell containing "github_pat"
cleaned_cells = []
for cell in nb['cells']:
    if any("github_pat" in line for line in cell.get('source', [])):
        print("Found and REMOVED cell with secret")
    else:
        cleaned_cells.append(cell)

nb['cells'] = cleaned_cells

with open('Task2_clean.ipynb', 'w') as f:
    json.dump(nb, f)

# Replace the old notebook
!mv Task2_clean.ipynb Task2.ipynb

Found and REMOVED cell with secret


In [21]:
# CELL 3 – Commit the cleaned version + push (NOW IT WILL WORK)
!git add Task2.ipynb
!git commit --amend -m "docs: add Task 2 & Task 3 notebooks (secrets removed)"
!git push origin task-3 --force
print("PUSH SUCCESSFUL – NO MORE SECRETS!")

[task-3 c36c21a] docs: add Task 2 & Task 3 notebooks (secrets removed)
 Date: Mon Dec 8 15:55:34 2025 +0000
 1 file changed, 1 insertion(+)
 create mode 100644 Task2.ipynb
Enumerating objects: 4, done.
Counting objects: 100% (4/4), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 3.31 KiB | 3.31 MiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/redecon/insurance-analytics-challenge.git
   582c3a7..c36c21a  task-3 -> task-3
PUSH SUCCESSFUL – NO MORE SECRETS!
