# Maji Ndogo Water Crisis Investigation - Data Analysis

Author: [Shalon Ngobeni]
Date: [Insert Date]
Project: Maji Ndogo Water Services - Water Crisis Intervention


# Part one

In [1]:
%load_ext sql

In [2]:
from sqlalchemy import create_engine, MetaData, Table, text
import pandas as pd
engine = create_engine("mysql+pymysql://root:Qwerty%4012345@127.0.0.1:3306/md_water_services")
metadata = MetaData()

### Description:
We begin by exploring the database schema. The goal is to understand the tables, column names, and data types involved in the water survey.

### Tables of Interest:
- data_dictionary
- employee
- global_water_access
- location
- water_quality
- visits
- water_source
- well_pollution


In [3]:
query = """
SHOW TABLES;
"""

pd.read_sql_query(query, engine)


Unnamed: 0,Tables_in_md_water_services
0,auditor_report
1,data_dictionary
2,employee
3,global_water_access
4,location
5,visits
6,water_quality
7,water_source
8,well_pollution
9,well_pollution_copy


In [4]:
query = """
SELECT * FROM data_dictionary LIMIT 5;
"""

pd.read_sql_query(query, engine)

Unnamed: 0,table_name,column_name,description,datatype,related_to
0,employee,assigned_employee_id,Unique ID assigned to each employee,INT,visits
1,employee,employee_name,Name of the employee,VARCHAR(255),
2,employee,phone_number,Contact number of the employee,VARCHAR(15),
3,employee,email,Email address of the employee,VARCHAR(255),
4,employee,address,Residential address of the employee,VARCHAR(255),



### Dive Into the Water Sources
Explore the types of water sources available in the database via the `water_source` table.


In [5]:
query = """
-- Find distinct water source types
SELECT DISTINCT type_of_water_source 
FROM water_source;
"""

pd.read_sql_query(query, engine)

Unnamed: 0,type_of_water_source
0,tap_in_home
1,tap_in_home_broken
2,well
3,shared_tap
4,river


### Unpack the Visits to Water Sources
Investigate logs in the `visits` table, especially focusing on locations with extremely long queue times (more than 500 minutes).


In [6]:
# Find visits with >500 min queue times
query = """

SELECT *
FROM visits
WHERE time_in_queue > 500;
"""

pd.read_sql_query(query, engine)

Unnamed: 0,record_id,location_id,source_id,time_of_record,visit_count,time_in_queue,assigned_employee_id
0,899,SoRu35083,SoRu35083224,2021-01-16 10:14:00,6,515,28
1,2304,SoKo33124,SoKo33124224,2021-02-06 07:53:00,5,512,16
2,2315,KiRu26095,KiRu26095224,2021-02-06 14:32:00,3,529,8
3,3206,SoRu38776,SoRu38776224,2021-02-20 15:03:00,5,509,46
4,3701,HaRu19601,HaRu19601224,2021-02-27 12:53:00,3,504,0
...,...,...,...,...,...,...,...
100,57408,SoRu35388,SoRu35388224,2023-05-27 08:52:00,5,538,1
101,57832,AkRu04093,AkRu04093224,2023-06-03 07:50:00,3,524,34
102,57843,KiRu30266,KiRu30266224,2023-06-03 16:50:00,2,533,10
103,59129,KiRu27023,KiRu27023224,2023-06-24 16:17:00,2,509,8


In [7]:
#Cross-reference with water_source table
query = """
SELECT 
  v.record_id, 
  v.time_in_queue, 
  ws.type_of_water_source
FROM visits v
JOIN water_source ws ON v.source_id = ws.source_id
WHERE v.time_in_queue > 500;

"""

pd.read_sql_query(query, engine)

Unnamed: 0,record_id,time_in_queue,type_of_water_source
0,899,515,shared_tap
1,2304,512,shared_tap
2,2315,529,shared_tap
3,3206,509,shared_tap
4,3701,504,shared_tap
...,...,...,...
100,57408,538,shared_tap
101,57832,524,shared_tap
102,57843,533,shared_tap
103,59129,509,shared_tap


### Assess the Quality of Water Sources
Review `water_quality` table for inconsistencies ‚Äî particularly perfect score sources (`subjective_quality_score = 10`) with multiple visit entries.


In [8]:
# Find home taps (score=10) visited multiple times
query = """

SELECT *
FROM water_quality
WHERE subjective_quality_score = 10
AND visit_count > 1;
"""

pd.read_sql_query(query, engine)

Unnamed: 0,record_id,subjective_quality_score,visit_count
0,59,10,2
1,67,10,3
2,85,10,4
3,128,10,5
4,137,10,2
...,...,...,...
1521,60041,10,6
1522,60053,10,7
1523,60073,10,7
1524,60086,10,8


### Investigate Pollution Issues
Correct inconsistencies in the `well_pollution` table where "Clean" results are associated with biological contamination.


In [9]:
# Wells marked "Clean" but biologically contaminated
query = """

SELECT *
FROM well_pollution
WHERE results = "Clean"
AND biological > 0.01;
"""

pd.read_sql_query(query, engine)

Unnamed: 0,source_id,date,description,pollutant_ppm,biological,results


# Part 2

## üìò SECTION 1: Cleaning the Employee Data

### üßº Objective 1.1: Generate Email Addresses for Employees

**Goal**:  
Create email addresses for each employee in the `employee` table using their full name.

**Reason**:  
The `email` column is empty, and we need emails to send reports. The format should be:


**Transformation Steps**:
- Replace the space between first and last names with a dot.
- Convert the full name to lowercase.
- Concatenate the domain to form the full email address.


In [10]:
query = """
UPDATE employee
SET email = CONCAT(LOWER(REPLACE(employee_name, ' ', '.')), '@ndogowater.gov');
"""
with engine.connect() as connection:
    connection.execute(text(query))


### üì± Objective 1.2: Clean Up Phone Numbers

**Goal**:  
Remove trailing spaces from the `phone_number` field.

**Reason**:  
Phone numbers have an extra character (length = 13), which causes SMS failures.

**Fix**:  
Use `TRIM()` to remove any leading/trailing whitespace.


In [11]:
query = """
UPDATE employee
SET phone_number = TRIM(phone_number);
"""
with engine.connect() as connection:
    connection.execute(text(query))

In [12]:
query = """
SELECT employee_name, email, phone_number, LENGTH(phone_number) 
FROM employee;
"""
pd.read_sql_query(query, engine)

Unnamed: 0,employee_name,email,phone_number,LENGTH(phone_number)
0,Amara Jengo,amara.jengo@ndogowater.gov,99637993287,12
1,Bello Azibo,bello.azibo@ndogowater.gov,99643864786,12
2,Bakari Iniko,bakari.iniko@ndogowater.gov,99222599041,12
3,Malachi Mavuso,malachi.mavuso@ndogowater.gov,99945849900,12
4,Cheche Buhle,cheche.buhle@ndogowater.gov,99381679640,12
5,Zuriel Matembo,zuriel.matembo@ndogowater.gov,99034075111,12
6,Deka Osumare,deka.osumare@ndogowater.gov,99379364631,12
7,Lalitha Kaburi,lalitha.kaburi@ndogowater.gov,99681623240,12
8,Enitan Zuri,enitan.zuri@ndogowater.gov,99248509202,12
9,Farai Nia,farai.nia@ndogowater.gov,99570082739,12


## üë®‚Äçüîß SECTION 2: Honouring the Workers

### üèòÔ∏è Objective 2.1: Count Employees per Town

**Goal**:  
See how many employees live in each town.

**Reason**:  
Understand field worker distribution and acknowledge efforts in smaller or rural communities.

**Process**:
Group by `town_name` and count entries.


In [13]:
# Top 3 surveyors by visits
query = """

SELECT assigned_employee_id, COUNT(*) AS visit_count
FROM visits
GROUP BY assigned_employee_id
ORDER BY visit_count DESC
LIMIT 3;
"""
pd.read_sql_query(query, engine)

Unnamed: 0,assigned_employee_id,visit_count
0,1,3708
1,30,3676
2,34,3539


### ü•á Objective 2.2: Identify Top 3 Field Surveyors

**Goal**:  
Recognize the three field employees who visited the most locations.

**Reason**:  
Reward high-effort contributors and boost morale.

**Process**:
- Count visits per `assigned_employee_id`.
- Join to `employee` table to get their contact details.


In [14]:
# Get contact details
query = """

SELECT employee_name, email, phone_number
FROM employee
WHERE assigned_employee_id IN (
    SELECT assigned_employee_id
    FROM (
        SELECT assigned_employee_id, COUNT(*) AS visit_count
        FROM visits
        GROUP BY assigned_employee_id
        ORDER BY visit_count DESC
        LIMIT 3
    ) AS top_employees  -- Derived table alias
);
"""
pd.read_sql_query(query, engine)

Unnamed: 0,employee_name,email,phone_number
0,Bello Azibo,bello.azibo@ndogowater.gov,99643864786
1,Pili Zola,pili.zola@ndogowater.gov,99822478933
2,Rudo Imani,rudo.imani@ndogowater.gov,99046972648


## üìç SECTION 3: Analysing Locations

### üß≠ Objective 3.1: Count Records per Town

**Goal**:  
Understand how many data records were collected per town.

**Reason**:  
This helps identify towns with the most surveyed water sources and validate survey coverage.

**Process**:
Group the `location` table by `town_name`, count the number of records, and sort the results in descending order.


In [15]:
query = """

SELECT town_name, COUNT(*) AS records_per_town FROM location GROUP BY town_name ORDER BY records_per_town DESC;
"""
pd.read_sql_query(query, engine)

Unnamed: 0,town_name,records_per_town
0,Rural,23740
1,Harare,1650
2,Amina,1090
3,Lusaka,1070
4,Mrembo,990
5,Asmara,930
6,Dahabu,930
7,Kintampo,780
8,Ilanga,780
9,Isiqalo,770


### üó∫Ô∏è Objective 3.2: Count Records per Province

**Goal**:  
See how water source data is distributed across provinces.

**Reason**:  
Ensures that all regions are well represented in the dataset.

**Process**:
Group the `location` table by `province_name`, count entries, and sort the results.


In [16]:
query = """
SELECT 
    province_name, 
    COUNT(*) AS records_per_province
FROM location
GROUP BY province_name
ORDER BY records_per_province DESC;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,province_name,records_per_province
0,Kilimani,9510
1,Akatsi,8940
2,Sokoto,8220
3,Amanzi,6950
4,Hawassa,6030


### üóÇÔ∏è Objective 3.3: Group by Province & Town

**Goal**:  
Get a combined view of how many records exist in each town within each province.

**Reason**:  
Identifies which towns are the most critical within each province in terms of water source data.

**Process**:
- Group by both `province_name` and `town_name`
- Count the number of records per combination
- Sort the data by `province_name`, and within that, sort towns by `records_per_town` descending


In [17]:
query = """

-- Records per province/town (ordered)
SELECT 
    province_name, 
    town_name, 
    COUNT(*) AS records
FROM 
    location
GROUP BY 
    province_name, 
    town_name
ORDER BY 
province_name, 
records DESC;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,province_name,town_name,records
0,Akatsi,Rural,6290
1,Akatsi,Lusaka,1070
2,Akatsi,Harare,800
3,Akatsi,Kintampo,780
4,Amanzi,Rural,3100
5,Amanzi,Asmara,930
6,Amanzi,Dahabu,930
7,Amanzi,Amina,670
8,Amanzi,Pwani,520
9,Amanzi,Abidjan,400


### üßÆ Objective 3.4: Calculate Rural vs Urban Percentage

**Goal**:  
Understand what percentage of sources are in rural areas vs. urban ones.

**Reason**:  
Helps prioritize investments ‚Äî the rural areas may need more attention if underserved.

**Process**:
- Count the number of records for each `location_type`
- Calculate percentages using total records


In [18]:
# Get counts per location type
query = """
SELECT 
    location_type, 
    COUNT(*) AS num_sources
FROM location
GROUP BY location_type;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,location_type,num_sources
0,Urban,15910
1,Rural,23740


##  SECTION 4: Diving Into Water Sources

### üíß Objective 4.1: List All Unique Water Source Types

**Goal**:  
Identify and understand the different types of water sources in the dataset.

**Reason**:  
Each type (e.g., river, well, shared tap) has different levels of accessibility and contamination risk. Understanding these types helps prioritize infrastructure improvements.

**Process**:
- Query the `water_source` table
- Extract distinct values from `type_of_water_source`


In [19]:
query = """
SELECT DISTINCT type_of_water_source
FROM water_source;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,type_of_water_source
0,tap_in_home
1,tap_in_home_broken
2,well
3,shared_tap
4,river


### üßÆ Objective 4.2: Count Records per Water Source Type

**Goal**:  
See how common each water source type is.

**Reason**:  
Determines reliance on certain water source types and helps evaluate which infrastructure is most prevalent.

**Process**:
- Group by `type_of_water_source`
- Count the number of records per type


In [20]:
query = """
SELECT 
    type_of_water_source, 
    COUNT(*) AS total_records
FROM water_source
GROUP BY type_of_water_source
ORDER BY total_records DESC;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,type_of_water_source,total_records
0,well,17383
1,tap_in_home,7265
2,tap_in_home_broken,5856
3,shared_tap,5767
4,river,3379


### üë®‚Äçüë©‚Äçüëß Objective 4.3: Sum of People Served per Water Source Type

**Goal**:  
Find out how many people are served by each type of water source.

**Reason**:  
Highlights the population impact and burden on each source type, helping prioritize upgrades.

**Process**:
- Group by `type_of_water_source`
- Sum the `number_of_people_served` column


In [21]:
query = """
SELECT 
    type_of_water_source,
    SUM(number_of_people_served) AS people_served
FROM water_source
GROUP BY type_of_water_source
ORDER BY people_served DESC;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,type_of_water_source,people_served
0,shared_tap,11945272.0
1,well,4841724.0
2,tap_in_home,4678880.0
3,tap_in_home_broken,3799720.0
4,river,2362544.0


### ‚è≥ Objective 4.4: Join Water Source & Visits to Analyze Queue Times

**Goal**:  
See which water source types have the longest waiting times.

**Reason**:  
Long queues = inefficiency and public dissatisfaction. Shared taps and rivers are suspected of long waits.

**Process**:
- Join `water_source` and `visits` tables on `source_id`
- Group by `type_of_water_source`
- Calculate average `time_in_queue`


In [22]:
query = """
SELECT 
    ws.type_of_water_source,
    ROUND(AVG(v.time_in_queue), 1) AS avg_queue_time
FROM water_source ws
JOIN visits v
ON ws.source_id = v.source_id
GROUP BY ws.type_of_water_source
ORDER BY avg_queue_time DESC;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,type_of_water_source,avg_queue_time
0,shared_tap,136.9
1,river,17.0
2,tap_in_home,0.0
3,tap_in_home_broken,0.0
4,well,0.0


## üßæ SECTION 5: Unpacking the Visits to Water Sources

### ‚è±Ô∏è Objective 5.1: Find Extremely Long Queue Times (> 500 minutes)

**Goal**:  
Identify all visit records where the `time_in_queue` exceeds 500 minutes.

**Reason**:  
These extreme values highlight high-pressure locations that may require immediate intervention or further investigation.

**Process**:
- Query the `visits` table
- Filter for `time_in_queue > 500`


In [23]:
query = """
SELECT *
FROM visits
WHERE time_in_queue > 500;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,record_id,location_id,source_id,time_of_record,visit_count,time_in_queue,assigned_employee_id
0,899,SoRu35083,SoRu35083224,2021-01-16 10:14:00,6,515,28
1,2304,SoKo33124,SoKo33124224,2021-02-06 07:53:00,5,512,16
2,2315,KiRu26095,KiRu26095224,2021-02-06 14:32:00,3,529,8
3,3206,SoRu38776,SoRu38776224,2021-02-20 15:03:00,5,509,46
4,3701,HaRu19601,HaRu19601224,2021-02-27 12:53:00,3,504,0
...,...,...,...,...,...,...,...
100,57408,SoRu35388,SoRu35388224,2023-05-27 08:52:00,5,538,1
101,57832,AkRu04093,AkRu04093224,2023-06-03 07:50:00,3,524,34
102,57843,KiRu30266,KiRu30266224,2023-06-03 16:50:00,2,533,10
103,59129,KiRu27023,KiRu27023224,2023-06-24 16:17:00,2,509,8


### üîÅ Objective 5.2: Identify Water Source Types for Long-Queue Visits

**Goal**:  
Determine which types of water sources are linked to the long-queue visits found in Objective 5.1.

**Reason**:  
To link queue problems to infrastructure types (e.g., shared taps vs wells).

**Process**:
- Use the `source_id` from previous results
- Join with the `water_source` table
- Return the `type_of_water_source`


In [24]:
# First, get unique source_ids from high-wait visits
query = """
SELECT DISTINCT 
    ws.source_id, 
    ws.type_of_water_source
FROM 
    water_source ws
JOIN visits v ON ws.source_id = v.source_id
WHERE v.time_in_queue > 500;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,source_id,type_of_water_source
0,SoRu35083224,shared_tap
1,SoKo33124224,shared_tap
2,KiRu26095224,shared_tap
3,SoRu38776224,shared_tap
4,HaRu19601224,shared_tap
...,...,...
100,SoRu35388224,shared_tap
101,AkRu04093224,shared_tap
102,KiRu30266224,shared_tap
103,KiRu27023224,shared_tap


### üîÅ Objective 5.3 (Optional): Average Queue Times by Source ID

**Goal**:  
Break down average queue time for each individual water source.

**Reason**:  
Allows granular inspection ‚Äî not just by type but by specific location, useful for field action plans.

**Process**:
- Group the `visits` table by `source_id`
- Calculate the average `time_in_queue`


In [25]:
query = """
SELECT 
    source_id,
    ROUND(AVG(time_in_queue), 1) AS avg_queue_time
FROM visits
GROUP BY source_id
ORDER BY avg_queue_time DESC
LIMIT 10;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,source_id,avg_queue_time
0,SoRu34770224,279.6
1,KiRu25391224,279.0
2,AkRu05131224,276.0
3,HaRu20126224,275.0
4,HaRu19574224,272.6
5,AkRu04807224,271.3
6,KiRu25504224,270.5
7,HaRu20440224,268.3
8,SoRu37419224,268.1
9,SoRu36631224,267.3


##  SECTION 6: Water Quality & Survey Inconsistencies

### üéØ Objective 6.1: Find High-Quality Water Sources That Were Visited More Than Once

**Goal**:  
Detect suspicious survey patterns where water sources with a perfect quality score (`10`) have multiple visits.

**Reason**:  
Surveyors reported that good sources, especially home taps, should only require **one** visit. More than one could mean duplicated effort or data entry error.

**Process**:
- Query the `water_quality` table
- Filter where `subjective_quality_score = 10` **and** `visit_count > 1`


In [26]:
query = """
SELECT *
FROM water_quality
WHERE subjective_quality_score = 10
AND visit_count > 1;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,record_id,subjective_quality_score,visit_count
0,59,10,2
1,67,10,3
2,85,10,4
3,128,10,5
4,137,10,2
...,...,...,...
1521,60041,10,6
1522,60053,10,7
1523,60073,10,7
1524,60086,10,8


### üßΩ Objective 6.2: Focus on "Tap in Home" Sources With Multiple Visits

**Goal**:  
Narrow down the quality check to **home taps**, which are expected to have low revisit rates.

**Reason**:  
Home taps are assumed to be reliable ‚Äî multiple visits may signal duplicate records or systemic recording errors.

**Process**:
- Join `water_quality` with `water_source` using `source_id`
- Filter for:
  - `type_of_water_source = 'tap_in_home'`
  - `subjective_quality_score = 10`
  - `visit_count > 1`


In [27]:
query = """
SELECT
    wq.record_id,
    wq.subjective_quality_score,
    wq.visit_count,
    ws.type_of_water_source,
    v.time_of_record,
    v.assigned_employee_id
FROM
    water_quality AS wq
JOIN
    visits AS v ON wq.record_id = v.record_id
JOIN
    water_source AS ws ON v.source_id = ws.source_id
WHERE
    wq.subjective_quality_score = 10
    AND wq.visit_count > 1
    AND ws.type_of_water_source = 'tap_in_home';
"""
pd.read_sql_query(query, engine)


Unnamed: 0,record_id,subjective_quality_score,visit_count,type_of_water_source,time_of_record,assigned_employee_id


##  SECTION 7: Investigating Well Pollution & Cleaning Data Errors

### üö® Objective 7.1: Identify Wells Marked "Clean" Despite Biological Contamination

**Goal**:  
Uncover inconsistencies where wells are marked as `Clean`, but `biological` contamination exceeds safe levels.

**Reason**:  
0.01 CFU/mL is the threshold for biological contamination. Anything above that should not be labeled as clean.

**Process**:
- Query the `well_pollution` table
- Filter where `results = 'Clean'` and `biological > 0.01`


In [28]:
query = """
SELECT *
FROM well_pollution
WHERE results = 'Clean'
AND biological > 0.01;
"""
pd.read_sql_query(query, engine)


Unnamed: 0,source_id,date,description,pollutant_ppm,biological,results


### üßæ Objective 7.2: Detect Description Fields That Falsely Start with "Clean"

**Goal**:  
Spot textual descriptions that start with "Clean" but contain evidence of contaminants.

**Reason**:  
Surveyors mistakenly copied "Clean" into descriptions even when bacteria was found.

**Process**:
- Use SQL `LIKE` to find `description` fields beginning with `"Clean "` followed by more characters


In [29]:
query = """
SELECT *
FROM well_pollution
WHERE description LIKE 'Clean %%';
"""
pd.read_sql_query(query, engine)


Unnamed: 0,source_id,date,description,pollutant_ppm,biological,results


### üõ°Ô∏è Objective 7.3: Create Temporary Backup Before Fixing the Data

**Goal**:  
Avoid corrupting the original `well_pollution` table by creating a safe copy to apply fixes.

**Process**:
- Create a temporary copy called `well_pollution_copy`


In [30]:
query = """
CREATE TABLE IF NOT EXISTS md_water_services.well_pollution_copy AS
SELECT * FROM md_water_services.well_pollution;
"""
with engine.connect() as connection:
    connection.execute(text(query))


### üßπ Objective 7.4: Fix Misleading Descriptions in `well_pollution_copy`

**Goal**:  
Remove the incorrect "Clean" prefix from bacterial descriptions.

**Process**:
Run targeted `UPDATE` queries to fix known bad descriptions.


In [31]:

with engine.connect() as connection:
    connection.execute(text("""
UPDATE well_pollution_copy
SET description = 'Bacteria: E. coli'
WHERE description = 'Clean Bacteria: E. coli';
"""))
    
with engine.connect() as connection:
    connection.execute(text("""
UPDATE well_pollution_copy
SET description = 'Bacteria: Giardia Lamblia'
WHERE description = 'Clean Bacteria: Giardia Lamblia';
"""))

### üîÑ Objective 7.5: Fix the "results" Column Where Data Indicates Contamination

**Goal**:  
Correct `results` field to ‚ÄúContaminated: Biological‚Äù where `biological > 0.01` and `results = 'Clean'`.

**Process**:
Run an `UPDATE` query on the copy table.


In [32]:
query = """
UPDATE well_pollution_copy
SET results = 'Contaminated: Biological'
WHERE biological > 0.01 AND results = 'Clean';
"""

with engine.connect() as connection:
    connection.execute(text(query))

### üîç Objective 7.6: Run a Final Check to Confirm No Errors Remain

**Goal**:  
Double-check that:
- No `results = 'Clean'` exist where `biological > 0.01`
- No descriptions start with `"Clean "` anymore

**Process**:
Run a test query on `well_pollution_copy`


In [33]:
query = """
SELECT *
FROM well_pollution_copy
WHERE description LIKE 'Clean %%'
   OR (results = 'Clean' AND biological > 0.01);
"""
pd.read_sql_query(query, engine)


Unnamed: 0,source_id,date,description,pollutant_ppm,biological,results


# Part 3


## Maji Ndogo Water Project Analysis: Complete Breakdown
1. Objectives
- Primary Goal: Assess the integrity and accuracy of water source data

- Secondary Goal: Identify any data tampering or inconsistencies

- Tertiary Goal: Compare auditor findings with original survey data

- Investigation Goal: Identify potential corruption among field employees

#### Database Structure Analysis
Method:

Generated ERD for md_water_services

Fixed incorrect cardinality between visits and water_quality to one-to-one

üìå C. Score Comparison Analysis
Method:

Joined auditor_report, visits, and water_quality tables

Compared auditor score with surveyor score

Code:

In [34]:
query = """
SELECT
    auditor_report.location_id,
    auditor_report.true_water_source_score AS auditor_score,
    wq.subjective_quality_score AS surveyor_score
FROM auditor_report
JOIN visits ON auditor_report.location_id = visits.location_id
JOIN water_quality AS wq ON visits.record_id = wq.record_id
WHERE visits.visit_count = 1
AND auditor_report.true_water_source_score = wq.subjective_quality_score;

"""
pd.read_sql_query(query, engine)

Unnamed: 0,location_id,auditor_score,surveyor_score
0,AkHa00008,3,3
1,AkHa00058,3,3
2,AkHa00068,3,3
3,AkHa00073,3,3
4,AkHa00088,1,1
...,...,...,...
1513,SoRu39501,3,3
1514,SoRu39524,9,9
1515,SoRu39588,0,0
1516,SoRu39617,9,9


Identified mismatches between auditor and surveyor scores

In [35]:
query = """
SELECT
    auditor_report.location_id,
    auditor_report.true_water_source_score AS auditor_score,
    wq.subjective_quality_score AS surveyor_score
FROM auditor_report
JOIN visits ON auditor_report.location_id = visits.location_id
JOIN water_quality AS wq ON visits.record_id = wq.record_id
WHERE visits.visit_count = 1
AND auditor_report.true_water_source_score != wq.subjective_quality_score;
"""
pd.read_sql_query(query, engine)

Unnamed: 0,location_id,auditor_score,surveyor_score
0,AkHa00053,9,10
1,AkHa00311,9,10
2,AkHa00314,9,10
3,AkHa00363,0,10
4,AkKi00847,3,10
...,...,...,...
97,SoRu38045,9,10
98,SoRu38331,3,10
99,SoRu38401,2,10
100,SoRu38535,0,10


Created a reusable view for incorrect (mismatched) records

In [39]:
query = """
SELECT
    auditorRep.location_id,
    visitsTbl.record_id,
    Empl_Table.employee_name,
    auditorRep.true_water_source_score AS auditor_score,
    wq.subjective_quality_score AS employee_score
FROM auditor_report AS auditorRep
JOIN visits AS visitsTbl
ON auditorRep.location_id = visitsTbl.location_id
JOIN water_quality AS wq
ON visitsTbl.record_id = wq.record_id
JOIN employee as Empl_Table
ON Empl_Table.assigned_employee_id = visitsTbl.assigned_employee_id;

"""
pd.read_sql_query(query, engine)

Unnamed: 0,location_id,record_id,employee_name,auditor_score,employee_score
0,AkHa00008,57788,Lalitha Kaburi,3,3
1,AkHa00053,28869,Malachi Mavuso,9,10
2,AkHa00058,11935,Lesedi Kofi,3,3
3,AkHa00068,17343,Sanaa Tendaji,3,3
4,AkHa00073,13554,Thandiwe Kito,3,3
...,...,...,...,...,...
2693,SoRu39524,47529,Bakari Iniko,9,9
2694,SoRu39544,17929,Gamba Shani,0,10
2695,SoRu39588,11356,Cheche Buhle,0,0
2696,SoRu39617,11689,Enitan Zuri,9,9


In [44]:
query = """
CREATE VIEW Incorrect_records AS
SELECT
    auditor_report.location_id,
    auditor_report.true_water_source_score AS auditor_score,
    wq.subjective_quality_score AS surveyor_score,
    visits.assigned_employee_id,
    employee.employee_name,
    auditor_report.statements
FROM auditor_report
JOIN visits ON auditor_report.location_id = visits.location_id
JOIN water_quality AS wq ON visits.record_id = wq.record_id
JOIN employee ON visits.assigned_employee_id = employee.assigned_employee_id
WHERE visits.visit_count = 1
AND auditor_report.true_water_source_score != wq.subjective_quality_score;

"""
with engine.connect() as connection:
    connection.execute(text(query)) 

In [45]:
query = """SELECT
    employee_name,
    COUNT(*) AS number_of_mistakes
FROM Incorrect_records
GROUP BY employee_name
ORDER BY number_of_mistakes DESC;
"""
pd.read_sql_query(query, engine)

Unnamed: 0,employee_name,number_of_mistakes
0,Bello Azibo,26
1,Malachi Mavuso,21
2,Zuriel Matembo,17
3,Lalitha Kaburi,7
4,Rudo Imani,5
5,Farai Nia,4
6,Enitan Zuri,4
7,Makena Thabo,3
8,Gamba Shani,3
9,Yewande Ebele,3


In [47]:
query = """
WITH mistake_counts AS (
    SELECT
        employee_name,
        COUNT(*) AS number_of_mistakes
    FROM Incorrect_records
    GROUP BY employee_name
),
avg_mistakes AS (
    SELECT AVG(number_of_mistakes) AS avg_count
    FROM mistake_counts
)
SELECT employee_name
FROM mistake_counts
WHERE number_of_mistakes = (
    SELECT MAX(number_of_mistakes)
    FROM mistake_counts
    WHERE number_of_mistakes < (SELECT avg_count FROM avg_mistakes)
);
"""
pd.read_sql_query(query, engine)

Unnamed: 0,employee_name
0,Rudo Imani
