# Amplity Health – NLP Data Analyst Assessment 

## 1. Introduction

## 2. Setup and Imports

Below, we import the necessary libraries for data manipulation, visualization, and date handling.

In [3]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Date handling
from datetime import datetime

## 3. Data Loading

Loading the dataset and taking a quick look at its structure.

In [6]:
# Read the CSV with pandas' default type inference and list the dtype
# chosen for each column, as a baseline before setting dtypes explicitly.
df_dtypes_check = pd.read_csv("Amplity Health - Demographics Dataset.csv")

df_dtypes_check.dtypes

PatientID      int64
Gender        object
Age            int64
Ethnicity     object
State         object
Treatment     object
Pain Level    object
Source        object
Verbatim      object
dtype: object

In [12]:
# Explicit dtype for every column of the demographics CSV.
# PatientID is read as a string instead of the inferred int64, so the
# identifier is handled as a label rather than a number.
# The mapping previously carried a "#Docs" key, which is not a column of this
# file; the ninth column is "Verbatim", so that is what is declared here.
dtypes = {
    "PatientID": "string",
    "Gender": "object",
    "Age": "int64",
    "Ethnicity": "object",
    "State": "object",
    "Treatment": "object",
    "Pain Level": "object",
    "Source": "object",
    "Verbatim": "object",
}

# Load the dataset with the dtypes declared above
df_demographics_dataset = pd.read_csv('Amplity Health - Demographics Dataset.csv', dtype=dtypes)

# Confirm the dtypes that were actually applied
df_demographics_dataset.dtypes

PatientID     string[python]
Gender                object
Age                    int64
Ethnicity             object
State                 object
Treatment             object
Pain Level            object
Source                object
Verbatim              object
dtype: object

In [8]:
# Preview the first five rows of the typed dataset
print(df_demographics_dataset.head(n=5))

           PatientID  Gender  Age  Ethnicity       State Treatment Pain Level  \
0  62129246511442414    Male   73        NaN  California       NaN          6   
1  62129246511442414    Male   72   Hispanic  California       NaN          7   
2  62129246511442414    Male   72        NaN  California       NaN          7   
3  13742511111862220  Female   69        NaN       Texas       NaN          7   
4  24323368642361291  Female   69  Caucasian       Texas       NaN          4   

  Source Verbatim  
0    NaN      NaN  
1    NaN      NaN  
2    NaN      NaN  
3    NaN      NaN  
4    NaN      NaN  


## 4. Age Analysis by Treatment

### 4.1 Raw dataset first insights

In [17]:
# Dataset dimensions: print the shape tuple of the loaded frame
print(f"Demographic dataset shape: {df_demographics_dataset.shape}")

Demographic dataset shape: (152829, 9)


In [18]:
# Count the missing entries in each column
missing_per_column = df_demographics_dataset.isna().sum()
print(missing_per_column)

PatientID          0
Gender           419
Age                0
Ethnicity      97641
State             18
Treatment      64556
Pain Level    149793
Source        152244
Verbatim      152589
dtype: int64


In [19]:
# Descriptive statistics (count, mean, std, min, quartiles, max) as reported
# by DataFrame.describe() with its default column selection
summary_statistics = df_demographics_dataset.describe()
print(summary_statistics)

                 Age
count  152829.000000
mean       49.863848
std        16.883897
min        18.000000
25%        37.000000
50%        50.000000
75%        62.000000
max       103.000000


### 4.2 Change to the corresponding dataframe format

In [20]:
# Collapse the dataset to one row per patient, keeping the highest Age
# recorded for that PatientID
df_max_age = (
    df_demographics_dataset
    .groupby('PatientID')['Age']
    .max()
    .reset_index()
)

      Treatment       mean   min    max   50%
0         Acute  45.025580  18.0  100.0  45.0
1  Preventative  51.421936  18.0  103.0  51.0


In [25]:
# Row label of the record holding the maximum Age for each PatientID
idx_max_age = df_demographics_dataset.groupby('PatientID')['Age'].idxmax()

# Keep only those records, restricted to the columns needed for the age
# analysis, and renumber the rows from zero
df_max_age = (
    df_demographics_dataset
    .loc[idx_max_age, ['PatientID', 'Age', 'Treatment']]
    .reset_index(drop=True)
)

# Summary statistics over the whole one-row-per-patient frame.
# Note: this is not split by Treatment; describe() is applied to the frame as a whole.
age_analysis = df_max_age.describe().reset_index()

# Display the age analysis
print(age_analysis)

   index            Age
0  count  121356.000000
1   mean      50.063969
2    std      17.094464
3    min      18.000000
4    25%      37.000000
5    50%      50.000000
6    75%      62.000000
7    max     103.000000


In [None]:
# Age statistics per treatment, computed from the raw dataset.
#
# This cell used to merge `df_max_age` back into itself. That only worked when
# `df_max_age` had no 'Treatment' column yet: with one present, the merge
# yields 'Treatment_x'/'Treatment_y' and the pivot on 'Treatment' fails, and
# re-running the cell had the same effect. Building every intermediate from
# `df_demographics_dataset` makes the cell idempotent and independent of the
# order in which earlier cells were run. `df_max_age` is no longer overwritten.

# Highest recorded Age for each patient
patient_max_age = df_demographics_dataset.groupby('PatientID', as_index=False)['Age'].max()

# Distinct (patient, treatment) pairs; records with no treatment are dropped
# here (the pivot below would discard them anyway), and duplicates are removed
# so the merge does not repeat a patient once per raw record
patient_treatments = (
    df_demographics_dataset[['PatientID', 'Treatment']]
    .dropna(subset=['Treatment'])
    .drop_duplicates()
)

# Attach each patient's maximum age to every treatment that patient received
df_max_age_by_treatment = patient_treatments.merge(patient_max_age, on='PatientID', how='left')

# Pivot the data to have one row per PatientID and one column per treatment
df_max_age_pivot = df_max_age_by_treatment.pivot_table(
    index='PatientID', columns='Treatment', values='Age', aggfunc='max'
)

# Calculate statistics for each treatment (mean, min, max and median)
age_analysis = df_max_age_pivot.describe().T[['mean', 'min', 'max', '50%']].reset_index()

# Display the age analysis
print(age_analysis)

In [21]:
# Treatment types and how many records carry each one
# (value_counts() leaves missing values out by default)
treatment_counts = df_demographics_dataset['Treatment'].value_counts()
print(treatment_counts)

Treatment
Preventative    67988
Acute           20285
Name: count, dtype: int64


## 5. Pain Level Analysis by Treatment