In [1]:
# import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

In [2]:
# locations of the two CSV inputs (relative paths)
mouse_metadata_path, study_results_path = (
    "data/Mouse_metadata.csv",
    "data/Study_results.csv",
)

# read each CSV into its own DataFrame
mouse_metadata_df, study_results_df = map(
    pd.read_csv, (mouse_metadata_path, study_results_path)
)

In [3]:
# join the mouse metadata with the study results on 'Mouse ID';
# the outer join keeps every Mouse ID that appears in either table
rawData_df = mouse_metadata_df.merge(study_results_df, how='outer', on='Mouse ID')

# show the first five rows of the merged DataFrame
rawData_df.head(5)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


## Prepare the Data

In [4]:
# report how many distinct Mouse IDs the merged data holds
# (counted on the single column rather than on every column of the frame)
print(f"The given dataset contains {rawData_df['Mouse ID'].nunique()} unique instances of Mouse ID.")

The given dataset contains 249 unique instances of Mouse ID.


In [5]:
# check for duplicated timepoints
#
# A mouse should have at most one row per Timepoint. Any Mouse ID that has a
# Timepoint recorded more than once is reported and then excluded entirely
# from cleanData_df; rawData_df itself is left untouched.

# create a list of the unique Mouse IDs
# NOTE(review): uniqueMice and allMice are not needed by the check below; they
# are kept in case cells outside this view reference them
uniqueMice = rawData_df['Mouse ID'].unique().tolist()

# create a list of the Mouse IDs
allMice = rawData_df['Mouse ID'].tolist()

# count the rows for every (Mouse ID, Timepoint) pair in one pass instead of
# re-filtering the whole DataFrame once per mouse; sort=False keeps the mice in
# order of first appearance, and rows with a missing key are skipped, so a
# mouse without any Timepoint values no longer breaks the check
timepoint_counts = rawData_df.groupby(['Mouse ID', 'Timepoint'], sort=False).size()

# the largest repeat count per mouse; a value above 1 means a duplicate exists
max_repeats = timepoint_counts.groupby(level='Mouse ID', sort=False).max()
duplicated_mice = max_repeats[max_repeats > 1]

# report each affected Mouse ID together with a preview of its rows
for mouse, timepoint_duplicate in duplicated_mice.items():
    mouse_df = rawData_df.loc[rawData_df['Mouse ID'] == mouse, :]
    print(f'The Mouse ID, {mouse}, contains {timepoint_duplicate}x duplicated Timepoint data.')
    print()
    print(mouse_df.head())

# build the cleaned data as an independent copy of the raw data with every
# affected Mouse ID removed (the original row index is preserved)
cleanData_df = rawData_df.loc[~rawData_df['Mouse ID'].isin(duplicated_mice.index), :].copy()

The Mouse ID, g989, contains 2x duplicated Timepoint data.

    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
908     g989     Propriva  Female          21          26          0   
909     g989     Propriva  Female          21          26          0   
910     g989     Propriva  Female          21          26          5   
911     g989     Propriva  Female          21          26          5   
912     g989     Propriva  Female          21          26         10   

     Tumor Volume (mm3)  Metastatic Sites  
908           45.000000                 0  
909           45.000000                 0  
910           48.786801                 0  
911           47.570392                 0  
912           51.745156                 0  


In [6]:
# report how many distinct Mouse IDs remain in the cleaned data
# (counted on the single column rather than on every column of the frame)
print(f"The given dataset contains {cleanData_df['Mouse ID'].nunique()} unique instances of Mouse ID.")

The given dataset contains 248 unique instances of Mouse ID.


## Generate Summary Statistics

In [10]:
# create the statistical groups
#
# Select the tumor volume column BEFORE aggregating: aggregating the whole
# frame first computes every statistic for every column only to discard all
# but one, and on pandas >= 2.0 it raises TypeError for the text columns
# because numeric_only now defaults to False.
# NOTE(review): the statistics are computed per Mouse ID; confirm that a
# per-'Drug Regimen' summary was not the intent.
tumorVolume_byMouse = cleanData_df.groupby(['Mouse ID'])['Tumor Volume (mm3)']

# each result is a Series named 'Tumor Volume (mm3)' indexed by Mouse ID
groupedData_mean = tumorVolume_byMouse.mean()
groupedData_median = tumorVolume_byMouse.median()
groupedData_std = tumorVolume_byMouse.std()
groupedData_var = tumorVolume_byMouse.var()
groupedData_sem = tumorVolume_byMouse.sem()

In [16]:
# because the five stats groups were grouped by the same column, their index columns will be identical
# NOTE(review): `keys` is not used in this cell; kept in case later cells reference it
keys = groupedData_mean.keys()

# place the mean and median side by side, aligned on the shared Mouse ID index;
# both Series are named 'Tumor Volume (mm3)', so explicit column labels are
# supplied to avoid two identically named columns in the result
pd.concat(
    [groupedData_mean, groupedData_median],
    axis=1,
    keys=['Mean Tumor Volume (mm3)', 'Median Tumor Volume (mm3)'],
)

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3)
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1
a203,56.312708,55.983272
a251,55.237095,55.938620
a262,58.611631,58.572588
a275,52.770965,52.044750
a366,54.639906,55.265582
...,...,...
z435,46.942214,47.115980
z578,38.411510,38.160380
z581,53.979584,53.062175
z795,54.880317,55.613879
