## Observations and Insights 

In [35]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

# Combine the data into a single dataset
mouse_study_df = pd.merge(mouse_metadata, study_results, how='outer', on='Mouse ID')
mouse_study_df.head()


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [36]:
# Checking the number of mice.
mouse_study_df['Mouse ID'].value_counts()


g989    13
k403    10
j365    10
j984    10
k210    10
        ..
v199     1
t573     1
f932     1
b447     1
u153     1
Name: Mouse ID, Length: 249, dtype: int64

In [37]:
mouse_study_df.count()


Mouse ID              1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64

In [38]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mouse = mouse_study_df.loc[mouse_study_df.duplicated(subset=["Mouse ID", "Timepoint",]),"Mouse ID"].unique()

duplicate_mouse


array(['g989'], dtype=object)

In [39]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicate_mouse = mouse_study_df.loc[(mouse_study_df['Mouse ID'] == 'g989')] 

duplicate_mouse


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [40]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
cleaned_study_df = mouse_study_df.loc[(mouse_study_df['Mouse ID'] != 'g989')] 
cleaned_study_df 


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [41]:
# Checking the number of mice in the clean DataFrame.
cleaned_study_df['Mouse ID'].value_counts()


k403    10
o287    10
j984    10
k210    10
k382    10
        ..
h428     1
o848     1
t573     1
d133     1
x226     1
Name: Mouse ID, Length: 248, dtype: int64

## Summary Statistics

In [52]:
# Generate a summary statistics table of mean, median, variance, standard deviation, 
# and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

regimens_df = cleaned_study_df.groupby("Drug Regimen")
tumor_mean = round(regimens_df['Tumor Volume (mm3)'].mean(), 2)
tumor_median = round(regimens_df['Tumor Volume (mm3)'].median(), 2)
tumor_variance = round(regimens_df['Tumor Volume (mm3)'].var(),2)
tumor_std = round(regimens_df['Tumor Volume (mm3)'].std(), 2)
tumor_sem = round(regimens_df['Tumor Volume (mm3)'].sem(ddof=0), 4)

summary_stat_1 = {"Mean": tumor_mean,
                "Median": tumor_median,
                "Variance": tumor_variance,
                "Std Deviation": tumor_std,
                "SEM": tumor_sem}

summary_stat_1_df = pd.DataFrame(summary_stat_1)
summary_stat_1_df.columns=pd.MultiIndex.from_product([['Tumor Volume Summary Stats'],summary_stat_1_df.columns])
summary_stat_1_df


Unnamed: 0_level_0,Tumor Volume Summary Stats,Tumor Volume Summary Stats,Tumor Volume Summary Stats,Tumor Volume Summary Stats,Tumor Volume Summary Stats
Unnamed: 0_level_1,Mean,Median,Variance,Std Deviation,SEM
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.68,41.56,24.95,4.99,0.3286
Ceftamin,52.59,51.78,39.29,6.27,0.4685
Infubinol,52.88,51.82,43.13,6.57,0.4909
Ketapril,55.24,53.7,68.55,8.28,0.6023
Naftisol,54.33,52.51,66.17,8.13,0.5949
Placebo,54.03,52.29,61.17,7.82,0.5797
Propriva,52.32,50.45,43.85,6.62,0.5425
Ramicane,40.22,40.67,23.49,4.85,0.3202
Stelasyn,54.23,52.43,59.45,7.71,0.5715
Zoniferol,53.24,51.82,48.53,6.97,0.515


In [53]:
# Generate a summary statistics table of mean, median, variance, standard deviation, 
# and SEM of the tumor volume for each regimen
summary_stat_2_df = cleaned_study_df.groupby(cleaned_study_df["Drug Regimen"])['Tumor Volume (mm3)'].agg(["mean", 
                                               "median", 
                                               "var", 
                                               "std", 
                                               "sem"])



# Clean Dataframe
summary_stat_2_df["mean"] = summary_stat_2_df["mean"].map("{:,.2f}".format)
summary_stat_2_df["median"] = summary_stat_2_df["median"].map("{:,.2f}".format)
summary_stat_2_df["var"] = summary_stat_2_df["var"].map("{:,.2f}".format)
summary_stat_2_df["std"] = summary_stat_2_df["std"].map("{:,.2f}".format)
summary_stat_2_df["sem"] = summary_stat_2_df["sem"].map("{:,.4f}".format)

summary_stat_2_df

Unnamed: 0_level_0,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.68,41.56,24.95,4.99,0.3293
Ceftamin,52.59,51.78,39.29,6.27,0.4698
Infubinol,52.88,51.82,43.13,6.57,0.4922
Ketapril,55.24,53.7,68.55,8.28,0.6039
Naftisol,54.33,52.51,66.17,8.13,0.5965
Placebo,54.03,52.29,61.17,7.82,0.5813
Propriva,52.32,50.45,43.85,6.62,0.5443
Ramicane,40.22,40.67,23.49,4.85,0.321
Stelasyn,54.23,52.43,59.45,7.71,0.5731
Zoniferol,53.24,51.82,48.53,6.97,0.5164


## Bar and Pie Charts

In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
