## Observations and Insights 

In [24]:
%matplotlib inline

In [39]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV inputs (relative paths)
study_results_path = "data/Study_results.csv"
mouse_metadata_path = "data/Mouse_metadata.csv"

# Load the study results and the mouse metadata
study_results = pd.read_csv(study_results_path)
mouse_metadata = pd.read_csv(mouse_metadata_path)

# Join both tables on the shared "Mouse ID" key; the outer join keeps IDs
# that appear in only one of the two files
combined_mice_df = pd.merge(
    left=study_results,
    right=mouse_metadata,
    on="Mouse ID",
    how="outer",
)


In [26]:
# Order the merged rows by mouse, then by timepoint within each mouse
mice_sorted_df = combined_mice_df.sort_values(["Mouse ID", "Timepoint"])
mice_sorted_df

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
898,a203,0,45.000000,0,Infubinol,Female,20,23
899,a203,5,48.508468,0,Infubinol,Female,20,23
900,a203,10,51.852437,1,Infubinol,Female,20,23
901,a203,15,52.777870,1,Infubinol,Female,20,23
902,a203,20,55.173336,1,Infubinol,Female,20,23
...,...,...,...,...,...,...,...,...
105,z969,25,63.145652,2,Naftisol,Male,9,30
106,z969,30,65.841013,3,Naftisol,Male,9,30
107,z969,35,69.176246,4,Naftisol,Male,9,30
108,z969,40,70.314904,4,Naftisol,Male,9,30


In [27]:
# Count the distinct Mouse IDs in the sorted data.
# dropna=False keeps the count identical to len(<column>.unique()),
# which would also count a missing ID as one value.
number_of_mice = mice_sorted_df["Mouse ID"].nunique(dropna=False)
number_of_mice

249

In [28]:
# Collapse the sorted data to a single row per mouse.
# mice_sorted_df is ordered by "Mouse ID" and then "Timepoint", so keep="last"
# retains each mouse's row at its latest recorded timepoint.
# `subset` and `keep` are passed by keyword: pandas 2.0+ rejects a positional
# `keep` argument with a TypeError.
# NOTE(review): this does not identify or remove rows duplicated on
# Mouse ID + Timepoint, and mice_sorted_df itself is left unchanged — confirm
# whether a separate duplicate-timepoint cleanup is still required.
de_duped_mice_df = mice_sorted_df.drop_duplicates(subset="Mouse ID", keep="last")
de_duped_mice_df

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
907,a203,45,67.973419,2,Infubinol,Female,20,23
289,a251,45,65.525743,1,Infubinol,Female,21,25
1134,a262,45,70.717621,4,Placebo,Female,17,29
1487,a275,45,62.999356,3,Ceftamin,Female,20,28
736,a366,30,63.440686,1,Stelasyn,Female,16,29
...,...,...,...,...,...,...,...,...
1566,z435,10,48.710661,0,Propriva,Female,12,26
646,z578,45,30.638696,0,Ramicane,Male,11,16
279,z581,45,62.754451,3,Infubinol,Female,24,25
683,z795,45,65.741070,3,Naftisol,Female,13,29


In [29]:
# Sanity check: the number of non-null Mouse ID entries in the de-duplicated
# frame must equal the number of distinct mice counted earlier.
assert number_of_mice == de_duped_mice_df["Mouse ID"].count()

In [52]:
# List the distinct drug regimens present in the sorted study data
mice_sorted_df.loc[:, "Drug Regimen"].unique()

array(['Infubinol', 'Placebo', 'Ceftamin', 'Stelasyn', 'Zoniferol',
       'Ramicane', 'Ketapril', 'Propriva', 'Naftisol', 'Capomulin'],
      dtype=object)

## Summary Statistics

Generate a summary statistics table of the mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen.


In [72]:
# Group the sorted study data by drug regimen and select the tumor-volume
# column; no statistic is computed here, only the grouped series is prepared.
# NOTE(review): the grouping uses mice_sorted_df, not de_duped_mice_df, so
# every recorded timepoint row contributes — confirm that is intended.
drug_regimen_group = mice_sorted_df.groupby("Drug Regimen")
tumor_series_group = drug_regimen_group["Tumor Volume (mm3)"]

In [73]:
# Mean tumor volume for each drug regimen
tumor_mean = tumor_series_group.mean()

In [74]:
# Median tumor volume for each drug regimen
tumor_median = tumor_series_group.median()

In [75]:
# Standard deviation of tumor volume for each drug regimen (pandas default settings)
tumor_std = tumor_series_group.std()

In [76]:
# Variance of tumor volume for each drug regimen (pandas default settings)
tumor_variance = tumor_series_group.var()

In [77]:
# Standard error of the mean (SEM) of tumor volume for each drug regimen
tumor_sem = tumor_series_group.sem()

In [78]:
# Assemble the per-regimen statistics into a single table,
# one column per statistic and one row per drug regimen
summary_df = pd.DataFrame(
    {
        "Mean": tumor_mean,
        "Median": tumor_median,
        "Standard Deviation": tumor_std,
        "Variance": tumor_variance,
        "SEM": tumor_sem,
    }
)
summary_df

Unnamed: 0_level_0,Mean,Median,Standard Deviation,Variance,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,4.994774,24.947764,0.329346
Ceftamin,52.591172,51.776157,6.268188,39.290177,0.469821
Infubinol,52.884795,51.820584,6.567243,43.128684,0.492236
Ketapril,55.235638,53.698743,8.279709,68.553577,0.60386
Naftisol,54.331565,52.509285,8.134708,66.173479,0.596466
Placebo,54.033581,52.288934,7.821003,61.168083,0.581331
Propriva,52.322552,50.854632,6.50777,42.35107,0.512884
Ramicane,40.216745,40.673236,4.846308,23.486704,0.320955
Stelasyn,54.233149,52.431737,7.710419,59.450562,0.573111
Zoniferol,53.236507,51.818479,6.966589,48.533355,0.516398


## Bar Plots

In [9]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [10]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [14]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [15]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [16]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [17]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
