## Observations and Insights 

In [107]:
# Dependencies and Setup
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


# Combine the data into a single dataset
Combined_Results = mouse_metadata.merge(study_results, on="Mouse ID")

# Display the data table for preview
Combined_Results.head(20)



Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [94]:
# Checking the number of mice.
len(Combined_Results["Mouse ID"].unique())



249

In [95]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

Duplicate_Mouse = Combined_Results.loc[Combined_Results.duplicated(subset=['Mouse ID', 'Timepoint']),'Mouse ID'].unique()
Duplicate_Mouse


array(['g989'], dtype=object)

In [96]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
Cleaned_df = Combined_Results[Combined_Results["Mouse ID"].isin(Duplicate_Mouse)==False]
Cleaned_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [97]:
# Checking the number of mice in the clean DataFrame.
Check = len(Cleaned_df["Mouse ID"].unique())
Check

248

## Summary Statistics

In [98]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
by_drug_mean = Cleaned_df.groupby("Drug Regimen").mean()["Tumor Volume (mm3)"]
by_drug_mean

by_drug_median = Cleaned_df.groupby("Drug Regimen").median()["Tumor Volume (mm3)"]
by_drug_median

by_drug_var = Cleaned_df.groupby("Drug Regimen").var()["Tumor Volume (mm3)"]
by_drug_var

by_drug_sd = Cleaned_df.groupby("Drug Regimen").std()["Tumor Volume (mm3)"]
by_drug_sd

by_drug_SEM = Cleaned_df.groupby("Drug Regimen").sem()["Tumor Volume (mm3)"]
by_drug_SEM

Summary_Statistics_Drugs = pd.DataFrame({
    "Mean": by_drug_mean,
    "Median": by_drug_median,
    "Variance": by_drug_var,
    "StandardDev": by_drug_sd,
    "SEM": by_drug_SEM    
})
Summary_Statistics_Drugs


Unnamed: 0_level_0,Mean,Median,Variance,StandardDev,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [99]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [100]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 

mice_per_treatment = Cleaned_df[["Drug Regimen", "Mouse ID"]]
mice_per_treament_count = mice_per_treatment.groupby("Drug Regimen").count()["Mouse ID"]
mice_per_treament_count


Mice_per_treatment_df = pd.DataFrame({
    "Mouse ID" : mice_per_treament_count
})

Mice_per_treatment_df.plot(kind="bar", figsize=(8,5))

plt.title("Mice Per Treatment")
plt.show()
plt.tight_layout()

<IPython.core.display.Javascript object>

In [101]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
x_axis = np.arange(len(Mice_per_treatment_df))
tick_locations = [value for value in x_axis]
plt.figure(figsize=(8, 5))
plt.bar(x_axis, Mice_per_treatment_df["Mouse ID"], color='blue', alpha=0.5, align="center")
plt.xticks(tick_locations, Cleaned_df["Drug Regimen"], rotation="vertical")
plt.tight_layout()
plt.show()


<IPython.core.display.Javascript object>

In [104]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

Mice_Gender = Cleaned_df[["Mouse ID", "Sex"]]
Mice_Gender_count = Mice_Gender.groupby("Sex").count()["Mouse ID"]
Mice_Gender_count_df = pd.DataFrame({
    "Sex_Count": Mice_Gender_count
})

Mice_Gender_count_df.plot(kind = "pie", subplots = True, figsize = (5,5))
plt.show()


<IPython.core.display.Javascript object>

In [106]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

labels = ["Male", "Female"]
number_of_each_gender = Mice_Gender_count_df["Sex_Count"]
colors = ["Orange", "Blue"]
plt.pie(number_of_each_gender, explode=None, labels=labels, colors=colors, autopct="%1.0f%%", shadow=True, startangle=140)
plt.show

<function matplotlib.pyplot.show(*args, **kw)>

## Quartiles, Outliers and Boxplots

In [121]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
Tumor_Volume = Cleaned_df[["Drug Regimen", "Mouse ID","Timepoint", "Tumor Volume (mm3)"]]
Tumor_Volume
#Tumor_Volume_max = Tumor_Volume.groupby("Drug Regimen").max()["Timepoint"]
#Tumor_Volume_max
#Cleaned_df.groupby("Drug Regimen").max()("Timepoint")

#mice_per_treatment = Cleaned_df[["Drug Regimen", "Mouse ID"]]
#mice_per_treament_count = mice_per_treatment.groupby("Drug Regimen").count()["Mouse ID"]
#mice_per_treament_count
# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


Unnamed: 0,Drug Regimen,Mouse ID,Timepoint,Tumor Volume (mm3)
0,Ramicane,k403,0,45.000000
1,Ramicane,k403,5,38.825898
2,Ramicane,k403,10,35.014271
3,Ramicane,k403,15,34.223992
4,Ramicane,k403,20,32.997729
...,...,...,...,...
1888,Naftisol,z969,25,63.145652
1889,Naftisol,z969,30,65.841013
1890,Naftisol,z969,35,69.176246
1891,Naftisol,z969,40,70.314904


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
