## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mouseandstudy_df = pd.merge(mouse_metadata, study_results, how="outer", on="Mouse ID")

# Display the data table for preview
mouseandstudy_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [2]:
# Checking for missing values. There do not seem to be any in this data set
mouseandstudy_df.count()

Mouse ID              1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64

In [3]:
# Checking the number of mice.
num_mice = len(mouseandstudy_df["Mouse ID"].unique())
num_mice

#is this supposed to be displaying the count before removing the duplicates or actual unique number of mice?

249

In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mice = mouseandstudy_df[["Mouse ID", "Timepoint"]]
duplicate_mice_sorted = duplicate_mice.sort_values("Mouse ID")
duplicate_mice_sorted = duplicate_mice_sorted.reset_index(drop=True)
duplicate_mice_sorted

# ?????? not sure exactly what they are looking for here

Unnamed: 0,Mouse ID,Timepoint
0,a203,30
1,a203,35
2,a203,25
3,a203,20
4,a203,15
...,...,...
1888,z969,20
1889,z969,25
1890,z969,30
1891,z969,35


In [5]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicate_mice_all_data = mouseandstudy_df.sort_values("Mouse ID")
duplicate_mice_all_data = duplicate_mice_all_data.reset_index(drop=True)
duplicate_mice_all_data

# ?????? not really sure about this either

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,a203,Infubinol,Female,20,23,30,59.523197,1
1,a203,Infubinol,Female,20,23,35,61.931650,2
2,a203,Infubinol,Female,20,23,25,56.793208,1
3,a203,Infubinol,Female,20,23,20,55.173336,1
4,a203,Infubinol,Female,20,23,15,52.777870,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,20,57.898778,2
1889,z969,Naftisol,Male,9,30,25,63.145652,2
1890,z969,Naftisol,Male,9,30,30,65.841013,3
1891,z969,Naftisol,Male,9,30,35,69.176246,4


In [6]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
no_duplicates = duplicate_mice_all_data.drop_duplicates('Mouse ID')
new_data = no_duplicates.reset_index(drop=True)
new_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,a203,Infubinol,Female,20,23,30,59.523197,1
1,a251,Infubinol,Female,21,25,20,54.462594,1
2,a262,Placebo,Female,17,29,15,53.827974,2
3,a275,Ceftamin,Female,20,28,30,54.444713,2
4,a366,Stelasyn,Female,16,29,20,57.285987,0
...,...,...,...,...,...,...,...,...
244,z435,Propriva,Female,12,26,0,45.000000,0
245,z578,Ramicane,Male,11,16,0,45.000000,0
246,z581,Infubinol,Female,24,25,45,62.754451,3
247,z795,Naftisol,Female,13,29,30,59.789636,2


In [7]:
# Checking the number of mice in the clean DataFrame.
num_of_mice_nodups = len(new_data["Mouse ID"].unique())
num_of_mice_nodups


249

## Summary Statistics

In [47]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor 
# volume for each regimen
summary_stats = new_data.groupby("Drug Regimen")
data_mean = summary_stats["Tumor Volume (mm3)"].mean()
data_median = summary_stats["Tumor Volume (mm3)"].median()
data_variance = data_variance = summary_stats["Tumor Volume (mm3)"].var()
data_std = summary_stats["Tumor Volume (mm3)"].std()
data_sem = summary_stats["Tumor Volume (mm3)"].sem()

summary_stats_all = pd.DataFrame({
                                    "Mean":data_mean,
                                    "Median":data_median,
                                    "Variance":data_variance,
                                    "Standard Deviation":data_std,
                                    "SEM":data_sem
                                 
                                 })
summary_stats_all

# This method is the most straighforward, creating multiple series and putting them all together at the end.

# Is this right? Also, can I add "Tumor Volume (mm3)" for labeling purposes

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,41.341391,44.708055,25.748746,5.074322,1.014864
Ceftamin,50.773651,47.242929,43.115483,6.566238,1.313248
Infubinol,53.954813,54.462594,85.828896,9.264389,1.852878
Ketapril,52.656434,49.762415,80.220942,8.956614,1.791323
Naftisol,54.037593,48.78656,100.729904,10.036429,2.007286
Placebo,51.798419,47.459053,79.077368,8.892546,1.778509
Propriva,49.42154,49.145709,23.945895,4.893454,0.978691
Ramicane,41.045463,44.51256,31.746561,5.634409,1.126882
Stelasyn,52.371257,48.459299,76.671957,8.756252,1.787363
Zoniferol,51.709856,47.894441,59.290092,7.700006,1.540001


In [51]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the 
# tumor volume for each regimen

# This method produces everything in a single groupby function
summary_data_grouped = new_data.groupby("Drug Regimen").agg({"Tumor Volume (mm3)":["mean","median","var","std","sem"]})
summary_data_grouped = summary_data_grouped.rename(columns={"mean":"Mean","median":"Median","var":"Variance","std":"Standard Deviation","sem":"SEM"})
summary_data_grouped

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,41.341391,44.708055,25.748746,5.074322,1.014864
Ceftamin,50.773651,47.242929,43.115483,6.566238,1.313248
Infubinol,53.954813,54.462594,85.828896,9.264389,1.852878
Ketapril,52.656434,49.762415,80.220942,8.956614,1.791323
Naftisol,54.037593,48.78656,100.729904,10.036429,2.007286
Placebo,51.798419,47.459053,79.077368,8.892546,1.778509
Propriva,49.42154,49.145709,23.945895,4.893454,0.978691
Ramicane,41.045463,44.51256,31.746561,5.634409,1.126882
Stelasyn,52.371257,48.459299,76.671957,8.756252,1.787363
Zoniferol,51.709856,47.894441,59.290092,7.700006,1.540001


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study
# using pandas. 



In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study 
# using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
