## Observations and Insights 

In [84]:
# kmr Pymaceuticals Matplotlib Challenge 

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Look at datasets for understanding
#print(mouse_metadata)
#print(study_results)

# Combine the data into a single dataset
combined = pd.merge(mouse_metadata, study_results, how='outer', on='Mouse ID')
#combined.head(20)

print(len(combined))
# Display the data table for preview
combined.head(20)

1893


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [85]:
# Checking the number of mice.
mousecount = combined['Mouse ID'].unique()
print(len(mousecount))

249


In [86]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
#combined_two = combined.groupby(['Mouse ID', 'Timepoint']).count()
#combined_two.head(20)
duphunter = combined.groupby(["Mouse ID"]).count()["Timepoint"]
duptable = pd.DataFrame({"Number of Timepoints": duphunter}).sort_values(["Number of Timepoints"], ascending=False)
duptable.head(5)


Unnamed: 0_level_0,Number of Timepoints
Mouse ID,Unnamed: 1_level_1
g989,13
a203,10
n364,10
m546,10
m550,10


In [87]:
# Optional: Get all the data for the duplicate mouse ID. 
mouseindex = combined.set_index("Mouse ID")
#mouseindex.head()
mouse989 = mouseindex.loc["g989"]
mouse989.head(20)

Unnamed: 0_level_0,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
g989,Propriva,Female,21,26,0,45.0,0
g989,Propriva,Female,21,26,0,45.0,0
g989,Propriva,Female,21,26,5,48.786801,0
g989,Propriva,Female,21,26,5,47.570392,0
g989,Propriva,Female,21,26,10,51.745156,0
g989,Propriva,Female,21,26,10,49.880528,0
g989,Propriva,Female,21,26,15,51.325852,1
g989,Propriva,Female,21,26,15,53.44202,0
g989,Propriva,Female,21,26,20,55.326122,1
g989,Propriva,Female,21,26,20,54.65765,1


In [88]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
#combinedtwo = combined.drop(["Mouse ID" != "g989"])
#combinedtwo.head()
#combined = combined[combined.Mouse ID !='g989']
#combined = combined(combined["Mouse ID" !="g989"])
deduped = mouseindex.drop(["g989"])
deduped.head()

Unnamed: 0_level_0,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
k403,Ramicane,Male,21,16,0,45.0,0
k403,Ramicane,Male,21,16,5,38.825898,0
k403,Ramicane,Male,21,16,10,35.014271,1
k403,Ramicane,Male,21,16,15,34.223992,1
k403,Ramicane,Male,21,16,20,32.997729,1


In [89]:
# Checking the number of mice in the clean DataFrame.
print(len(deduped))

#print(len(deduped).unique)
#print(len[deduped["Mouse ID"]])
#deduped.unique()
#print(deduped).unique()
#mousecounttwo = deduped["Mouse ID"].unique()
#mousecounttwo = deduped['Mouse ID'].unique()
#mousecounttwo = deduped(["Mouse ID"]).unique()
#mousecounttwo = deduped(['Mouse ID']).unique()
#mousecounttwo = deduped(deduped['Mouse ID']).unique()
#print(len(mousecounttwo))
#print(mousecounttwo)

1880


## Summary Statistics

In [98]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

mean_df = deduped.groupby(["Drug Regimen"]).mean()["Tumor Volume (mm3)"].rename("Tumor Mean")
#mean_df.head(11)
median_df = deduped.groupby(["Drug Regimen"]).median()["Tumor Volume (mm3)"].rename("Tumor Median")
#median_df.head(11)
variance_df = deduped.groupby(["Drug Regimen"]).var()["Tumor Volume (mm3)"].rename("Tumor Variance")
#variance_df.head(11)
deviation_df = deduped.groupby(["Drug Regimen"]).std()["Tumor Volume (mm3)"].rename("Tumor Std Dev")
#deviation_df.head(11)
sem_df = deduped.groupby(["Drug Regimen"]).sem()["Tumor Volume (mm3)"].rename("Tumor Std Error")
#sem_df.head(11)

# Build a dataframe to present a consolidated table
new_df = pd.DataFrame({"Tumor Mean": mean_df, "Tumor Median": median_df,"Tumor Variance": variance_df,"Tumor Std Dev": deviation_df,"Tumor Std Error": sem_df}
new_df                          
                           
                           

# Format the dataframe for a more friendly presentation
#spend_df["Total Purchase Value"] = spend_df["Total Purchase Value"].map("${:.2f}".format)
#spend_df["Average Purchase Price"] = spend_df["Average Purchase Price"].map("${:.2f}".format)
#spend_df.head(12)

# Build dataframes for each desired metric and calculate the metrics
#price_df = purchasedata_df.groupby(["SN"]).sum()["Price"].rename("Total Purchase Value")
#item_df = purchasedata_df.groupby(["SN"]).count()["Price"].rename("Purchase Count")
#avg_df=purchasedata_df.groupby(["SN"]).mean()["Price"].rename("Average Purchase Value")


SyntaxError: invalid syntax (<ipython-input-98-1f4777f2de47>, line 20)

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.



In [10]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
