## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Paths to the two CSV inputs used throughout the notebook
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on their shared "Mouse ID" column
# (merge defaults to an inner join, so only IDs present in both files survive)
mouse_study_merged = mouse_metadata.merge(study_results, on="Mouse ID")

# Leave the merged table as the cell's output for a quick preview
mouse_study_merged


In [None]:
# Checking the number of mice.
# value_counts() returns one entry per distinct Mouse ID (the value is how many
# rows that mouse has), so the number of entries is the number of mice.
count = mouse_study_merged["Mouse ID"].value_counts()

# Show the actual mouse count rather than the per-mouse frequency table
len(count)

In [None]:
# Getting the duplicate mice: rows whose Mouse ID and Timepoint pair shows up more than once.

# Extract every row involved in a (Mouse ID, Timepoint) collision.
# keep=False flags all members of each duplicate group; keep='last' would leave
# the final occurrence of each pair out of the listing.
duplicated = mouse_study_merged[
    mouse_study_merged.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
]

duplicated


In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.

# A mouse with more than one row for the same Timepoint has conflicting
# measurements, and there is no way to tell which row is correct. Dropping only
# the repeated rows would keep that mouse in the study with arbitrarily chosen
# values, so every row belonging to such a Mouse ID is removed instead.
duplicate_mouse_ids = mouse_study_merged.loc[
    mouse_study_merged.duplicated(subset=["Mouse ID", "Timepoint"], keep=False),
    "Mouse ID",
].unique()

cleaned_mouse_study = mouse_study_merged[
    ~mouse_study_merged["Mouse ID"].isin(duplicate_mouse_ids)
]

cleaned_mouse_study

In [None]:
# Checking the number of mice in the clean DataFrame.
# value_counts() returns one entry per distinct Mouse ID (the value is how many
# rows that mouse has), so the number of entries is the number of mice.
count = cleaned_mouse_study["Mouse ID"].value_counts()

# Show the actual mouse count rather than the per-mouse frequency table
len(count)

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume (mean, median, variance, standard
# deviation, SEM) are built per drug regimen in the cells that follow.

# Group the cleaned study rows by drug regimen; the later cells reuse this grouping
grouped_study_df = cleaned_mouse_study.groupby(by=["Drug Regimen"])
print(grouped_study_df)

# Non-null entry counts per column for each regimen (first 20 regimens at most)
regimen_entry_counts = grouped_study_df.count()
regimen_entry_counts.head(20)



In [None]:
# Mean tumor volume for each drug regimen.
# Selecting with a list of columns already yields a DataFrame, so the grouped
# mean comes back as a one-column table indexed by regimen.
tumor_volume_by_regimen = grouped_study_df[["Tumor Volume (mm3)"]]
mean_tumor_by_regimen_df = tumor_volume_by_regimen.mean()
mean_tumor_by_regimen_df


In [None]:
# Median tumor volume for each drug regimen (one-column table indexed by regimen)
median_tumor_by_regimen_df = grouped_study_df[["Tumor Volume (mm3)"]].median()

# Start the summary table by joining the mean and median tables on the regimen.
# Both inputs carry a "Tumor Volume (mm3)" column, so the merge disambiguates
# them with the default "_x" (left/mean) and "_y" (right/median) suffixes.
summary_df = mean_tumor_by_regimen_df.merge(
    median_tumor_by_regimen_df, on='Drug Regimen', how='inner'
)

# Replace the suffixed column names with labels describing each statistic
summary_df = summary_df.rename(
    columns={
        'Tumor Volume (mm3)_x': 'Mean of tumor volumes',
        'Tumor Volume (mm3)_y': 'Median',
    }
)
summary_df

In [None]:
# Variance of the tumor volume for each drug regimen
var_tumor_by_regimen_df = grouped_study_df[["Tumor Volume (mm3)"]].var()

# Append the variance to the summary table, then give the new column a
# descriptive label (it arrives under the raw "Tumor Volume (mm3)" name)
summary_df = summary_df.merge(
    var_tumor_by_regimen_df, on='Drug Regimen', how='inner'
).rename(columns={'Tumor Volume (mm3)': 'Variance'})

summary_df

In [None]:
# Standard deviation of the tumor volume for each drug regimen
std_tumor_by_regimen_df = grouped_study_df[["Tumor Volume (mm3)"]].std()

# Append the standard deviation to the summary table and label the new column
summary_df = summary_df.merge(
    std_tumor_by_regimen_df, on='Drug Regimen'
).rename(columns={'Tumor Volume (mm3)': 'Standard Deviation'})
summary_df

In [None]:
# Standard error of the mean (SEM) of the tumor volume for each drug regimen
sem_tumor_by_regimen_df = grouped_study_df[["Tumor Volume (mm3)"]].sem()

# Append the SEM to the summary table and label the new column.
# NOTE(review): the label mentions "Vs Standard Deviation" although the column
# holds only the SEM values - confirm whether a plain SEM label was intended.
summary_df = summary_df.merge(
    sem_tumor_by_regimen_df, on='Drug Regimen'
).rename(
    columns={'Tumor Volume (mm3)': 'Standard Error of Mean Vs Standard Deviation'}
)
summary_df

In [None]:
  # Using the aggregation method, produce the same summary statistics in a single line

# Statistics to compute on the tumor volume, in the order the columns should appear
tumor_volume_stats = ["mean", "median", "var", "std", "sem"]
summary2_df = cleaned_mouse_study.groupby("Drug Regimen").agg(
    {"Tumor Volume (mm3)": tumor_volume_stats}
)

# Swap the raw aggregation names for presentation-friendly column labels
stat_labels = {
    'mean': 'Mean',
    'median': 'Median',
    'var': 'Variance',
    'std': 'Standard Deviation',
    'sem': 'SEM',
}
summary2_df = summary2_df.rename(columns=stat_labels)
summary2_df

## Bar and Pie Charts

In [None]:
# Build the data for the bar plot: the total number of timepoints recorded
# for all mice tested under each drug regimen.
# count() on the grouped Timepoint column gives exactly one total per regimen.
# (value_counts() on the ['Timepoint', 'Mouse ID'] pair would instead tally each
# regimen/timepoint/mouse combination separately, yielding one row per
# combination rather than one total per regimen.)
Counts_df = (
    grouped_study_df["Timepoint"]
    .count()
    .reset_index(name='Total_timepoints_mice')
)
Counts_df

In [None]:
# One bar position per row of Counts_df; the tick marks reuse the same positions
x_axis = np.arange(len(Counts_df))
tick_locations = list(x_axis)

# Use a wide figure so the vertically rotated regimen labels have room
plt.figure(figsize=(10, 6))
plt.bar(
    x_axis,
    Counts_df["Total_timepoints_mice"],
    color='r',
    alpha=0.5,
    align="center",
)

# Label each bar position with its drug regimen name
plt.xticks(tick_locations, Counts_df["Drug Regimen"], rotation="vertical")

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list to iterate over in the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
