# Pymaceuticals Inc.
---

### Analysis


In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
import math

# Locations of the two study CSV inputs (relative paths)
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "Resources/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the mouse metadata onto the study results by "Mouse ID": every
# study-result row is kept and gains the metadata columns of its mouse
mouse_data_complete = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Preview the first rows of the combined table
mouse_data_complete.head()

In [None]:
# Number of distinct Mouse ID values in the combined data.
# dropna=False counts a missing ID (if any) as one value, like unique() does.
mice_count = mouse_data_complete["Mouse ID"].nunique(dropna=False)
mice_count

In [None]:
# Each row should be uniquely identified by its (Mouse ID, Timepoint) pair.
# Keep only the rows whose pair already appeared earlier in the table; the
# first occurrence of each pair is not flagged.
duplicate_mice = mouse_data_complete.loc[
    mouse_data_complete.duplicated(subset=["Mouse ID", "Timepoint"])
]
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# The IDs are derived from the data (any mouse with a repeated
# Mouse ID/Timepoint pair) rather than hard-coded, so this cell stays correct
# if the input files change.
duplicate_ids = mouse_data_complete.loc[
    mouse_data_complete.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()

# Every row (duplicated or not) belonging to an affected mouse
duplicate_mouse_df = mouse_data_complete[mouse_data_complete["Mouse ID"].isin(duplicate_ids)]
duplicate_mouse_df

In [None]:
# Create a clean DataFrame by dropping every mouse that has a repeated
# Mouse ID/Timepoint pair. The affected IDs are looked up in the data instead
# of being hard-coded; all rows of such a mouse are removed, not just the
# repeated ones.
ids_to_drop = mouse_data_complete.loc[
    mouse_data_complete.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()

# Re-running this cell is harmless: once cleaned, no duplicates remain and
# nothing further is removed.
mouse_data_complete = mouse_data_complete[~mouse_data_complete["Mouse ID"].isin(ids_to_drop)]
mouse_data_complete.head()

In [None]:
# Number of distinct Mouse ID values remaining in the clean DataFrame.
# dropna=False counts a missing ID (if any) as one value, like unique() does.
mice_count_cleaned = mouse_data_complete["Mouse ID"].nunique(dropna=False)
mice_count_cleaned

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation, and standard error of the mean.

# Group once and reuse the grouped tumor-volume column for every statistic.
tumor_by_regimen = mouse_data_complete.groupby(["Drug Regimen"])["Tumor Volume (mm3)"]

mean_tumor_volume = tumor_by_regimen.mean()
median_tumor_volume = tumor_by_regimen.median()
tumor_volume_variance = tumor_by_regimen.var()
tumor_volume_stdev = tumor_by_regimen.std()
tumor_volume_stderr = tumor_by_regimen.sem()

# Assemble the five series (all indexed by drug regimen) into one table,
# one column per statistic.
summary_columns = {
    "Mean Tumor Volume": mean_tumor_volume,
    "Median Tumor Volume": median_tumor_volume,
    "Tumor Volume Variance": tumor_volume_variance,
    "Tumor Volume Std. Dev.": tumor_volume_stdev,
    "Tumor Volume Std. Err.": tumor_volume_stderr,
}
drug_statistics_df = pd.DataFrame(summary_columns)
drug_statistics_df

In [None]:
# Alternative way to build the same per-regimen summary (mean, median,
# variance, standard deviation, SEM of tumor volume) with a single
# aggregation call; only one of the two methods is required.

# Statistics to compute, named as pandas aggregation shortcuts
summary_functions = ["mean", "median", "var", "std", "sem"]

# The result has two-level columns: ("Tumor Volume (mm3)", <statistic>)
drug_statistics_df = mouse_data_complete.groupby("Drug Regimen").agg(
    {"Tumor Volume (mm3)": summary_functions}
)
drug_statistics_df

## Bar and Pie Charts

In [None]:
# Bar plot, drawn through the pandas plotting API, of the number of rows
# (one per Mouse ID/Timepoint observation) for each drug regimen,
# ordered from the most to the fewest observations.
mouse_timepoints = (
    mouse_data_complete.groupby(["Drug Regimen"])["Timepoint"]
    .count()
    .sort_values(ascending=False)
)
timepoints_chart = mouse_timepoints.plot.bar()
timepoints_chart.set(xlabel="Drug Regimen", ylabel="# of Observed Mouse Timepoints")
plt.show()

In [None]:
# Same bar plot as the pandas version, drawn directly with pyplot:
# number of rows (Mouse ID/Timepoint observations) per drug regimen.
# value_counts() already returns the regimens ordered by descending count.
timepoints = mouse_data_complete["Drug Regimen"].value_counts()
plt.bar(x=timepoints.index, height=timepoints.values)
plt.ylabel("# of Observed Mouse Timepoints")
plt.xlabel("Drug Regimen")
# Regimen names overlap when horizontal, so draw the tick labels vertically
plt.xticks(rotation="vertical")
plt.show()

In [None]:
# Pie plot of the female/male split, drawn through the pandas plotting API.
# The counts are per row of the cleaned data (one row per observation);
# wedge labels come from the value_counts() index.
male_female = mouse_data_complete["Sex"].value_counts()
male_female.plot.pie(autopct="%1.1f%%", shadow=True)
plt.show()

In [None]:
# Pie plot of the female/male split, drawn directly with pyplot.
# The counts are per row of the cleaned data (one row per observation).
male_female = mouse_data_complete["Sex"].value_counts()

# Take the labels from the counts themselves: value_counts() orders its index
# by frequency, so a fixed ["Male", "Female"] list would mislabel the wedges
# whenever "Female" is the larger group.
labels = male_female.index
sizes = male_female
plt.pie(sizes, labels=labels, autopct="%1.1f%%", shadow=True)
plt.ylabel("Sex")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse.
# Taking the per-mouse maximum (instead of filtering on one fixed timepoint)
# keeps mice whose last recorded observation is earlier than the others'.
greatest_timepoint = (
    mouse_data_complete.groupby("Mouse ID")["Timepoint"].max().reset_index()
)

# Merge this group df with the original DataFrame to get the tumor volume at
# the last timepoint. Joining explicitly on (Mouse ID, Timepoint) yields one
# row per mouse, carrying all of that observation's columns.
greatest_timepoint_merged = pd.merge(
    greatest_timepoint,
    mouse_data_complete,
    on=["Mouse ID", "Timepoint"],
    how="left",
)
greatest_timepoint_merged

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
drug = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting);
# one Series of final tumor volumes per treatment, in the order of `drug`
tumor_vol_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
# The loop variable has its own name so the `drug` list is not overwritten.
for treatment in drug:
    # Final tumor volumes of the mice on this treatment
    final_vol = greatest_timepoint_merged.loc[
        greatest_timepoint_merged["Drug Regimen"] == treatment, "Tumor Volume (mm3)"
    ]
    quartiles = final_vol.quantile([.25, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # add subset
    tumor_vol_data.append(final_vol)

    # Determine outliers using upper and lower bounds (1.5 * IQR beyond the quartiles)
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    outliers = final_vol[(final_vol < lower_bound) | (final_vol > upper_bound)]
    print(f"Values below {lower_bound} are outliers for {treatment}.")
    print(f"Values above {upper_bound} are outliers for {treatment}.")
    # Report the values that actually fall outside the bounds (may be empty)
    print(f"{treatment} potential outliers: {outliers.tolist()}")

In [None]:
# Box plot showing the distribution of the final tumor volume for each
# treatment group; tumor_vol_data holds one series per treatment.
treatment_labels = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Boxes are drawn at x = 1, 2, ..., so label those positions in order
tick_positions = list(range(1, len(treatment_labels) + 1))

plt.boxplot(tumor_vol_data)
plt.xticks(tick_positions, treatment_labels)
plt.ylabel("Final Tumor Volume (mm3)")
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
# NOTE(review): the mouse is selected by ID only; nothing here checks that it
# is on the Capomulin regimen — confirm against the data.
mouse_id = "l509"

# All observations of the chosen mouse, reduced to the two plotted columns
single_capomulin = mouse_data_complete.loc[mouse_data_complete["Mouse ID"] == mouse_id]
single_capomulin_df = single_capomulin[["Timepoint", "Tumor Volume (mm3)"]]

single_capomulin_df.plot(x="Timepoint", y="Tumor Volume (mm3)", kind="line")
plt.xlabel("Timepoint (Days)")
plt.ylabel("Tumor Volume (mm3)")
# Build the title from the same ID used for the filter so they cannot drift apart
plt.title(f"Capomulin Treatment of Mouse {mouse_id}")
# Render the figure explicitly, as the other plotting cells do
plt.show()

In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen
capomulin = mouse_data_complete[mouse_data_complete["Drug Regimen"] == "Capomulin"]
capomulin_by_mouse = capomulin.groupby("Mouse ID")

# Per-mouse average tumor volume over all of that mouse's observations
capomulin_volume = capomulin_by_mouse["Tumor Volume (mm3)"].mean()

# Per-mouse weight as a plain float. first() takes the first recorded weight
# of each mouse, which avoids converting one-element arrays from unique().
# NOTE(review): assumes a mouse's weight is the same on all of its rows — confirm.
capomulin_weight = capomulin_by_mouse["Weight (g)"].first().astype(float)

# One row per mouse (indexed by Mouse ID) with both quantities side by side
capomulin_df = pd.concat([capomulin_volume, capomulin_weight], axis=1)

plt.scatter(capomulin_df["Weight (g)"],
            capomulin_df["Tumor Volume (mm3)"])
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model for mouse weight and average observed tumor volume for the entire Capomulin regimen
capomulin = mouse_data_complete[mouse_data_complete["Drug Regimen"] == "Capomulin"]
capomulin_by_mouse = capomulin.groupby("Mouse ID")

# Per-mouse average tumor volume over all of that mouse's observations
capomulin_volume = capomulin_by_mouse["Tumor Volume (mm3)"].mean()

# Per-mouse weight as a plain float. first() takes the first recorded weight
# of each mouse, which avoids converting one-element arrays from unique().
# NOTE(review): assumes a mouse's weight is the same on all of its rows — confirm.
capomulin_weight = capomulin_by_mouse["Weight (g)"].first().astype(float)

# One row per mouse (indexed by Mouse ID) with both quantities side by side
capomulin_df = pd.concat([capomulin_volume, capomulin_weight], axis=1)

x_values = capomulin_df["Weight (g)"]
y_values = capomulin_df["Tumor Volume (mm3)"]

# Least-squares fit of average tumor volume on weight; rvalue is the Pearson
# correlation coefficient of the two series, so it is reused for the report
# below instead of being computed a second time.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Observed points plus the fitted line in red
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
print(f"The correlation between mouse weight and average tumor volume is {round(rvalue, 2)}")
plt.show()