# Pymaceuticals Inc.

In [None]:
# Dependencies and Setup
# Dependencies
from matplotlib import pyplot as plt
from scipy.stats import linregress
import scipy.stats as st
import numpy as np
from sklearn import datasets
import pandas as pd

# Locations of the two CSV inputs used by the study
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata first, then the per-timepoint study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables on the shared 'Mouse ID' column; study_results is the
# left-hand table, so its columns come first in the merged frame
Pymaceuticals_df = study_results.merge(mouse_metadata, on="Mouse ID")

# Preview the first rows of the merged table
Pymaceuticals_df.head()

In [None]:
# Count how many distinct mouse IDs appear in the merged data.
# dropna=False keeps a missing ID (if any) in the tally, exactly as len(unique()) would.
No_Mice = Pymaceuticals_df['Mouse ID'].nunique(dropna=False)
print(f" Total number of Unique Mice: {No_Mice}")

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair has already appeared earlier in the
# table, then keep only those repeated rows.
is_repeated_row = Pymaceuticals_df.duplicated(subset=["Mouse ID", "Timepoint"])
Duplicates_df = Pymaceuticals_df.loc[is_repeated_row]

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Duplicates_df only holds the repeated (Mouse ID, Timepoint) rows, so look up
# every row in the merged table that belongs to one of those mice.
duplicate_mouse_ids = Duplicates_df["Mouse ID"].unique()
Pymaceuticals_df[Pymaceuticals_df["Mouse ID"].isin(duplicate_mouse_ids)]

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# The IDs are taken from Duplicates_df instead of being typed in by hand, so every
# mouse with a repeated (Mouse ID, Timepoint) pair is removed, whatever its ID is.
duplicate_mouse_ids = Duplicates_df["Mouse ID"].unique()
is_duplicate_mouse = Pymaceuticals_df["Mouse ID"].isin(duplicate_mouse_ids)
# .copy() gives an independent frame so later edits never touch Pymaceuticals_df
clean_Pymaceuticals_df = Pymaceuticals_df[~is_duplicate_mouse].copy()

In [None]:
# Re-count the distinct mouse IDs, this time on the cleaned table.
# dropna=False mirrors len(unique()), which would also count a missing ID.
clean_mouse_ids = clean_Pymaceuticals_df['Mouse ID']
No_Mice = clean_mouse_ids.nunique(dropna=False)
print(f" Total number of Unique Mice: {No_Mice}")

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward: compute one Series per statistic and
# assemble them into a single table at the end.
# The cleaned data is used so the duplicated mouse does not skew the statistics.
grouped_Pymaceuticals_df = clean_Pymaceuticals_df.groupby('Drug Regimen')
tumor_volume_by_regimen = grouped_Pymaceuticals_df['Tumor Volume (mm3)']

# Mean of the tumor volume for each regimen
tumorvol_mean = tumor_volume_by_regimen.mean()
# Median of the tumor volume for each regimen
tumorvol_median = tumor_volume_by_regimen.median()
# Variance of the tumor volume for each regimen
tumorvol_var = tumor_volume_by_regimen.var()
# Std DEV of the tumor volume for each regimen
tumorvol_stdev = tumor_volume_by_regimen.std()
# SEM of the tumor volume for each regimen
tumorvol_sem = tumor_volume_by_regimen.sem()

# All five Series share the same 'Drug Regimen' index, so they can be placed side by
# side directly. Chaining pd.merge on Series that all carry the same name generates
# clashing '_x'/'_y' suffix columns, which newer pandas rejects with a MergeError.
Summary3 = pd.DataFrame(
    {
        "Mean Tumor Volume": tumorvol_mean,
        "Median Tumor Volume": tumorvol_median,
        "Tumor Volume Variance": tumorvol_var,
        "Tumor Volume Std. Dev.": tumorvol_stdev,
        "Tumor Volume Std. Err.": tumorvol_sem,
    }
).reset_index()  # turn the 'Drug Regimen' index back into a regular column
Summary3

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby/agg call.
# The cleaned data is used so the duplicated mouse does not skew the statistics;
# agg already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
Pymaceuticals_summary = clean_Pymaceuticals_df.groupby('Drug Regimen').agg(
    {"Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]}
)
Pymaceuticals_summary

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.
# NOTE(review): the bars count rows (one per recorded observation) for each regimen,
# while the title says "Mice" - confirm which quantity is intended.

# Count the rows per regimen on the cleaned data (duplicate mouse excluded)
Pymaceuticals_summary = (
    clean_Pymaceuticals_df.groupby("Drug Regimen")
    .size()
    .reset_index(name="Data_Points")
)

# Largest regimen first; the regimen names become the index so they label the bars
Sorted_summary = (
    Pymaceuticals_summary
    .sort_values(by="Data_Points", ascending=False)
    .set_index("Drug Regimen")
)

# Use DataFrame.plot() in order to create a bar chart of the data
Sorted_summary.plot(kind="bar", legend=False)

# Set a title and axis labels for the chart
plt.title("# Mice per Treatment")
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Data Points')

plt.tight_layout()
plt.show()

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
# NOTE(review): the bars count rows (one per recorded observation) for each regimen,
# while the title says "Mice" - confirm which quantity is intended.

# Count the rows per regimen on the cleaned data (duplicate mouse excluded)
Pymaceuticals_summary = (
    clean_Pymaceuticals_df.groupby("Drug Regimen")
    .size()
    .reset_index(name="Data_Points")
)

# Largest regimen first; drop=True avoids carrying the old row labels along as a column
Sorted_summary = (
    Pymaceuticals_summary
    .sort_values(by="Data_Points", ascending=False)
    .reset_index(drop=True)
)

# One bar position per regimen, with a tick at every bar
x_axis = np.arange(len(Sorted_summary["Drug Regimen"]))
tick_locations = list(x_axis)

plt.bar(x_axis, Sorted_summary["Data_Points"], color='skyblue', alpha=1, align="center")

# Label each bar with its regimen name
plt.xticks(tick_locations, Sorted_summary["Drug Regimen"], rotation="vertical")

plt.title("# Mice per Treatment")
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Data Points')

plt.tight_layout()
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Each row of the cleaned table is one observation, so the slices show the share of
# observations recorded for each sex.
sex_counts = clean_Pymaceuticals_df["Sex"].value_counts()

# Divide by the total number of counted rows. (DataFrame.value_counts() would count
# distinct rows instead, which is not the right denominator.) The values are left
# unrounded because autopct already formats them to one decimal place.
perc = sex_counts / sex_counts.sum() * 100

perc.plot(kind="pie", autopct="%1.1f%%")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Each row of the cleaned table is one observation, so the slices show the share of
# observations recorded for each sex.
sex_counts = clean_Pymaceuticals_df["Sex"].value_counts()

# Divide by the total number of counted rows. (DataFrame.value_counts() would count
# distinct rows instead, which is not the right denominator.) The values are left
# unrounded because autopct already formats them to one decimal place.
perc = sex_counts / sex_counts.sum() * 100

# Labels for the sections of our pie chart, taken from the data itself so that each
# label is guaranteed to sit on the slice it describes
labels = list(perc.index)

# The values of each section of the pie chart
values = perc

# The colors of each section of the pie chart
colors = ["lightblue", "darkorange"]

# Pull the first (largest) slice slightly away from the rest; sized from the data
# because matplotlib requires one explode entry per slice
explode = [0.1 if position == 0 else 0 for position in range(len(values))]

# Creates the pie chart based upon the values above
# Automatically finds the percentages of each part of the pie chart
plt.pie(values, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=5)
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
regimens_of_interest = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
Select_regimens = clean_Pymaceuticals_df[
    clean_Pymaceuticals_df['Drug Regimen'].isin(regimens_of_interest)
]

# Start by getting the last (greatest) timepoint for each mouse.
# Grouping must be per mouse: grouping per regimen would return a single timepoint
# for the whole regimen rather than each animal's own final measurement.
Last_timepoint = Select_regimens.groupby("Mouse ID")["Timepoint"].max().reset_index()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint.
# Matching on both Mouse ID and Timepoint keeps exactly the final row of every mouse,
# together with its 'Drug Regimen' and 'Tumor Volume (mm3)' values.
Group_df = pd.merge(Last_timepoint, Select_regimens, on=["Mouse ID", "Timepoint"], how="left")
Group_df

In [None]:
# Put treatments into a list for for loop (and later for plot labels)
Treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Final tumor volume of every mouse: find each mouse's last timepoint in the cleaned
# data, then pull the row recorded at that timepoint.
final_timepoints = clean_Pymaceuticals_df.groupby("Mouse ID")["Timepoint"].max().reset_index()
final_volumes = pd.merge(
    final_timepoints, clean_Pymaceuticals_df, on=["Mouse ID", "Timepoint"], how="left"
)

# Create empty list to fill with tumor vol data (for plotting)
Tumor_Vol = []

# (lower bound, upper bound) of the outlier fences, keyed by regimen name
Outlier_bounds = {}

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for treatment in Treatments:
    # Final tumor volumes of the mice that received this treatment
    treatment_volumes = final_volumes.loc[
        final_volumes["Drug Regimen"] == treatment, "Tumor Volume (mm3)"
    ]
    Tumor_Vol.append(treatment_volumes)

    quartiles = treatment_volumes.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Standard 1.5 * IQR fences; anything outside them is a potential outlier
    lower_fence = lowerq - (1.5 * iqr)
    upper_fence = upperq + (1.5 * iqr)
    lower_bound = round(lower_fence, 2)
    upper_bound = round(upper_fence, 2)
    Outlier_bounds[treatment] = (lower_bound, upper_bound)

    # Compare against the unrounded fences so rounding cannot flip a borderline value
    outliers = treatment_volumes[
        (treatment_volumes < lower_fence) | (treatment_volumes > upper_fence)
    ]

    print(f"Values below {lower_bound} and above {upper_bound} could be outliers for {treatment} Regimen.")
    print(f"  Potential outliers found for {treatment}: {outliers.round(2).tolist()}")

# Per-regimen series under their individual names, in the same order as Treatments
Tumorvol_Capomulin, Tumorvol_Ramicane, Tumorvol_Infubinol, Tumorvol_Ceftamin = Tumor_Vol

In [None]:
# Generate a box plot of the tumor volume series of the four regimens of interest.
# The boxes show whatever the Tumorvol_* series hold, one box per regimen.
Capomulin = Tumorvol_Capomulin
Ramicane = Tumorvol_Ramicane
Infubinol = Tumorvol_Infubinol
Ceftamin = Tumorvol_Ceftamin

# Names listed in the same order as the data passed to boxplot
box_labels = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
plt.boxplot([Capomulin, Ramicane, Infubinol, Ceftamin])

# Boxes are drawn at positions 1..N by default; put the regimen name under each one
plt.xticks(range(1, len(box_labels) + 1), box_labels)
plt.xlabel('Drug Regimen')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
Pymaceuticals_capomulin = clean_Pymaceuticals_df.loc[
    clean_Pymaceuticals_df['Drug Regimen'] == 'Capomulin'
]

# Build the mouse filter from the same frame it is applied to, so the boolean mask
# and the rows it selects always line up; sort by timepoint so the line is drawn in
# chronological order regardless of row order in the source data.
Pymaceuticals_capomulinv2 = Pymaceuticals_capomulin.loc[
    Pymaceuticals_capomulin['Mouse ID'] == 'l509'
].sort_values('Timepoint')

x_axis = Pymaceuticals_capomulinv2['Timepoint']
y_axis = Pymaceuticals_capomulinv2['Tumor Volume (mm3)']
plt.plot(x_axis, y_axis)
plt.title('Capomulin treatment of mouse l509')
plt.xlabel('Timepoint(days)')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
Pymaceuticals_capomulin = clean_Pymaceuticals_df.loc[
    clean_Pymaceuticals_df['Drug Regimen'] == 'Capomulin'
]

# Average the tumor volume per weight using ONLY the Capomulin rows; averaging over
# the full table would mix every regimen into a plot that is about Capomulin.
# NOTE(review): points are one per distinct weight, not one per mouse - confirm that
# this is the intended granularity.
Pymaceuticals_plt = (
    Pymaceuticals_capomulin.groupby("Weight (g)")["Tumor Volume (mm3)"]
    .mean()
    .reset_index(name="Av Tumor Volume (mm3)")
)

x_values = Pymaceuticals_plt['Weight (g)']
y_values = Pymaceuticals_plt['Av Tumor Volume (mm3)']
plt.scatter(x_values, y_values)
plt.xlabel('Weight(g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()

## Correlation and Regression

In [None]:
# Fit a straight line to the weight / average-tumor-volume table built for the scatter
# plot, report the Pearson correlation, and draw the fit on top of the points.

x_values = Pymaceuticals_plt['Weight (g)']
y_values = Pymaceuticals_plt['Av Tumor Volume (mm3)']

# Least-squares fit of y on x; the first five result fields are unpacked by position
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)

# Fitted y value for every observed x
regress_values = slope * x_values + intercept

# Human-readable equation of the fitted line, coefficients rounded to two decimals
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

# Observed points first, then the fitted line in red with its equation as annotation
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(
    line_eq,
    xy=(0.3, 0.18),
    xycoords='figure fraction',
    horizontalalignment='left',
    fontsize=12,
    color="red",
)
plt.xlabel('Mouse Weight(g)')
plt.ylabel('Average Tumor Volume (mm3)')

# pearsonr returns the coefficient first; only that value is reported
correlation = st.pearsonr(x_values, y_values)
pearson_coefficient = correlation[0]
print(f"The correlation between mouse weight and the average tumor volume is {round(pearson_coefficient,2)}")
plt.show()