## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two study CSV files, relative to the working directory.
mouse_metadata_path = r"Resources/Mouse_metadata.csv"
study_results_path = r"Resources/Study_results.csv"

# Load each CSV into its own DataFrame.
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Attach the per-mouse metadata to the study rows via "Mouse ID".
# The right join keeps every row of study_results.
combined_df = mouse_metadata.merge(study_results, on='Mouse ID', how='right')

# Preview the first rows of the merged table.
combined_df.head()

In [None]:
# Number of distinct "Mouse ID" values in the merged data
# (dropna=False so a missing ID would still be counted, as unique() does).
combined_df["Mouse ID"].nunique(dropna=False)

In [None]:
# Rows whose (Mouse ID, Timepoint) pair repeats an earlier row of combined_df.
duplicate_mice = combined_df.loc[
    combined_df.duplicated(subset=["Mouse ID", "Timepoint"])
]
duplicate_mice.head()

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# A mouse is treated as duplicated when any of its (Mouse ID, Timepoint)
# pairs occurs more than once. Select EVERY row recorded for such mice,
# not only the repeated rows themselves.
duplicate_ids = combined_df.loc[
    combined_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
duplicate_all = combined_df[combined_df["Mouse ID"].isin(duplicate_ids)]
duplicate_all

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# drop_duplicates(subset=['Mouse ID']) would keep only ONE row per mouse and
# throw away nearly every timepoint; instead, remove all rows belonging to
# any mouse that has a repeated (Mouse ID, Timepoint) pair.
duplicated_mouse_ids = combined_df.loc[
    combined_df.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
clean_df = combined_df[~combined_df["Mouse ID"].isin(duplicated_mouse_ids)]
# NOTE(review): the later cells in this notebook still read combined_df, not
# clean_df, so the duplicated mouse remains in those statistics and plots.
clean_df.head()

In [None]:
# Number of distinct "Mouse ID" values remaining in the clean DataFrame
# (dropna=False so a missing ID would still be counted, as unique() does).
clean_df["Mouse ID"].nunique(dropna=False)

## Summary Statistics

In [None]:
# Overall (not per-regimen) descriptive statistics of the tumor volume column:
# mean, median, mode, variance, standard deviation and SEM over all rows of
# combined_df. Each value is printed right after it is computed.
tumor_volumes = combined_df['Tumor Volume (mm3)']

tumor_mean = tumor_volumes.mean()
print(f"The mean Tumor Volume is {tumor_mean}")

tumor_median = tumor_volumes.median()
print(f"The median of Tumor Volume is {tumor_median}")

# mode() returns a Series (there may be several modal values).
tumor_mode = tumor_volumes.mode()
print(f"The mode of Tumor Volume is {tumor_mode}")

tumor_var = tumor_volumes.var()
print(f"The variance of Tumor Volume is {tumor_var}")

tumor_std = tumor_volumes.std()
print(f"The standard deviation of Tumor Volume is {tumor_std}")

tumor_sem = tumor_volumes.sem()
print(f"The standard error mean of Tumor Volume is {tumor_sem}")

In [None]:
# Per-regimen summary of tumor volume: mean, median, variance, standard
# deviation and SEM, each computed as its own Series and then assembled
# into one DataFrame indexed by drug regimen.
regimen_tumor_volume = combined_df.groupby('Drug Regimen')['Tumor Volume (mm3)']

combined_mean = regimen_tumor_volume.mean()
combined_median = regimen_tumor_volume.median()
combined_var = regimen_tumor_volume.var()
combined_std = regimen_tumor_volume.std()
combined_sem = regimen_tumor_volume.sem()

summary_df = pd.DataFrame(
    {
        "Mean": combined_mean,
        "Median": combined_median,
        "Variance": combined_var,
        "Standard Deviation": combined_std,
        "SEM": combined_sem,
    }
)
summary_df.head()

In [None]:
# Generate a summary statistics table of mean, median, variance, standard
# deviation, and SEM of the tumor volume for each regimen.

# Using the aggregation method, produce the same summary statistics in one
# chained expression. The tumor-volume column is selected BEFORE .agg() so
# the numeric aggregations are never applied to the non-numeric columns
# (e.g. "Mouse ID", "Sex"), which raises a TypeError on recent pandas.
summary_agg_df = (
    combined_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .agg(["mean", "median", "var", "std", "sem"])
    .rename(columns={"mean": "Mean", "median": "Median", "var": "Variance",
                     "std": "Standard Deviation", "sem": "SEM"})
)
summary_agg_df

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice
# tested for each drug regimen using Pandas.
# Counting "Mouse ID" per regimen gives the number of recorded rows
# (timepoints) for that regimen.
mice_drug = combined_df.groupby('Drug Regimen')['Mouse ID'].count()

# One colour per bar; this palette is reused by later plotting cells.
my_colors = ["#9A0EEA", "#FF796C", "#7E1E9C", "#580F41", "#C875C4", "#F97306", "#650021", "#C20078", "#C79FEF", "#380282"]

mice_drug.plot(kind="bar", figsize=(10,5), color=my_colors)

plt.title("Total Number of Timepoints for ALL Mice per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Count of Mice")

# tight_layout must run before show(): once the figure has been shown,
# adjusting the layout no longer affects what was rendered.
plt.tight_layout()
plt.show()

In [None]:
# Per-regimen row counts (timepoints), as a plain list for the pyplot bar chart.
timepoints_per_regimen = combined_df.groupby("Drug Regimen")["Mouse ID"].count()
mice_list = timepoints_per_regimen.tolist()
mice_list

In [None]:
# Regimen names, taken from the index of the per-regimen summary table.
drug_regimens = list(summary_df.index)
drug_regimens

In [None]:
# Bar chart (pyplot) of the number of timepoints recorded per drug regimen.
# x positions are 0..n-1, one per entry of mice_list.
x_axis = np.arange(len(mice_list))
fig1, ax1 = plt.subplots(figsize=(10, 5))

plt.bar(x_axis, mice_list, color=my_colors, alpha=0.8, align='center')

# Label each bar position with its regimen name.
tick_locations = list(x_axis)
plt.xticks(tick_locations, drug_regimens)

# Leave a little padding on both sides and above the tallest bar.
plt.xlim(-0.75, len(x_axis)-0.25)
plt.ylim(0, max(mice_list)+10)

plt.title("Total Number of Timepoints for ALL Mice per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Count of Mice")

# tight_layout must run before show(): once the figure has been shown,
# adjusting the layout no longer affects what was rendered.
plt.tight_layout()
plt.show()

In [None]:
# Row counts per sex, with "Sex" turned back into a regular column
# (input table for the Pandas pie plot).
sex_counts = combined_df.groupby("Sex").count()
gender_df = sex_counts.reset_index()
gender_df.head()

In [None]:
# Keep only the label column and one count column for plotting.
gender_df = gender_df.loc[:, ["Sex", "Mouse ID"]]
gender_df.head()

In [None]:
# Plot Pie Chart (Pandas) of the per-sex counts held in gender_df.
# The figure size is passed to .plot() directly: calling plt.figure() after
# plotting would open a second, empty figure instead of resizing the pie.
gender_df.plot(kind='pie', y="Mouse ID", labels=gender_df["Sex"], autopct='%1.1f%%',
               colors=my_colors, shadow=True, startangle=140, figsize=(10, 5))
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis("equal")
plt.show()

In [None]:
# Per-sex row counts as a plain list (groupby sorts the "Sex" keys).
rows_per_sex = combined_df.groupby("Sex")["Mouse ID"].count()
gender_count = rows_per_sex.tolist()
gender_count

In [None]:
# Settings for the pyplot pie chart.
# Wedge labels, in the same order as the per-sex counts.
labels = [
    "Female",
    "Male",
]
# NOTE(review): the pyplot pie cell in this file passes my_colors, so this
# list is currently not referenced there.
colors = [
    "yellow",
    "green",
]
# Offset the first wedge slightly from the centre; leave the second in place.
explode = (0.1, 0)

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot.
# Create the sized figure FIRST; calling plt.figure() after plt.pie() would
# open a new, empty figure and leave the pie at the default size.
plt.figure(figsize=(10, 5))
plt.pie(gender_count, explode=explode, labels=labels, colors=my_colors, autopct="%1.1f%%", shadow=True, startangle=140)
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis("equal")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Restrict the data to the four regimens of interest
# (Capomulin, Ramicane, Infubinol, Ceftamin) as the starting point for
# finding each mouse's final tumor volume.
in_selected_regimens = combined_df['Drug Regimen'].isin(
    ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
)
final_tumor = combined_df[in_selected_regimens]

In [None]:
# Last (greatest) timepoint recorded for each mouse: a Series named
# "Timepoint" indexed by "Mouse ID".
timepoints_by_mouse = final_tumor.groupby('Mouse ID')['Timepoint']
greatest_timepoint = timepoints_by_mouse.max()

In [None]:
# Look up the full study row at each mouse's last timepoint by joining the
# (Mouse ID, last Timepoint) pairs back onto the filtered study data.
merged_df = pd.merge(
    left=greatest_timepoint,
    right=final_tumor,
    how='left',
    on=['Timepoint', 'Mouse ID'],
)
merged_df

In [None]:
# Regimens of interest (also intended as plot labels).
treatment_list = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Placeholder for the per-regimen tumor volume series used for plotting.
tumor_vol_data = []

# Quartiles and IQR of the final tumor volume, computed over ALL rows of
# merged_df together (i.e. pooled across the selected regimens).
quartiles = merged_df['Tumor Volume (mm3)'].quantile([0.25,0.5,0.75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq

print(f"Lower Quartile: {lowerq}.")
print(f"Upper Quartile: {upperq}.")
print(f"IQR: {iqr}.")

In [None]:
# Locate the rows which contain mice on each drug and get the tumor volumes
capomulin_vol = merged_df.loc[merged_df['Drug Regimen'] == 'Capomulin', 'Tumor Volume (mm3)']
ramicane_vol = merged_df.loc[merged_df['Drug Regimen'] == 'Ramicane', 'Tumor Volume (mm3)']
infubinol_vol = merged_df.loc[merged_df['Drug Regimen'] == 'Infubinol', 'Tumor Volume (mm3)']
ceftamin_vol = merged_df.loc[merged_df['Drug Regimen'] == 'Ceftamin', 'Tumor Volume (mm3)']

# Per-regimen series, in the order used for the box plot labels.
tumor_vol_data = [capomulin_vol, ramicane_vol, infubinol_vol, ceftamin_vol]

# Bounds from the pooled quartiles (lowerq / upperq / iqr computed over all
# four regimens together); kept so these names remain available.
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Outliers using lower bounds: {lower_bound}.")
print(f"Outliers using upper bounds: {upper_bound}.")

# Potential outliers must be judged against each regimen's own distribution:
# apply the 1.5*IQR rule per regimen and list the values falling outside.
for regimen_name, regimen_volumes in zip(
    ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin'], tumor_vol_data
):
    regimen_q1 = regimen_volumes.quantile(0.25)
    regimen_q3 = regimen_volumes.quantile(0.75)
    regimen_iqr = regimen_q3 - regimen_q1
    regimen_low = regimen_q1 - (1.5*regimen_iqr)
    regimen_high = regimen_q3 + (1.5*regimen_iqr)
    regimen_outliers = regimen_volumes[
        (regimen_volumes < regimen_low) | (regimen_volumes > regimen_high)
    ]
    print(f"{regimen_name}: IQR {regimen_iqr}, bounds [{regimen_low}, {regimen_high}], "
          f"potential outliers: {regimen_outliers.tolist()}")

In [None]:
# Box plot of the final tumor volume per mouse, one box per regimen of interest.
labels=["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Draw outlier points (fliers) as red diamonds.
diamond = dict(markerfacecolor='r', marker='D')

fig1, ax1 = plt.subplots()
ax1.set(
    title='Final Tumor Volume of Each Mouse Across Four Regimens of Interest',
    xlabel='Drug Regimen',
    ylabel='Tumor Volume',
)
ax1.boxplot(tumor_vol_data, flierprops=diamond, labels=labels)

plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against timepoint for a single mouse (ID 'l509').
l509_rows = combined_df.loc[combined_df['Mouse ID'] == 'l509']
cap_vol = l509_rows['Tumor Volume (mm3)']
cap_time = l509_rows['Timepoint']

plt.plot(cap_time, cap_vol, color='c')
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume')

plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of each Capomulin mouse's average tumor volume against its weight.
cap_df = combined_df.loc[combined_df['Drug Regimen'] == 'Capomulin']

# Only the columns needed for the per-mouse averages.
data_df = cap_df[["Mouse ID", "Weight (g)", "Tumor Volume (mm3)"]]

# Per-mouse means of tumor volume and of weight, both indexed by "Mouse ID".
per_mouse = data_df.groupby("Mouse ID")
avg_tumor_vol = per_mouse["Tumor Volume (mm3)"].mean()
mouse_weight = per_mouse["Weight (g)"].mean()

plt.scatter(mouse_weight, avg_tumor_vol, marker="o", facecolors="red", edgecolors="black", alpha=0.75)
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.tight_layout()
plt.show()

## Correlation and Regression

In [None]:
# Pearson correlation between mouse weight and average tumor volume for the
# Capomulin regimen, rounded to two decimals (element 0 of the result is r).
pearson_result = st.pearsonr(mouse_weight, avg_tumor_vol)
corr = round(pearson_result[0], 2)
print(f"The correlation coefficient for mouse weight and average tumor volume for the Capomulin regimen is {corr}.")

In [None]:
# Perform a linear regression model on average tumor volume vs. mouse weight for the Capomulin regimen
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(mouse_weight, avg_tumor_vol)

# Predicted tumor volume for each weight: y = slope * x + intercept.
# (Adding the slope instead of multiplying by it would just shift the weights
# by a constant and not represent the fitted line.)
regress_values = mouse_weight * slope + intercept

# Plot the observations together with the fitted regression line.
plt.scatter(mouse_weight, avg_tumor_vol)
plt.plot(mouse_weight, regress_values, marker="x", color="r")
plt.xticks(mouse_weight, rotation = 45)
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

plt.tight_layout()
plt.show()