In [None]:
# Pymaceuticals Inc.
---

### Analysis

- This study was conducted with a close to 1:1 ratio of male to female mice. It is worth highlighting that there were slightly more observations for Capomulin and Ramicane than for the other treatments.
- Capomulin and Ramicane both showed lower tumor volumes at the end of the observation period than the other treatments. Between the two, there does not seem to be a statistically significant difference, but further analysis should be done to confirm this.
- Looking at the Capomulin treatment cohort, it is possible to see a linear relationship between tumor size and time of observation, and a strong correlation between the weight of the mouse and the average tumor size over the observation period.


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress

# Study data files (paths are relative to the notebook's working directory).
mouse_metadata_path = "Data/Mouse_metadata.csv"
study_results_path = "Data/Study_results.csv"

# Read the mouse data and the study results.
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame.
# The join key is "Mouse ID"; naming it once is sufficient. With how="left" and
# the metadata on the left, every metadata row is retained in the result.
mouse_study_df = pd.merge(mouse_metadata, study_results, how="left", on="Mouse ID")

# Display the data table for preview. Only the last expression of a cell is
# rendered, so this is the single preview kept in this cell.
mouse_study_df.head()

In [None]:
# Report how many distinct Mouse ID values the merged data contains.
total_mice = mouse_study_df["Mouse ID"].nunique()
print(total_mice)

In [None]:
# Find every row whose (Mouse ID, Timepoint) pair occurs more than once.
# keep=False flags all members of a duplicated group, not only the repeats.
is_repeated_pair = mouse_study_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicates_find = mouse_study_df[is_repeated_pair]
print(duplicates_find)

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
def filter_rows_by_values(df, column, values):
    """Return the rows of ``df`` whose ``column`` entry is not in ``values``.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Label of the column to test.
        values: Collection of values whose rows should be excluded.

    Returns:
        A DataFrame holding only the rows that were not excluded, with their
        original index labels.
    """
    is_excluded = df[column].isin(values)
    keep_mask = ~is_excluded
    return df.loc[keep_mask]

# Derive the IDs to drop from the data instead of hard-coding them: any mouse
# with more than one row for the same timepoint is treated as a duplicate.
duplicate_rows = mouse_study_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_mouse_ids = mouse_study_df.loc[duplicate_rows, "Mouse ID"].unique()

# Remove every row belonging to those mice, not just the repeated rows.
mouse_study_clean = filter_rows_by_values(mouse_study_df, "Mouse ID", duplicate_mouse_ids)
mouse_study_clean

In [None]:
# Report how many distinct Mouse ID values remain after cleaning.
clean_mice = mouse_study_clean["Mouse ID"].nunique()
print(clean_mice)


In [None]:

## Summary Statistics

In [None]:
# Per-regimen summary of tumor volume: mean, median, variance, standard
# deviation and standard error of the mean.
stats_sum = mouse_study_clean[["Tumor Volume (mm3)", "Drug Regimen"]].groupby("Drug Regimen").agg({
    "Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]
})

# Write the table out as tab-separated text first ...
stats_sum.to_csv('output.txt', sep='\t')

# ... and leave the table as the last expression so the notebook renders it
# (an expression in the middle of a cell is evaluated but never displayed).
stats_sum

In [None]:
## Bar and Pie Charts

In [None]:
# Bar chart, drawn through the pandas plotting API, of the number of rows
# (Mouse ID/Timepoint observations) per drug regimen, largest count first.
df_barplot = mouse_study_clean.groupby("Drug Regimen")[["Timepoint"]].count()
df_barplot_sorted = df_barplot.sort_values(by="Timepoint", ascending=False)

bar_axes = df_barplot_sorted.plot(kind="bar", legend=False)
bar_axes.set_xlabel("Drug Regimen")
bar_axes.set_ylabel("# of Observed Mouse Timepoints")
bar_axes.set_title("Observed Mouse Timepoints by Drug Regimen")
plt.show()

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
df_barplot = mouse_study_clean[["Drug Regimen", "Timepoint"]].groupby("Drug Regimen").count()
df_barplot_sorted = df_barplot.sort_values("Timepoint", ascending=False)

# Draw with matplotlib directly (not DataFrame.plot) so this cell really is the
# pyplot counterpart of the pandas chart.
fig, ax = plt.subplots()
ax.bar(df_barplot_sorted.index, df_barplot_sorted["Timepoint"])
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("# of Observed Mouse Timepoints")
ax.set_title("Observed Mouse Timepoints by Drug Regimen")
# Regimen names are long; rotate them so they do not overlap.
ax.tick_params(axis="x", labelrotation=90)
fig.tight_layout()

# Save plot as image. This must happen before plt.show(): once the figure has
# been shown it is released, and a later savefig would write an empty image.
fig.savefig("observations_drugregimen.png")

plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
# Count the rows recorded for each sex and fix the wedge order to Male, Female.
sex_distribution_df = mouse_study_clean["Sex"].value_counts()
sex_distribution_df = sex_distribution_df.reindex(["Male", "Female"])

# Make the plot. A Series yields a single pie, so subplots=True is not needed.
sex_distribution_df.plot(kind='pie', figsize=(6, 6), autopct="%1.1f%%", title="Male and Female Distribution")
plt.axis("equal")

# Save plot as image. This must happen before plt.show(): once the figure has
# been shown it is released, and a later savefig would write an empty image.
plt.savefig("male_female_distribution.png")

# Show the plot
plt.show()

In [None]:
# Pie chart of the Male/Female split, drawn with matplotlib's pyplot interface.
sex_distribution_df = mouse_study_clean["Sex"].value_counts().reindex(["Male", "Female"])

# Wedges are labelled with the index entries (Male, Female) and their share.
plt.pie(
    sex_distribution_df,
    labels=sex_distribution_df.index,
    autopct="%1.1f%%",
)
plt.title("Male and Female Distribution")
plt.ylabel("Sex")
plt.axis("equal")

# Render the figure.
plt.show()

In [None]:
## Quartiles, Outliers and Boxplots

In [None]:
# First step towards the final tumor volume per mouse (used below for
# Capomulin, Ramicane, Infubinol and Ceftamin): find each mouse's greatest
# recorded timepoint and return it as a two-column frame (Mouse ID, Timepoint).
maxtime_treat_mouse = (
    mouse_study_clean
    .groupby(["Mouse ID"])["Timepoint"]
    .max()
    .reset_index()
)
maxtime_treat_mouse


In [None]:
# Attach the full study row recorded at each mouse's last timepoint by joining
# back onto the cleaned data on the (Mouse ID, Timepoint) pair.
maxtime_treat_tumor = maxtime_treat_mouse.merge(
    mouse_study_clean,
    how="left",
    on=["Mouse ID", "Timepoint"],
)
maxtime_treat_tumor

In [None]:
# Sanity check: number of distinct mice present in the last-timepoint table.
maxtime_treat_tumor.loc[:, "Mouse ID"].nunique()

In [None]:
# Regimens whose final tumor volumes are collected and plotted below.
treatments_list = [
    "Capomulin",
    "Ramicane",
    "Infubinol",
    "Ceftamin",
]
treatments_list


In [None]:
# Collect, in treatments_list order, one Series of last-timepoint tumor volumes
# per regimen; the list is the input for the boxplot.
tumor_vol_data_list = []

for treatment in treatments_list:
    on_this_treatment = maxtime_treat_tumor["Drug Regimen"] == treatment
    tumor_vol_data = maxtime_treat_tumor.loc[on_this_treatment, "Tumor Volume (mm3)"]
    tumor_vol_data_list.append(tumor_vol_data)

tumor_vol_data_list

In [None]:
# Create the boxplot: one box per regimen, in treatments_list order.
fig, ax1 = plt.subplots()
ax1.set_title("Max Tumor Vol by treatment")
ax1.set_ylabel('Tumor Volume mm3')
ax1.boxplot(tumor_vol_data_list)
ax1.set_xticklabels(treatments_list)

# Save before plt.show(): once the figure has been shown it is released, and a
# later savefig would write an empty image.
fig.savefig("TumorVolumebyTreatment.png")
plt.show()

In [None]:
## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
# Keep only the two columns needed for the plot, for mouse s185.
mouse_data = mouse_study_clean.loc[mouse_study_clean["Mouse ID"] == "s185", ["Timepoint", "Tumor Volume (mm3)"]]

time = mouse_data["Timepoint"]
tumor_value = mouse_data["Tumor Volume (mm3)"]

x_axis = time
y_axis = tumor_value

fig, ax = plt.subplots()
ax.plot(x_axis, y_axis)
ax.set_xlabel("timepoint")
ax.set_ylabel("Tumor volume")
ax.set_title("tumor volume vs. time point for Mouse s185 in Capomulin treatment")

# Save before plt.show(): once the figure has been shown it is released, and a
# later savefig would write an empty image.
fig.savefig("tumorvolumebytimeinCapomulinmouse.png")
plt.show()

In [None]:
## Scatter Plot, Correlation and Regression

In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen
capomulin_mice = mouse_study_clean[mouse_study_clean["Drug Regimen"] == "Capomulin"]

# Average per mouse, so every mouse contributes exactly one point. Grouping by
# weight instead would merge all mice sharing a weight into a single point.
# NOTE(review): assumes a mouse's "Weight (g)" is the same on all of its rows,
# so the per-mouse mean is simply that weight - confirm against the data.
capomulin_mouse_avg = capomulin_mice.groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]].mean()
average_tumor_vol = capomulin_mouse_avg["Tumor Volume (mm3)"]

x_values = capomulin_mouse_avg["Weight (g)"]
y_values = average_tumor_vol

# Least-squares fit of average tumor volume on weight, plus Pearson's r.
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
correlation = st.pearsonr(x_values, y_values)

fig, ax = plt.subplots()
ax.scatter(x_values, y_values, marker="o", facecolors="blue", edgecolors="blue", alpha=0.75)
ax.plot(x_values, regress_values, "r-")
# Position the equation in axes-fraction coordinates so it is always inside the
# plot area, whatever the data range is.
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.set_xlabel("Weight (g)")
ax.set_ylabel("Average Tumor Volume (mm3)")
ax.set_title("Average Tumor Volume vs. Weight for Capomulin Mice")
print(f"The r-squared is: {rvalue**2}")

# Save before plt.show(): once the figure has been shown it is released, and a
# later savefig would write an empty image.
fig.savefig("Avgtumor_weight_regression.png")
plt.show()
print(f"The correlation between both factors is {round(correlation[0],2)}")