## Observations and Insights
The study had a nearly identical number of male and female mice. It would be interesting to determine whether the sex of the mouse is correlated with treatment efficacy.
Capomulin was effective in reducing the average SCC tumor volume in mice over a 45-day period. It would be interesting to continue the study past 45 days to see whether the tumor would be completely ablated.
The weight of the mouse correlated strongly (Pearson correlation coefficient of 0.84) with average tumor volume. Correlating only the final tumor volume to efficacy without controlling for mouse weight could skew the interpretation of efficacy.


In [7]:
# Import Dependencies and Setup
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress

In [8]:
# Paths to the study data files.
# The paths get their own names so that `mouse_metadata` and `study_results`
# only ever refer to DataFrames, never to path strings.
mouse_metadata_path = "Data/Mouse_metadata.csv"
study_results_path = "Data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset. The left join keeps every row of the
# mouse metadata and attaches the matching study-result rows by "Mouse ID".
merged_df = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="left")
merged_df.head()


FileNotFoundError: [Errno 2] File b'Data/Mouse_metadata.csv' does not exist: b'Data/Mouse_metadata.csv'

## Dependencies and starter code

## Summary statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.

# Select the tumor-volume column *before* aggregating so only that column is
# summarized; aggregating the whole frame would also try to average text
# columns such as "Mouse ID" and "Sex".
tabletu = merged_df.groupby('Drug Regimen')["Tumor Volume (mm3)"]

# Compute the statistics and give them readable headers. The rename is applied
# to the summary table itself, which is the table displayed below.
first = (
    tabletu
    .agg(['mean', 'median', 'var', 'std', 'sem'])
    .rename(columns={
        "mean": "Mean Tumor Value",
        "median": "Median Tumor Volume",
        "var": "Tumor Volume Variance",
        "std": "Tumor Volume Std. Dev",
        "sem": "Tumor Volume Std. Err",
    })
)
first

## Bar plots

In [None]:
# Bar chart, drawn through the pandas plotting API, of how many data points
# each treatment regimen has.

# Per-regimen count of non-null tumor-volume rows. Both forms are kept because
# later cells read them: `tabletu` (Series) and `first` (one-column DataFrame).
tabletu = merged_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].count()
first = tabletu.to_frame()

# DataFrame.plot.bar returns the Axes it drew on; label it directly.
also = first.plot.bar(legend=False, rot=50)
also.set_ylabel("Number of Data Points")
also.set_title("Data Points Per Drug Treatment Regimen")
also.get_figure().savefig('barplot1')

In [None]:
# Preview the first rows of the per-regimen data-point counts
first.head()

In [None]:
# The same data-points-per-regimen bar chart, this time built with matplotlib
# directly instead of the pandas plotting helper.

# One bar position per regimen; the tick marks sit on those positions.
x_axis = np.arange(len(tabletu))
tick_locations = list(x_axis)

bar_fig = plt.figure(figsize=(5, 3))
bar_ax = bar_fig.add_subplot(111)

# Move 'Drug Regimen' out of the index so the regimen names can label the ticks.
newtry = first.reset_index()

bar_ax.bar(x_axis, first['Tumor Volume (mm3)'], alpha=0.75, align="center")
bar_ax.set_xticks(tick_locations)
bar_ax.set_xticklabels(newtry['Drug Regimen'], rotation="vertical")

# Pad the x-range slightly beyond the outer bars and fix the count-axis range.
bar_ax.set_xlim(-0.75, len(tabletu) - .25)
bar_ax.set_ylim(0, 250)

bar_ax.set_title("Data Points Per Drug Treatment Regimen")
bar_ax.set_xlabel("Drug Regimen")
bar_ax.set_ylabel("Number of Data Points")

bar_fig.savefig('barplot2')
plt.show()


## Pie plots

In [None]:
# Group the mouse metadata by sex and count the non-null entries in every
# remaining column; each column of the result holds one count per sex.

bygender = mouse_metadata.groupby("Sex").count()

# Display the per-sex counts
bygender

In [None]:
# Distinct values of the 'Sex' column, wrapped in a one-element list
# (not referenced again in the cells shown here).
morf = [mouse_metadata['Sex'].unique()]

# Pie-slice labels: the sexes in the same order as the rows of `bygender`.
mlabel = bygender.index.tolist()
mlabel

In [None]:
# Number of mice of each sex as a flat list of counts, one entry per row of
# `bygender`. plt.pie expects a 1D sequence; wrapping the Series in another
# list would hand it a two-dimensional input instead.
amount = bygender["Mouse ID"].tolist()
amount

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# The colors of each section of the pie chart
colors = ["orange", "blue"]

# np.ravel flattens `amount` into the 1D sequence plt.pie requires, whether it
# holds the counts directly or a Series nested inside a list.
# The slice labels come from `mlabel` (the sexes in `bygender` row order), so
# each label lines up with its count.
plt.pie(np.ravel(amount), labels=mlabel, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=180)
plt.title('Male vs Female Mouse Population')
plt.ylabel('Sex')

plt.savefig('piechart1')
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Series.plot.pie draws one slice per index entry of the Series (here one per
# sex) and returns the Axes it drew on. No `y` argument is passed: `y` selects a
# column of a DataFrame and has no meaning when plotting a single Series.
pypie = bygender["Mouse ID"].plot.pie(figsize=(5, 5), autopct="%1.1f%%", startangle=180)
pypie.set_title('Male vs Female Mouse Population')
# Replace the default y-label (the Series name) with a descriptive one
pypie.set_ylabel('Sex')

plt.savefig('piechart2')
plt.show()

## Quartiles, outliers and boxplots

In [None]:
# Final tumor volume, IQR and potential outliers are worked out for four of the
# most promising treatment regimens. First step: one DataFrame per regimen,
# holding every measurement row recorded for that drug.
regimen = merged_df["Drug Regimen"]

cap_df = merged_df[regimen == "Capomulin"]
ram_df = merged_df[regimen == "Ramicane"]
inf_df = merged_df[regimen == "Infubinol"]
ceft_df = merged_df[regimen == "Ceftamin"]
cap_df.head()

In [None]:
# Final measurement for each Capomulin mouse.
# Step 1: the latest timepoint recorded per mouse. Only the 'Timepoint' column
# is aggregated; taking max() of the whole frame would compute a maximum for
# every other column just to throw it away.
caplast = cap_df.groupby('Mouse ID')['Timepoint'].max()
caplastvol = caplast.to_frame()

# Step 2: join back to the full data on (Mouse ID, Timepoint) to pick up the
# row recorded at that last timepoint, including its tumor volume.
caplastmerge = pd.merge(caplastvol, merged_df, on=["Mouse ID", "Timepoint"], how="left")
caplastmerge.head(5)

In [None]:
# Capomulin: quartiles, IQR and outlier check on the final tumor volumes
tumors = caplastmerge["Tumor Volume (mm3)"]

quartiles = tumors.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

# 1.5 x IQR rule: values outside [lower_bound, upper_bound] are potential outliers
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
print(f"Capomulin potential outliers are values below {lower_bound} or above {upper_bound}.")

# Apply the bounds to the data so the cell reports the outliers themselves,
# not just the thresholds.
outliers = tumors[(tumors < lower_bound) | (tumors > upper_bound)]
print(f"Capomulin potential outliers found: {outliers.tolist()}")

In [None]:
# Final measurement for each Ramicane mouse.
# Step 1: the latest timepoint recorded per mouse. Only the 'Timepoint' column
# is aggregated; taking max() of the whole frame would compute a maximum for
# every other column just to throw it away.
ramlast = ram_df.groupby('Mouse ID')['Timepoint'].max()
ramlastvol = ramlast.to_frame()

# Step 2: join back to the full data on (Mouse ID, Timepoint) to pick up the
# row recorded at that last timepoint, including its tumor volume.
ramlastmerge = pd.merge(ramlastvol, merged_df, on=["Mouse ID", "Timepoint"], how="left")
ramlastmerge.head()

In [None]:
# Ramicane: quartiles, IQR and outlier check on the final tumor volumes
tumors2 = ramlastmerge["Tumor Volume (mm3)"]

quartiles2 = tumors2.quantile([.25, .5, .75])
lowerq2 = quartiles2[0.25]
upperq2 = quartiles2[0.75]
iqr2 = upperq2 - lowerq2

# 1.5 x IQR rule: values outside [lower_bound2, upper_bound2] are potential outliers
lower_bound2 = lowerq2 - (1.5 * iqr2)
upper_bound2 = upperq2 + (1.5 * iqr2)
print(f"Ramicane potential outliers are values below {lower_bound2} or above {upper_bound2}.")

# Apply the bounds to the data so the cell reports the outliers themselves,
# not just the thresholds.
outliers2 = tumors2[(tumors2 < lower_bound2) | (tumors2 > upper_bound2)]
print(f"Ramicane potential outliers found: {outliers2.tolist()}")


In [None]:
# Final measurement for each Infubinol mouse.
# Step 1: the latest timepoint recorded per mouse. Only the 'Timepoint' column
# is aggregated; taking max() of the whole frame would compute a maximum for
# every other column just to throw it away.
inflast = inf_df.groupby('Mouse ID')['Timepoint'].max()
inflastvol = inflast.to_frame()

# Step 2: join back to the full data on (Mouse ID, Timepoint) to pick up the
# row recorded at that last timepoint, including its tumor volume.
inflastmerge = pd.merge(inflastvol, merged_df, on=["Mouse ID", "Timepoint"], how="left")
inflastmerge.head()

In [None]:
# Infubinol: quartiles, IQR and outlier check on the final tumor volumes
tumors3 = inflastmerge["Tumor Volume (mm3)"]

quartiles3 = tumors3.quantile([.25, .5, .75])
lowerq3 = quartiles3[0.25]
upperq3 = quartiles3[0.75]
iqr3 = upperq3 - lowerq3

# 1.5 x IQR rule: values outside [lower_bound3, upper_bound3] are potential outliers
lower_bound3 = lowerq3 - (1.5 * iqr3)
upper_bound3 = upperq3 + (1.5 * iqr3)
print(f"Infubinol potential outliers are values below {lower_bound3} or above {upper_bound3}.")

# Apply the bounds to the data so the cell reports the outliers themselves,
# not just the thresholds.
outliers3 = tumors3[(tumors3 < lower_bound3) | (tumors3 > upper_bound3)]
print(f"Infubinol potential outliers found: {outliers3.tolist()}")

In [None]:
# Final measurement for each Ceftamin mouse.
# Step 1: the latest timepoint recorded per mouse. Only the 'Timepoint' column
# is aggregated; taking max() of the whole frame would compute a maximum for
# every other column just to throw it away.
ceftlast = ceft_df.groupby('Mouse ID')['Timepoint'].max()
ceftlastvol = ceftlast.to_frame()

# Step 2: join back to the full data on (Mouse ID, Timepoint) to pick up the
# row recorded at that last timepoint, including its tumor volume.
ceftlastmerge = pd.merge(ceftlastvol, merged_df, on=["Mouse ID", "Timepoint"], how="left")
ceftlastmerge.head()

In [None]:
# Ceftamin: quartiles, IQR and outlier check on the final tumor volumes
tumors4 = ceftlastmerge["Tumor Volume (mm3)"]

quartiles4 = tumors4.quantile([.25, .5, .75])
lowerq4 = quartiles4[0.25]
upperq4 = quartiles4[0.75]
iqr4 = upperq4 - lowerq4

# 1.5 x IQR rule: values outside [lower_bound4, upper_bound4] are potential outliers
lower_bound4 = lowerq4 - (1.5 * iqr4)
upper_bound4 = upperq4 + (1.5 * iqr4)
print(f"Ceftamin potential outliers are values below {lower_bound4} or above {upper_bound4}.")

# Apply the bounds to the data so the cell reports the outliers themselves,
# not just the thresholds.
outliers4 = tumors4[(tumors4 < lower_bound4) | (tumors4 > upper_bound4)]
print(f"Ceftamin potential outliers found: {outliers4.tolist()}")

In [None]:
# Box plot comparing the final tumor volume of each mouse across the four
# regimens of interest.

# One Series of final tumor volumes per regimen, listed in the same order as
# the tick labels passed to boxplot below.
data_to_plot = [tumors, tumors2, tumors3, tumors4]

fig1, ax1 = plt.subplots()
ax1.set(
    title='Tumors',
    xlabel='Drug Regimen',
    ylabel='Final Tumor Volume (mm3)',
)

ax1.boxplot(data_to_plot, labels=["Capomulin", "Ramicane", "Infubinol", "Ceftamin"])

fig1.savefig('boxplot')
plt.show()

## Line and scatter plots

In [None]:
# Every measurement row for one Capomulin-treated mouse (ID "l509"),
# used as the data for the line plot.
is_l509 = cap_df["Mouse ID"] == "l509"
forline_df = cap_df[is_l509]
forline_df.head()

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

# x: measurement timepoints, y: tumor volume at each timepoint, both for mouse l509
x_axisTP = forline_df["Timepoint"]
tumsiz = forline_df["Tumor Volume (mm3)"]

plt.title('Capomulin treatment of mouse l509')
plt.plot(x_axisTP, tumsiz, linewidth=2, markersize=12)
plt.xlabel('Timepoint (Days)')
plt.ylabel('Tumor Volume (mm3)')

plt.savefig('linechart')
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# Per-mouse averages over all of that mouse's Capomulin measurements.
# numeric_only=True restricts the mean to numeric columns; text columns such as
# 'Drug Regimen' and 'Sex' cannot be averaged and make a plain mean() raise on
# current pandas versions.
capavg = cap_df.groupby(['Mouse ID']).mean(numeric_only=True)

plt.scatter(capavg['Weight (g)'], capavg['Tumor Volume (mm3)'])
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

plt.savefig('scatterplot')
plt.show()

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen
# `capavg` (built in the scatter-plot cell) holds the per-mouse averages for the
# Capomulin regimen. st.pearsonr returns (correlation coefficient, p-value);
# only the coefficient is reported.
correlation = st.pearsonr(capavg['Weight (g)'], capavg['Tumor Volume (mm3)'])
print(f"The correlation between both factors is {round(correlation[0],2)}")

In [None]:
# Print out the r-squared value along with the plot.
# Fit a linear regression of average tumor volume on mouse weight for the
# Capomulin regimen, using the per-mouse averages in `capavg` (built in the
# scatter-plot cell), and draw the fitted line over the scatter plot.
x_values = capavg['Weight (g)']
y_values = capavg['Tumor Volume (mm3)']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)

# Fitted tumor volume for every observed weight
regress_values = x_values * slope + intercept
line_eq = f'y = {round(slope, 2)}x + {round(intercept, 2)}'

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (17, 37), fontsize=15, color="black")
plt.title("Mouse weight vs. Avg. Tumor Volume")
plt.xlabel("Mouse weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")

# linregress returns the correlation coefficient r; r-squared is its square
print(f"The r-squared is: {rvalue**2}")
print(f"The equation of the regression line is: {line_eq}")

# Save the figure; create the target directory first so savefig cannot fail
# on a missing folder.
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/ScatterWeightTumorVolRegression.png")

plt.show()