## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)
study_results
# Combine the data into a single dataset
combined_df= mouse_metadata.merge(study_results, on='Mouse ID', how='inner')

# Display the data table for preview
combined_df.sample(10)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
1394,q132,Infubinol,Female,1,30,15,49.159749,3
188,j989,Ramicane,Male,8,19,15,41.128354,1
1006,j755,Naftisol,Male,23,27,10,48.167061,0
1166,m133,Naftisol,Female,2,26,10,47.503338,0
1258,o287,Ceftamin,Male,2,28,35,55.11829,4
454,a203,Infubinol,Female,20,23,0,45.0,0
1544,t451,Stelasyn,Male,8,29,15,49.652788,0
1655,v991,Propriva,Female,10,30,30,62.743643,3
1176,m269,Stelasyn,Female,22,28,30,67.71251,1
212,a520,Ramicane,Male,13,21,35,37.62471,1


In [2]:
# Checking the number of mice.
num_cases = len(combined_df)
print(f'Number of Cases = {num_cases}')

num_mice = len(combined_df['Mouse ID'].unique())
print(f'Number of Mice = {num_mice}')

Number of Cases = 1893
Number of Mice = 249


In [111]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

duped_mice_id = combined_df[combined_df.duplicated(
                                                subset=['Mouse ID','Timepoint'], 
                                                keep=False)]['Mouse ID'].unique()

cleaned_combined_df = combined_df[~combined_df['Mouse ID'].isin(duped_mice_id)]


print(f'There are {len(cleaned_combined_df["Mouse ID"].unique())} Mouse IDs with non-duped Timepoints')


There are 248 Mouse IDs with non-duped Timepoints


In [77]:
# Optional: Get all the data for the duplicate mouse ID. 
unclean_df = combined_df[combined_df.duplicated(subset=['Mouse ID','Timepoint'], keep=False)]


unclean_df


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
916,g989,Propriva,Female,21,26,20,55.326122,1
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
917,g989,Propriva,Female,21,26,20,54.65765,1
915,g989,Propriva,Female,21,26,15,53.44202,0
909,g989,Propriva,Female,21,26,0,45.0,0
908,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0


In [108]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = combined_df[~combined_df['Mouse ID'].isin(duped_mice_id)]

clean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
460,a203,Infubinol,Female,20,23,30,59.523197,1
461,a203,Infubinol,Female,20,23,35,61.931650,2
459,a203,Infubinol,Female,20,23,25,56.793208,1
458,a203,Infubinol,Female,20,23,20,55.173336,1
457,a203,Infubinol,Female,20,23,15,52.777870,1
...,...,...,...,...,...,...,...,...
1884,z969,Naftisol,Male,9,30,5,49.332999,0
1885,z969,Naftisol,Male,9,30,10,52.656081,1
1886,z969,Naftisol,Male,9,30,15,54.713005,2
1887,z969,Naftisol,Male,9,30,20,57.898778,2


In [81]:
# Checking the number of mice in the clean DataFrame.
count_clean_df = len(clean_df['Mouse ID'].unique())
print(f'Number of Mice that have no Timepoint duplication: {count_clean_df}')

Number of Mice that have no Timepoint duplication: 249


## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 

# mean, median, variance, standard deviation, and SEM of the tumor volume. 
grouped_clean = clean_df.groupby(by=['Drug Regimen'])
clean_df_mean = grouped_clean.mean()
clean_df_median = grouped_clean.median()
clean_df_var = grouped_clean.var()
clean_df_stdev = grouped_clean.std()
clean_df_sem = grouped_clean.sem()

# Assemble the resulting series into a single summary dataframe.
summary_stats_df = pd.DataFrame({'TumorVol Mean':clean_df_mean['Tumor Volume (mm3)'],
                                'TumorVol Median':clean_df_median['Tumor Volume (mm3)'],
                                'TumorVol Variance':clean_df_var['Tumor Volume (mm3)'],
                                 'TumorVol StdDev':clean_df_stdev['Tumor Volume (mm3)'],
                                 'TumorVol StdErr':clean_df_sem['Tumor Volume (mm3)']})


summary_stats_df


In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
df_grouped = clean_df.groupby(by='Drug Regimen')

# Using the aggregation method, produce the same summary statistics in a single line
df_grouped['Tumor Volume (mm3)'].agg(['mean','median','var','std','sem'])


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
cmap = ListedColormap(['#e50000'])
grouped_count_byRegimen = grouped_clean['Mouse ID'].count()
fig = plt.figure(figsize = (10, 5))
count_gph = grouped_count_byRegimen.plot.bar(x=0,y=1,colormap=cmap,width=.4)

count_gph.set_xlabel('Regimen')
count_gph.set_ylabel('Number of Measurements Taken')
count_gph.set_title('Count of Measurements Taken per Regimen')

count_gph.set_ylim(0,36)
count_gph.set_xlim(-.75,9.75)

count_gph

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.

fig = plt.figure(figsize = (10, 5))
x_values = grouped_count_byRegimen.index.tolist()
y_values = grouped_count_byRegimen
plt.bar(x_values,y_values, color ='red', width = 0.4)
  
plt.xlabel("Regimen") 
plt.ylabel("Number of Measurements Taken") 
plt.title("Count of Measurements Taken per Regimen") 
plt.show() 

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
grouped_sex = clean_df.groupby(by=['Sex']).count()
grouped_sex_df = grouped_sex['Mouse ID']

plot = grouped_sex_df.plot.pie(figsize=(10, 10))
plot.set_xlabel('Sex')
plot.set_ylabel('Count of Mice')
plot.set_title('Number of Mice per Gender')


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
x_values = grouped_sex_df.index.tolist()
y_values = grouped_sex_df

fig = plt.figure(figsize=(10, 10)) 
plt.pie(y_values, labels=x_values)

plt.xlabel("Sex") 
plt.ylabel("Count of Mice") 
plt.title("Number of Mice per Gender") 
plt.show() 

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

filtered_df = clean_df.loc[clean_df['Drug Regimen'].isin(['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin'])]

# Start by getting the last (greatest) timepoint for each mouse
# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

max_timepoint_df = filtered_df[filtered_df['Timepoint'] == filtered_df.groupby('Mouse ID')['Timepoint'].transform('max')]

max_timepoint_df = max_timepoint_df[['Mouse ID','Drug Regimen','Tumor Volume (mm3)','Timepoint','Weight (g)']]
finalvol_df = max_timepoint_df.rename(columns={'Tumor Volume (mm3)': 'Final Tumor Vol'})
finalvol_df

In [None]:
# Put treatments into a list for for loop (and later for plot labels)

# Create empty list to fill with tumor vol data (for plotting)
tumor_vols = []

# Calculate the IQR and quantitatively determine if there are any potential outliers. 
data = finalvol_df['Final Tumor Vol']
IQR = st.iqr(data, interpolation = 'midpoint') 
data_median = data.median()  

upper_bound = data_median + (.5*IQR)
lower_bound = data_median - (.5*IQR)

print(f'IQR: {round(IQR,4)}')
print(f'Upper Bound: {round(upper_bound,4)}')
print(f'Lower Bound: {round(lower_bound,4)}')

# Locate the rows which contain mice on each drug and get the tumor volumes
# add subset 
# Determine outliers using upper and lower bounds
outliers=[]

upper_df = finalvol_df.loc[(finalvol_df["Final Tumor Vol"] > upper_bound)]
lower_df = finalvol_df.loc[(finalvol_df["Final Tumor Vol"] < lower_bound)]
outliers.append(upper_df)
outliers.append(lower_df)




In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

fig = plt.figure(figsize =(10, 7)) 

capomulin_vol = finalvol_df.loc[finalvol_df['Drug Regimen'] == 'Capomulin']['Final Tumor Vol']
ramicane_vol = finalvol_df.loc[finalvol_df['Drug Regimen'] == 'Ramicane']['Final Tumor Vol']
ceftamin_vol = finalvol_df.loc[finalvol_df['Drug Regimen'] == 'Ceftamin']['Final Tumor Vol']
infubinol_vol = finalvol_df.loc[finalvol_df['Drug Regimen'] == 'Infubinol']['Final Tumor Vol']

# Creating plot 
data_to_plot = [capomulin_vol,ramicane_vol,ceftamin_vol,infubinol_vol]
box_plot = plt.boxplot(data_to_plot) 
xticks = ['Capomulin', 'Ramicane', 'Ceftamin', 'Infubinol']
plt.xticks([1,2,3,4],xticks)
# show plot 
plt.title('Final Tumor Volume within Major Regimens')
plt.xlabel('Regimens')
plt.ylabel('Final Tumor Volume')

plt.ylim(0,80)
plt.xlim(0.5,4.5)
#plt.grid()
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
y_value = clean_df.loc[clean_df['Mouse ID'] == 'm957']['Tumor Volume (mm3)']

x_value = clean_df.loc[clean_df['Mouse ID'] == 'm957']['Timepoint']

plt.plot(x_value,y_value)
plt.title('Tumor Volume vs Timepoint for Mouse ID m957')
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume')
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
filtered_df = clean_df.loc[clean_df['Drug Regimen'] == 'Capomulin']

y = filtered_df.groupby(by=['Weight (g)'])['Tumor Volume (mm3)'].mean()
x_str = y.index.tolist()
x = np.array([float(i) for i in x_str])

plt.scatter(x,y)

plt.title('Avg Tumor Vol vs Mouse Weight for Capomulin Regimen')
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Tumor Volume')

plt.show()


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
lr = st.linregress(y,x)

print(f'Correlation Coefficient = {lr[0]}')
print(" ")
print("Regression Statistics")
print("_______________________________________________")
print("")
print(f'Intercept = {lr[1]}')
print(f'r-value = {lr[2]}')
print(f'p-value = {lr[3]}')
print(f'Standard Error = {lr[4]}')

#plt.scatter(x_vals, lr[0]*x_vals + lr[1])
slope, intercept, r_value, p_value, std_err = st.linregress(x,y)
print(slope)
print(intercept)

plt.plot(x, y, 'o', label='Original Data')
plt.plot(x, intercept + slope * 2, 'r', label='Linear Regress')
plt.legend()
plt.show()