## Observations and Insights 

In [16]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two study CSV files
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Load the per-mouse metadata and the per-measurement study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables on their shared "Mouse ID" column
df = mouse_metadata.merge(study_results, on="Mouse ID")
# Preview the first rows of the combined table
df.head()


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [15]:
# Number of rows in the combined dataset
print(df.shape[0])

1893


In [2]:
# Count the distinct mouse IDs in the dataset.
# dropna=False keeps the count identical to len(unique()), which would
# also count a missing ID as its own value.
number_of_mice = df['Mouse ID'].nunique(dropna=False)
number_of_mice

249

In [None]:
# Find rows whose (Mouse ID, Timepoint) pair has already appeared earlier
# in the table, then show which mouse IDs those rows belong to.
is_repeat = df.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicated = df.loc[is_repeat]
duplicated['Mouse ID']

In [10]:
# Optional: Get all the data for the duplicate mouse ID. 
# NOTE(review): `duplicated` comes from DataFrame.duplicated() called without
# a `keep` argument, so it appears to hold only the repeated occurrences of
# each (Mouse ID, Timepoint) pair, not every row recorded for that mouse.
# Confirm whether the full history of the mouse was intended here.
duplicated

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [13]:
# Create a clean DataFrame by dropping every mouse that has duplicate records.
# The affected IDs are derived from the data (a repeated Mouse ID/Timepoint
# pair) instead of being hard-coded, so the cell stays correct if the input
# files change.
duplicate_ids = df.loc[df.duplicated(['Mouse ID', 'Timepoint']), 'Mouse ID'].unique()

# Remove ALL rows of the affected mice, not just the repeated ones; rebinding
# `df` avoids an in-place mutation and is safe to re-run.
df = df[~df['Mouse ID'].isin(duplicate_ids)]
len(df)

1880

In [12]:
# Count the distinct mouse IDs remaining in the clean DataFrame.
# dropna=False keeps the count identical to len(unique()).
number_of_mice = df['Mouse ID'].nunique(dropna=False)
number_of_mice

248

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
# mean, median, variance, standard deviation, and SEM of the tumor volume.
# Assemble the resulting series into a single summary dataframe.

# Group once and reuse the grouped tumor-volume column for every statistic
tumor_by_regimen = df.groupby('Drug Regimen')['Tumor Volume (mm3)']

# Each statistic is a Series indexed by regimen; the dict keys become the
# column names of the summary table
summary = pd.DataFrame({
    'Mean': tumor_by_regimen.mean(),
    'Median': tumor_by_regimen.median(),
    'Variance': tumor_by_regimen.var(),
    'Std. Dev.': tumor_by_regimen.std(),
    'SEM': tumor_by_regimen.sem(),
})

summary



In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line.
# Selecting the column as a one-column frame keeps the two-level column header.
df.groupby('Drug Regimen')[['Tumor Volume (mm3)']].agg(['mean', 'median', 'var', 'std', 'sem'])

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
# Count the measurement rows recorded for each regimen
drug_group = df.groupby('Drug Regimen')
counts = drug_group['Drug Regimen'].count()

# Order the bars from the most-measured regimen to the least
count = counts.sort_values(ascending=False)
drug_chart = count.plot.bar(rot=45, title="Total Measurements per Drug Regimen")

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.

plt.bar(count.index, count, width=0.5, color='b', alpha=1, align='center')
plt.xticks(rotation=45)
plt.xlabel("Drug Regimen")
# plt.title must be CALLED: assigning a string to it would replace the
# function, draw no title, and break every later plt.title(...) call in the
# session (restart the kernel if that assignment has already been executed).
plt.title("Total Measurements per Drug Regimen")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Keep one row per mouse so each animal is counted exactly once, regardless
# of which timepoints were recorded for it.
mice_sex = df.drop_duplicates(subset='Mouse ID')['Sex']

# Number of mice of each sex (also reused by the pyplot pie chart)
counts = mice_sex.value_counts()
plot = counts.plot.pie(figsize=(4, 4))


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Pull the first wedge slightly out of the pie; the second stays in place
wedge_offsets = (0.1, 0)
plt.pie(
    counts,
    explode=wedge_offsets,
    labels=counts.index,
    autopct="%1.1f%%",
    shadow=True,
)
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
group = df.groupby('Mouse ID')
great = group['Timepoint'].max()

# Attach each mouse's last timepoint to all of its rows. Both sides carry a
# 'Timepoint' column, so the merge yields 'Timepoint_x' (the row's own
# timepoint) and 'Timepoint_y' (the mouse's last timepoint).
merge_df = pd.merge(df, great, on="Mouse ID")

# Keep only the four regimens of interest
regimens_of_interest = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
merge_df = merge_df.loc[merge_df['Drug Regimen'].isin(regimens_of_interest)]

# A row is a mouse's final measurement when its own timepoint equals the
# mouse's last timepoint
is_final = merge_df['Timepoint_x'] == merge_df['Timepoint_y']
final = merge_df.loc[is_final]
final_volume = final[['Mouse ID', 'Drug Regimen', 'Tumor Volume (mm3)']]
volume = final_volume['Tumor Volume (mm3)']
final_volume

In [None]:
# Put treatments into a list for the loop (and later for plot labels)
treatments = merge_df['Drug Regimen'].unique()

# Final tumor volumes, one Series per regimen in `treatments` order (for plotting)
tumor_volume = []
# Rows flagged as potential outliers, gathered regimen by regimen
outlier_frames = []

# Quartiles, IQR and the 1.5*IQR fences are computed separately for each
# regimen: pooling all four regimens would let one regimen's spread hide
# another's outliers and would disagree with the per-regimen box plot.
for drug in treatments:
    # Locate the rows which contain mice on this drug and get the tumor volumes
    drug_rows = final_volume[final_volume['Drug Regimen'] == drug]
    drug_volumes = drug_rows['Tumor Volume (mm3)']
    tumor_volume.append(drug_volumes)

    # Calculate the IQR for this regimen
    quartiles = drug_volumes.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    print(f"{drug}:")
    print(f"The lower quartile of volume is: {lowerq}")
    print(f"The upper quartile of volume is: {upperq}")
    print(f"The interquartile range of volume is: {iqr}")
    print(f"The median of volume is: {quartiles[0.5]} ")

    # Determine outliers using upper and lower bounds
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")

    drug_outliers = drug_rows.loc[(drug_volumes < lower_bound) | (drug_volumes > upper_bound)]
    print(f"The number of outliers is {len(drug_outliers)}.")
    outlier_frames.append(drug_outliers)

# All flagged rows across the regimens (an empty frame if nothing was processed)
outliers = pd.concat(outlier_frames) if outlier_frames else final_volume.iloc[0:0]
print(f"The total number of outliers is {len(outliers)}.")


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()
ax1.set_title('Final Tumor Volume Across Four Regimens')
ax1.set_ylabel('Tumor Volume (mm3)')
ax1.set_xlabel('Drug Regimen')
ax1.boxplot(tumor_volume)

# boxplot places the boxes at x = 1..N by default; derive the tick positions
# from the number of plotted series instead of hard-coding four of them
tick_locations = list(range(1, len(tumor_volume) + 1))
plt.xticks(tick_locations, treatments)
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

# Make a list of Mouse ID's for treatment Capomulin
on_capomulin = df['Drug Regimen'] == 'Capomulin'
treat_group = df.loc[on_capomulin]
mice = treat_group['Mouse ID'].unique()

# Show the candidate IDs and how many there are
print(mice)
print(len(mice))

In [None]:
# Choose a mouse from the mice list (any valid index into `mice`)
mouse = mice[1]
capo = df.loc[df['Mouse ID'] == mouse]

# Make the line plot for the chosen mouse. `capo` already holds a single
# mouse, so it is plotted directly (no groupby needed); the title and y-label
# identify what is being shown.
ax = capo.plot.line(
    x='Timepoint',
    y='Tumor Volume (mm3)',
    title=f"Tumor Volume over Time for Mouse {mouse}",
)
ax.set_ylabel('Tumor Volume (mm3)')
plt.show()


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
# Get all the rows for Capomulin
group = df.loc[df['Drug Regimen'] == 'Capomulin']

# Average per MOUSE so every mouse contributes exactly one point. Averaging
# per weight and merging back onto every measurement row would repeat each
# point many times and distort the correlation and regression computed from
# x_values / y_values in the following cells.
avg = group.groupby('Mouse ID')[['Weight (g)', 'Tumor Volume (mm3)']].mean()

# Get the x and y values and make the scatter plot
x_values = avg['Weight (g)']
y_values = avg['Tumor Volume (mm3)']
plt.scatter(x_values, y_values)
plt.xlabel('Weight in grams')
plt.ylabel('Average Tumor Volume')
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient for mouse weight and average tumor volume for the Capomulin regimen
# pearsonr returns the coefficient first, followed by the p-value
correlation = st.pearsonr(x_values, y_values)
coefficient = correlation[0]
print(f"The correlation coefficient is {coefficient}")

In [None]:
# Linear regression model for mouse weight and average tumor volume for the Capomulin regimen

# Fit the line, then unpack the fit into its individual statistics
fit = st.linregress(x_values, y_values)
slope, intercept, rvalue, pvalue, stderr = fit

# Predicted tumor volume at each observed weight, and the equation label
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# Scatter of the observations with the fitted line drawn on top in red
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, xy=(20, 37), fontsize=15, color="red")
plt.xlabel('Weight in grams')
plt.ylabel('Average Tumor Volume')
plt.show()