## Observations and Insights 

In [None]:
# Dependencies and Setup
import os
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two raw study CSV files (both live in the local "data" folder)
mouse_metadata_path = os.path.join("data", "Mouse_metadata.csv")
study_results_path = os.path.join("data", "Study_results.csv")

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path) for csv_path in (mouse_metadata_path, study_results_path)
)

# Left-join the study results onto the mouse metadata, matching rows on "Mouse ID"
study_complete = mouse_metadata.merge(study_results, how="left", on="Mouse ID")

In [None]:
# Report how many distinct Mouse IDs and how many rows the merged DataFrame holds.
num_unique_mice = study_complete['Mouse ID'].nunique()
num_data_points = len(study_complete)
print(f'There are {num_unique_mice} unique mice IDs in the DataFrame, across {num_data_points} data points.')

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair repeats an earlier row.
# keep='first' leaves the first occurrence unflagged, so only the repeats are shown.
is_repeat = study_complete.duplicated(subset=["Mouse ID", "Timepoint"], keep="first")
study_complete.loc[is_repeat]

In [None]:
# Create a clean DataFrame by dropping rows that repeat an earlier
# (Mouse ID, Timepoint) pair. Only the repeated rows are removed: the first
# occurrence is kept and the mouse itself stays in the data set.
# The result is rebound rather than using inplace=True; drop_duplicates builds
# a new frame either way, so inplace offers no memory benefit here.
study_complete = study_complete.drop_duplicates(subset=["Mouse ID", "Timepoint"], keep="first")

In [None]:
# Checking the number of mice in the clean DataFrame and comparing it with the
# counts taken before the duplicate rows were dropped.
num_unique_mice2 = study_complete['Mouse ID'].nunique()
num_data_points2 = len(study_complete)
print(f'There are {num_unique_mice2} unique mice IDs in the DataFrame, across {num_data_points2} data points.')
# Both differences are "before minus after" so that a removal shows up as a
# positive number for mice as well as for data points.
print(f'Tidying up duplicates has removed {num_unique_mice-num_unique_mice2} mice and {num_data_points-num_data_points2} datapoints.')

## Summary Statistics

In [None]:
# Summary statistics (mean, median, variance, standard deviation, SEM) of the
# tumor volume for each regimen, filled in one regimen at a time.
drug_list = study_complete['Drug Regimen'].unique()
sum_stats = pd.DataFrame(index=drug_list)

# Output column name -> name of the pandas Series method that computes it
stat_methods = {
    "Tumor Volume Mean": "mean",
    "Tumor Volume Median": "median",
    "Tumor Volume Variance": "var",
    "Tumor Volume Standard Deviation": "std",
    "Tumor Volume SEM": "sem",
}
for drug in drug_list:
    # All tumor volume observations recorded for this regimen
    tumor_volumes = study_complete.loc[study_complete['Drug Regimen'] == drug, 'Tumor Volume (mm3)']
    for column, method_name in stat_methods.items():
        sum_stats.loc[drug, column] = getattr(tumor_volumes, method_name)()

# Show floats with two decimals (this is a global pandas display option)
pd.options.display.float_format = '{:,.2f}'.format
sum_stats

In [None]:
# Same summary table, built from a single groupby on the regimen.
grouped_drugs_df = study_complete.groupby(['Drug Regimen'])
tumor_by_drug = grouped_drugs_df['Tumor Volume (mm3)']

# One aggregated Series per statistic, keyed by the output column name
stat_series = {
    'Tumor Volume Mean': tumor_by_drug.mean(),
    'Tumor Volume Median': tumor_by_drug.median(),
    'Tumor Volume Variance': tumor_by_drug.var(),
    'Tumor Volume Standard Deviation': tumor_by_drug.std(),
    'Tumor Volume SEM': tumor_by_drug.sem(),
}
# index=drug_list puts the rows in the same order as the loop-based table
sum_stats_grouped = pd.DataFrame(stat_series, index=drug_list)
sum_stats_grouped

## Bar Plots

In [None]:
# First, a graph for all drugs combined, showing how the number of mice drops over time.
grouped_timepoint_df = study_complete.groupby(['Timepoint'])
mice_per_timepoint = grouped_timepoint_df['Mouse ID'].count()
y_axis = mice_per_timepoint
# Take the x positions from the grouped result itself so that every bar height
# is paired with its own timepoint. unique() returns values in order of first
# appearance, which is not guaranteed to match the sorted groupby output.
x_axis = mice_per_timepoint.index
plt.bar(x_axis, y_axis, color="b", align="center", label='All Drugs')
# This chart pools every regimen, so the title says so.
plt.title("Number of mice per time point (all treatments combined)")
plt.xlabel('Time Point')
plt.ylabel('Number of Mice')
plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
# Now the same pattern for individual drugs, one subplot per regimen.
# Using matplotlib.pyplot as plt
# The number of subplots follows drug_list instead of a hard-coded 10;
# squeeze=False keeps `axes` two-dimensional even when there is only one regimen.
fig, axes = plt.subplots(1, len(drug_list), squeeze=False)
for ax, drug in zip(axes[0], drug_list):
    y_axis = study_complete[study_complete['Drug Regimen'] == drug].groupby('Timepoint')['Mouse ID'].count()
    # Use this regimen's own timepoints as x positions: a regimen with no rows
    # at some timepoint would otherwise have fewer heights than x values.
    x_axis = y_axis.index
    ax.bar(x_axis, y_axis, align="center", label=drug, width=1)
    ax.set_title(drug)
    ax.set_ylabel('Number of Mice')
    ax.set_xlabel('Timepoint')
fig.set_size_inches(20, 10, forward=True)
fig.tight_layout()
plt.show()

In [None]:
# Same thing, using the pandas plotting API.
# Row counts per (regimen, timepoint); indexing by regimen yields a Series over timepoints.
pandas_df = study_complete.groupby(["Drug Regimen", "Timepoint"])['Mouse ID'].count()
# One subplot per regimen in drug_list (not a hard-coded 10); squeeze=False
# keeps `axes` two-dimensional even when there is only one regimen.
fig, axes = plt.subplots(1, len(drug_list), squeeze=False)
for ax, drug in zip(axes[0], drug_list):
    p_df = pandas_df[drug]
    p_df.plot(ax=ax, kind="bar", label=drug, rot=0, width=0.2, fontsize=8)
    ax.set_title(drug)
    ax.set_ylabel('Number of Mice')
    ax.set_xlabel('Timepoint')
fig.set_size_inches(20, 10, forward=True)
fig.tight_layout()
plt.show()

## Pie Plots

In [None]:
# Pie plot of the female versus male split, drawn with the pandas plotting API.
# NOTE: the count is over rows (one per mouse per timepoint), not over distinct mice.
sex_counts = study_complete.groupby("Sex")['Mouse ID'].count()
sex_counts.plot.pie(autopct="%1.1f%%", startangle=100, title=False, shadow=True, legend=False)
plt.tight_layout()
print("Sex ratio by Mouse ID")
plt.show()

In [None]:
# Pie plot of the female versus male split, drawn directly with pyplot.
# NOTE: the count is over rows (one per mouse per timepoint), not over distinct mice.
mice_per_sex = study_complete.groupby('Sex')['Mouse ID'].count()
# Wedge labels and colours are listed in the order the groupby yields the
# groups (sorted group keys).
pie_options = {
    "labels": ['female', 'male'],
    "colors": ["yellowgreen", "red"],
    "autopct": "%1.1f%%",
    "startangle": 100,
}
plt.pie(mice_per_sex, **pie_options)
plt.tight_layout()
print("Sex ratio by Mouse ID")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Which drugs show the most promise? The ones with the smallest fall in the
# number of mice between timepoint 0 and timepoint 45.
# Rows per (regimen, timepoint), pivoted so each timepoint becomes a column.
# A regimen with no rows at a timepoint gets a count of 0 instead of a KeyError.
mice_counts = study_complete.groupby(['Drug Regimen', 'Timepoint']).size().unstack(fill_value=0)
mice_counts = mice_counts.reindex(columns=[0, 45], fill_value=0)

# The fall is the starting count minus the count at timepoint 45. Ranking by
# the raw count at 45 would only be equivalent if every regimen started with
# the same number of mice.
mice_lost = mice_counts[0] - mice_counts[45]

# Smallest fall first; sorted() is stable, so ties keep their drug_list order.
mice_lost_ranking = sorted(
    ((drug, int(mice_lost[drug])) for drug in drug_list),
    key=lambda kv: kv[1],
)
# Slicing (rather than indexing [0]..[3]) tolerates fewer than four regimens.
top_regimens = ', '.join(f'{drug} ({lost} fewer mice)' for drug, lost in mice_lost_ranking[:4])
print(f'The Drug Regimen with the smallest fall in Mice between timepoint 0 and 45 are {top_regimens}')

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens.
# Calculate the IQR and quantitatively determine if there are any potential outliers.
# Only rows recorded at Timepoint 45 are used, i.e. only mice that reached the
# last timepoint (to reduce survivorship bias).
drug_shortlist = ['Capomulin','Ramicane','Zoniferol','Ceftamin']
final_tumor_size_df = study_complete[['Drug Regimen','Mouse ID','Tumor Volume (mm3)','Timepoint']]
final_tumor_size_df = final_tumor_size_df.loc[final_tumor_size_df['Timepoint']==45]
final_tumor_size_df = final_tumor_size_df.sort_values(by='Tumor Volume (mm3)')
for drug in drug_shortlist:
    # Final tumor volumes for this regimen
    drug_df = final_tumor_size_df.loc[final_tumor_size_df['Drug Regimen'] == drug, 'Tumor Volume (mm3)']
    if drug_df.empty:
        # Percentiles are undefined for an empty sample
        print(f'{drug} has no mice at Timepoint 45, so no quartiles can be calculated.')
        continue
    # Compute both quartiles once and derive everything else from them
    Q1, Q3 = np.percentile(drug_df, [25, 75])
    iqr = Q3 - Q1
    # Standard 1.5 * IQR fences
    Q1_outlier = Q1 - iqr*1.5
    Q3_outlier = Q3 + iqr*1.5
    # Report the outlying tumor volumes themselves, not their row positions
    low_outliers = drug_df[drug_df < Q1_outlier].round(2).tolist()
    high_outliers = drug_df[drug_df > Q3_outlier].round(2).tolist()
    print(f'{drug}s median is {drug_df.median():.1f}, and the quartiles are {Q1:.1f} and {Q3:.1f}. IQR is {iqr:.1f}. Values> {Q3_outlier:.1f} or <{Q1_outlier:.1f} could be outliers.')
    print(f'For {drug}, these values might be:{low_outliers} and {high_outliers}')

In [None]:
# One box plot per shortlisted regimen, each drawn on its own figure,
# showing the tumor volumes held in final_tumor_size_df for that regimen.
for drug in drug_shortlist:
    is_drug = final_tumor_size_df['Drug Regimen'] == drug
    drug_df = final_tumor_size_df.loc[is_drug, 'Tumor Volume (mm3)']
    fig1, ax1 = plt.subplots()
    ax1.boxplot(drug_df)
    ax1.set_title(drug)
    ax1.set_ylabel('Tumor Volume (mm3)')
    plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of time point versus tumor volume for the Capomulin regimen.
# The plotted value is the mean tumor volume over all Capomulin rows at each
# timepoint, not the trajectory of one individual mouse.
y_axis = study_complete[study_complete['Drug Regimen'] == 'Capomulin'].groupby('Timepoint')['Tumor Volume (mm3)'].mean()
# Take the x values from the grouped series so that each mean is plotted
# against its own timepoint, whatever order or subset of timepoints the
# Capomulin rows contain.
x_axis = y_axis.index
plt.plot(x_axis, y_axis, color='blue', label="Mean Tumor Volume (mm3)")
plt.title("Timepoint v mean tumor volume for mice treated with Capomulin")
plt.xlabel('Time Point')
plt.ylabel('Tumor Volume (mm3)')
plt.legend(loc="best")
plt.tight_layout()
plt.show()


In [None]:
# Scatter plot of mouse weight versus average tumor volume for the Capomulin regimen.
capomulin_df = study_complete[study_complete['Drug Regimen'] == 'Capomulin']
# One row per mouse: mean weight and mean tumor volume over all of its rows
df = capomulin_df.groupby('Mouse ID')[['Weight (g)', 'Tumor Volume (mm3)']].mean()
df.plot.scatter(x='Weight (g)', y='Tumor Volume (mm3)')
plt.title("Mouse weight v average tumor volume for the Capomulin regimen")
plt.xlabel('Mouse weight')
plt.ylabel('Tumor Volume (mm3)')
plt.tight_layout()
plt.show()

## Correlation and Regression

In [None]:
# Pearson correlation between per-mouse weight and per-mouse average tumor
# volume for the Capomulin regimen (both columns come from `df`).
mouse_weight, tumor_volume = df['Weight (g)'], df['Tumor Volume (mm3)']
correlation = st.pearsonr(mouse_weight, tumor_volume)
# The first element of the result is the correlation coefficient
pearson_r = correlation[0]
print(f"The Pearson correlation coefficient between both factors is {round(pearson_r,2)}")

In [None]:
# Fit a straight line: average tumor volume (y) as a function of mouse weight (x).
x_values, y_values = df['Weight (g)'], df['Tumor Volume (mm3)']
slope, intercept, rvalue, pvalue, stderr = st.linregress(x_values, y_values)
# Fitted tumor volume at each observed weight
regress_values = intercept + slope * x_values
# Human-readable equation, coefficients rounded to two decimals
line_eq = f"y = {round(slope,2)}x + {round(intercept,2)}"
print(f' The linear regression model is {line_eq}')

In [None]:
# Repeating the scatter plot, this time including the linear regression model.
df.plot(kind='scatter', x='Weight (g)', y='Tumor Volume (mm3)')
# Draw the fitted line on the same axes. regress_values holds the fitted tumor
# volume at each observed weight, computed in the regression cell above.
plt.plot(df['Weight (g)'], regress_values, "r-")
plt.title("Mouse weight v average tumor volume for the Capomulin regimen")
plt.xlabel('Mouse weight')
plt.ylabel('Tumor Volume (mm3)')
# Print the fitted equation on the chart at data coordinates (20, 36)
plt.annotate(line_eq, (20, 36), fontsize=15, color="red")
plt.tight_layout()
plt.show()

In [None]:
# Observations or insights about the data, printed one per line.
# The figures quoted in the text are hard-coded, not computed from the data above.
observations = [
    'It is possible to draw initial conclusions from this analysis. Here are three observable trends based on the data.',
    '1. There was one duplicated data point, which was removed.',
    '2. The data set is slightly (50.7%) weighted towards being Male.',
    '3. There is a positive correlation between Mouse weight v average tumor volume for the Capomulin regimen',
]
for observation in observations:
    print(observation)