## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from jupyterthemes import jtplot
jtplot.style()

# Locations of the two raw study files
mouse_metadata_path = 'data/Mouse_metadata.csv'
study_results_path = 'data/Study_results.csv'

# Load the per-mouse metadata and the recorded study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join both tables on their shared 'Mouse ID' column (pandas' default inner join)
df_mouse_merge = pd.merge(mouse_metadata, study_results, on='Mouse ID')

# Preview the first rows of the combined table
df_mouse_merge.head()

In [None]:
# Count the distinct Mouse ID values in the merged data
# (dropna=False so a missing ID would be counted as its own value).
mouse_qty = df_mouse_merge['Mouse ID'].nunique(dropna=False)
mouse_qty

In [None]:
# Find the mice that have more than one row for the same Mouse ID / Timepoint pair.
# `duplicated` flags the second and later occurrences of each pair.
is_repeat_row = df_mouse_merge.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicate_mouse = df_mouse_merge.loc[is_repeat_row, 'Mouse ID'].unique()
duplicate_mouse

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Select by the IDs detected in `duplicate_mouse` instead of a hard-coded
# literal, so this cell stays correct if the input data changes.
show_duplicates = df_mouse_merge.loc[df_mouse_merge['Mouse ID'].isin(duplicate_mouse)]

# Display every matching row (not just the first five) so all of the
# conflicting records can be inspected.
show_duplicates

In [None]:
# Build the cleaned DataFrame: drop rows that are exact duplicates, then
# exclude every record belonging to a mouse listed in `duplicate_mouse`.
clean_mouse = df_mouse_merge.drop_duplicates().reset_index(drop=True)
keep_rows = ~clean_mouse['Mouse ID'].isin(duplicate_mouse)
clean_df = clean_mouse[keep_rows]
clean_df.head()

In [None]:
# Count the distinct (non-missing) Mouse ID values left in the cleaned data.
mouse_qty = len(clean_df['Mouse ID'].dropna().unique())
mouse_qty

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation and SEM, computed one
# statistic at a time and then assembled into a single DataFrame.

# Tumor volume grouped by regimen; every statistic below reuses this grouping
tumor_by_regimen = clean_df.groupby('Drug Regimen')['Tumor Volume (mm3)']

mean_stat = tumor_by_regimen.mean()
median_stat = tumor_by_regimen.median()
var_stat = tumor_by_regimen.var()
stdv_stat = tumor_by_regimen.std()
sem_stat = tumor_by_regimen.sem()

# One column per statistic, one row per regimen
summary_df = pd.DataFrame({
    'Mean': mean_stat,
    'Median': median_stat,
    'Variance': var_stat,
    'Std. Dev.': stdv_stat,
    'SEM': sem_stat,
})
summary_df

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen using the aggregation method,
# produce the same summary statistics in a single line
agg_group = clean_df.groupby('Drug Regimen')

# Select the tumor-volume column BEFORE aggregating. Aggregating the whole
# frame first also applies 'mean', 'median', ... to the string columns
# ('Mouse ID', 'Sex'), which raises a TypeError on pandas >= 2.0.
agg_summary = agg_group['Tumor Volume (mm3)'].agg(['mean', 'median', 'var', 'std', 'sem'])

# Show every regimen; .head() would only display the first five rows.
agg_summary

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
# Count the rows per regimen and keep the 'Mouse ID' count as the measurement total.
regimen_df = clean_df.groupby(['Drug Regimen']).count().reset_index()
regimen_data = (
    regimen_df[['Drug Regimen', 'Mouse ID']]
    .rename(columns={'Mouse ID': 'Count'})
    .set_index('Drug Regimen')
)

# legend=False suppresses the single-entry legend at creation time, so there
# is no need to fetch and remove it from the axes afterwards.
regimen_data.plot(kind='bar', color='royalblue', figsize=(10, 5), legend=False)

# Explicitly switch the grid off. `plt.grid(b=None)` merely TOGGLED the grid
# on older Matplotlib, and the `b` keyword no longer exists in current
# releases (it was renamed to `visible`).
plt.grid(False)
plt.ylabel('Number of Measurements')
plt.title('Drug Regimen Measurements')
plt.show()

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
# Number of measurements per regimen. The bar labels and the bar heights are
# both taken from this one Series so they can never get out of step
# (previously the labels came from summary_df and the heights from a
# separate groupby).
measurements_per_regimen = clean_df.groupby(['Drug Regimen'])['Mouse ID'].count()
regimen_list = measurements_per_regimen.index.tolist()
x_axis = regimen_list
regimen_count = measurements_per_regimen.tolist()

fig = plt.figure(figsize=(10, 5))  # format figure size

# Plot bar chart
plt.bar(x_axis, regimen_count, color='royalblue', alpha=1, width=.5, align='center')

# Format title and axes. The tick rotation is applied after plotting so it
# acts on the category ticks created by plt.bar.
plt.title('Drug Regimen Measurements')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Measurements')
plt.xticks(rotation=90)

# Explicitly switch the grid off. `plt.grid(b=None)` merely TOGGLED the grid
# on older Matplotlib, and the `b` keyword no longer exists in current
# releases (it was renamed to `visible`).
plt.grid(False)
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Number of unique mice of each sex, as a one-column DataFrame
gender_count = clean_df.groupby('Sex')['Mouse ID'].nunique().to_frame()

# Give the count column a descriptive name
gender_count = gender_count.rename(columns={'Mouse ID': 'Total Count'})

# Share of the total for each sex (stored as a fraction of 1)
total_mice = gender_count['Total Count'].sum()
gender_count['Percentage Split'] = gender_count['Total Count'] / total_mice

# Plot; the first wedge is offset from the centre by `explode`
explode = (0.1, 0)
plot = gender_count.plot.pie(
    y='Total Count',
    title='Male vs. Female Mouse Population',
    figsize=(6, 6),
    colors=['pink', 'royalblue'],
    startangle=140,
    explode=explode,
    shadow=True,
    autopct="%1.1f%%",
)
plot.set_ylabel("")
plot.get_legend().remove()  # turn legend off
plt.tight_layout()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Count UNIQUE mice per sex. Counting rows would give the distribution of
# measurements rather than of mice, and would disagree with the pandas pie
# chart of the same population.
mice_per_sex = clean_df.groupby(['Sex'])['Mouse ID'].nunique()
gender_count = mice_per_sex.tolist()

# Labels for the sections of the pie chart, derived from the group keys so
# they always line up with the wedge sizes ('Female' -> 'Females', ...).
labels = [f'{sex}s' for sex in mice_per_sex.index]
plt.title('Male vs Female Mouse Population')

sizes = gender_count            # The values of each section of the pie chart
colors = ['pink', 'royalblue']  # Colors of each section of the pie chart
explode = (0.1, 0)              # Offset the first wedge; assumes exactly two groups

# Creates the pie chart based upon the values above
# autopct prints each wedge's percentage of the total
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)
plt.tight_layout()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Rows of the cleaned data for each regimen of interest
Capomulin_df = clean_df.loc[clean_df['Drug Regimen'] == 'Capomulin']
Ramicane_df = clean_df.loc[clean_df['Drug Regimen'] == 'Ramicane']
Infubinol_df = clean_df.loc[clean_df['Drug Regimen'] == 'Infubinol']
Ceftamin_df = clean_df.loc[clean_df['Drug Regimen'] == 'Ceftamin']

# Greatest timepoint recorded for every mouse, per regimen.
# Each series is built from its OWN regimen frame (previously all four were
# computed from Capomulin_df).
capo_max = Capomulin_df.groupby('Mouse ID')['Timepoint'].max()
rami_max = Ramicane_df.groupby('Mouse ID')['Timepoint'].max()
infu_max = Infubinol_df.groupby('Mouse ID')['Timepoint'].max()
ceft_max = Ceftamin_df.groupby('Mouse ID')['Timepoint'].max()

# Left-merge each (Mouse ID, max Timepoint) pair with clean_df to pull in the
# tumor volume recorded at that last timepoint. Every merge uses its own
# regimen's series (previously all four merged capo_max).
merge_keys = ['Mouse ID', 'Timepoint']
capo_merge = pd.merge(capo_max, clean_df, on=merge_keys, how='left')
rami_merge = pd.merge(rami_max, clean_df, on=merge_keys, how='left')
infu_merge = pd.merge(infu_max, clean_df, on=merge_keys, how='left')
ceft_merge = pd.merge(ceft_max, clean_df, on=merge_keys, how='left')

# Only the last expression of a cell is displayed, so preview once here.
ceft_merge.head()


In [None]:
# Put treatments into a list for a for loop (and later for plot labels)
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin. Calculate the quartiles and IQR and quantitatively
# determine if there are any potential outliers across all four treatment regimens.

list_key_drugs = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# NOTE(review): the original filtered `merge_data`, a name that is never
# defined in this notebook (NameError). The cleaned frame `clean_df` is
# assumed to be the intended source - confirm.
filter_drug = clean_df[clean_df['Drug Regimen'].isin(list_key_drugs)]

# Group the filtered rows by regimen, keeping the sorted regimen order
tumor_vol = filter_drug.sort_values(['Drug Regimen']).groupby(['Drug Regimen'], sort=False)

# First rows of every regimen group
tumor_vol.head()

In [None]:
# Capomulin
# Quartiles, IQR and 1.5 * IQR outlier bounds of the Capomulin tumor volumes.
capo_df = clean_df[clean_df["Drug Regimen"] == 'Capomulin']

# Tumor volume readings and their 25th / 50th / 75th percentiles
capo_quar = capo_df['Tumor Volume (mm3)']
quartiles = capo_quar.quantile(q=[0.25, 0.5, 0.75])
lowerquart, upperquart = quartiles.loc[0.25], quartiles.loc[0.75]
iqr = upperquart - lowerquart

print(
    f'The lower quartile of the tumor volume is: {lowerquart}',
    f'The upper quartile of the tumor volume is: {upperquart}',
    f'The interquartile range of the tumor volume is: {iqr}',
    f'The the median of tumor the volume is: {quartiles[.5]} ',
    sep='\n',
)

# Readings further than 1.5 * IQR beyond either quartile are flagged as
# potential outliers
outlier_margin = 1.5 * iqr
high = upperquart + outlier_margin
low = lowerquart - outlier_margin

print(f'Values below {low} could be outliers.')
print(f'Values above {high} could be outliers.')

In [None]:
# Ceftamin
# Quartiles, IQR and 1.5 * IQR outlier bounds of the Ceftamin tumor volumes.
ceft_df = clean_df[clean_df["Drug Regimen"] == 'Ceftamin']

# Tumor volume readings and their 25th / 50th / 75th percentiles
ceft_quar = ceft_df['Tumor Volume (mm3)']
quartiles = ceft_quar.quantile(q=[0.25, 0.5, 0.75])
lowerquart, upperquart = quartiles.loc[0.25], quartiles.loc[0.75]
iqr = upperquart - lowerquart

print(
    f'The lower quartile of the tumor volume is: {lowerquart}',
    f'The upper quartile of the tumor volume is: {upperquart}',
    f'The interquartile range of the tumor volume is: {iqr}',
    f'The the median of tumor the volume is: {quartiles[.5]} ',
    sep='\n',
)

# Readings further than 1.5 * IQR beyond either quartile are flagged as
# potential outliers
outlier_margin = 1.5 * iqr
high = upperquart + outlier_margin
low = lowerquart - outlier_margin

print(f'Values below {low} could be outliers.')
print(f'Values above {high} could be outliers.')

In [None]:
# Ramicane
# Quartiles, IQR and 1.5 * IQR outlier bounds of the Ramicane tumor volumes.
rami_df = clean_df[clean_df["Drug Regimen"] == 'Ramicane']

# Tumor volume readings and their 25th / 50th / 75th percentiles
rami_quar = rami_df['Tumor Volume (mm3)']
quartiles = rami_quar.quantile(q=[0.25, 0.5, 0.75])
lowerquart, upperquart = quartiles.loc[0.25], quartiles.loc[0.75]
iqr = upperquart - lowerquart

print(
    f'The lower quartile of the tumor volume is: {lowerquart}',
    f'The upper quartile of the tumor volume is: {upperquart}',
    f'The interquartile range of the tumor volume is: {iqr}',
    f'The the median of tumor the volume is: {quartiles[.5]} ',
    sep='\n',
)

# Readings further than 1.5 * IQR beyond either quartile are flagged as
# potential outliers
outlier_margin = 1.5 * iqr
high = upperquart + outlier_margin
low = lowerquart - outlier_margin

print(f'Values below {low} could be outliers.')
print(f'Values above {high} could be outliers.')

In [None]:
# Infubinol. Calculate the IQR and quantitatively determine if there are any potential outliers.
infu_df = clean_df.loc[clean_df["Drug Regimen"] == 'Infubinol',:]

# Infubinol IQR
# The quartiles are taken from the Infubinol rows selected above; the
# original cell recomputed them from the Ramicane frame (rami_df) and
# therefore reported Ramicane's statistics under the Infubinol heading.
infu_quar = infu_df['Tumor Volume (mm3)']
quartiles = infu_quar.quantile([.25,.5,.75])
lowerquart = quartiles[.25]
upperquart = quartiles[.75]
iqr = upperquart-lowerquart

print(f'The lower quartile of the tumor volume is: {lowerquart}')
print(f'The upper quartile of the tumor volume is: {upperquart}')
print(f'The interquartile range of the tumor volume is: {iqr}')
print(f'The the median of tumor the volume is: {quartiles[.5]} ')

# Determine outliers using upper and lower bounds:
# values further than 1.5 * IQR beyond either quartile are flagged
high = upperquart + (1.5*iqr)
low = lowerquart - (1.5*iqr)

print(f'Values below {low} could be outliers.')
print(f'Values above {high} could be outliers.')

infu_df

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
regimen_labels = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Row holding each mouse's last recorded timepoint, i.e. its final tumor volume
final_rows = clean_df.loc[clean_df.groupby('Mouse ID')['Timepoint'].idxmax()]

# One plain list of final tumor volumes per regimen. The original passed the
# complete per-regimen DataFrames (string columns included, every timepoint),
# which is neither numeric box-plot input nor the "final" volume that the
# axis label promises.
plot_regimen_data = [
    final_rows.loc[final_rows['Drug Regimen'] == regimen, 'Tumor Volume (mm3)'].tolist()
    for regimen in regimen_labels
]

fig1, ax = plt.subplots(figsize=(15, 10))
ax.set_title('Tumor Volume of Selected Drug Regimen', fontsize=25)
ax.set_ylabel('Final Tumor Volume (mm3)', fontsize=14)
ax.set_xlabel('Drug Regimen', fontsize=14)
ax.boxplot(plot_regimen_data, widths=0.4, patch_artist=True)

# Label the boxes through the axis instead of boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate in favour of `tick_labels=`;
# this form works on both old and new versions.
ax.set_xticklabels(regimen_labels)

plt.ylim(10, 80)

plt.savefig("../Images/box_plot.png", bbox_inches = "tight")

plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen


In [None]:
# Distinct regimens present in filter_drug, in order of first appearance.
# Series.unique() gives a stable order; list(set(...)) yields the names in
# an arbitrary order that can change between runs.
drug_list = filter_drug['Drug Regimen'].unique().tolist()
print(drug_list)

# Create empty list to fill with tumor vol data (for plotting)
empty_tumor_list = []
##### ***** END OF SECTION ***** #####