## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from jupyterthemes import jtplot
jtplot.style()

# Locations of the two input CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata and the study results tables
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables on their shared 'Mouse ID' column (default inner join)
df_mouse_merge = pd.merge(mouse_metadata, study_results, on='Mouse ID')

# Preview the combined table
df_mouse_merge


In [None]:
# Number of distinct mouse IDs in the merged (uncleaned) data.
mouse_qty = len(df_mouse_merge['Mouse ID'].unique())
mouse_qty

In [None]:
# Keep only the rows whose (Mouse ID, Timepoint) pair repeats an earlier row,
# then collect the distinct mouse IDs those rows belong to.
duplicate_mouse = df_mouse_merge[
    df_mouse_merge.duplicated(['Mouse ID', 'Timepoint'])
]['Mouse ID'].unique()
duplicate_mouse

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Filter on the IDs found in `duplicate_mouse` instead of a hard-coded ID so
# this cell stays correct if the input data (and its duplicates) change.
show_duplicates = df_mouse_merge.loc[df_mouse_merge['Mouse ID'].isin(duplicate_mouse)]
show_duplicates

In [None]:
# Build the cleaned DataFrame: first drop fully identical rows and renumber
# the index, then exclude every row belonging to a mouse listed in
# `duplicate_mouse`.
clean_mouse = df_mouse_merge.drop_duplicates().reset_index(drop=True)
clean_df = clean_mouse.loc[~clean_mouse['Mouse ID'].isin(duplicate_mouse)]
clean_df

In [None]:
# Number of distinct (non-null) mouse IDs left after cleaning.
mouse_qty = len(clean_df['Mouse ID'].dropna().unique())
mouse_qty

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen: mean, median,
# variance, standard deviation and standard error of the mean (SEM).
# Each statistic is computed from the same grouped column and the resulting
# series are then placed side by side in a single summary DataFrame.

tumor_by_regimen = clean_df.groupby('Drug Regimen')['Tumor Volume (mm3)']

mean_stat = tumor_by_regimen.mean()
median_stat = tumor_by_regimen.median()
var_stat = tumor_by_regimen.var()
stdv_stat = tumor_by_regimen.std()
sem_stat = tumor_by_regimen.sem()

summary_df = pd.concat(
    [mean_stat, median_stat, var_stat, stdv_stat, sem_stat],
    axis=1,
    keys=['Mean', 'Median', 'Variance', 'Std. Dev.', 'SEM'],
)
summary_df

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line.
# The tumor volume column is selected BEFORE aggregating: applying 'mean' etc.
# to every column would also hit the string columns, which raises a TypeError
# on pandas >= 2.0 and is wasted work on older versions.
agg_group = clean_df.groupby('Drug Regimen')
agg_summary = agg_group['Tumor Volume (mm3)'].agg(['mean', 'median', 'var', 'std', 'sem'])
agg_summary

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.

# Count the rows per regimen; the 'Mouse ID' count is used as the number of
# measurements and relabelled as 'Count'.
regimen_df = clean_df.groupby(['Drug Regimen']).count().reset_index()
regimen_data = (
    regimen_df[['Drug Regimen', 'Mouse ID']]
    .rename(columns={"Mouse ID": "Count"})
    .set_index('Drug Regimen')
)

regimen_data.plot(kind='bar', figsize=(10, 5))

# Hide grid lines. Pass False positionally: the old `b=` keyword was removed
# from Matplotlib, and a value of None toggles the grid instead of hiding it.
plt.grid(False)

plt.title('Drug Regimen Measurements')
plt.show()

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.

# Count measurements per regimen once, and take both the x labels and the bar
# heights from that same series so they cannot fall out of alignment.
measurement_counts = clean_df.groupby(['Drug Regimen'])['Mouse ID'].count()

# Drug regimen names for the x axis
regimen_list = measurement_counts.index.tolist()
x_axis = regimen_list

# Number of measurements per regimen (bar heights)
regimen_count = measurement_counts.tolist()

# Figure size
fig = plt.figure(figsize=(10, 5))

# Title and axis labels
plt.title('Drug Regimen Measurements')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Measurements')

# Hide grid lines. Pass False positionally: the old `b=` keyword was removed
# from Matplotlib, and a value of None toggles the grid instead of hiding it.
plt.grid(False)

# Plot the bar chart
plt.bar(x_axis, regimen_count, color='c', alpha=1, align='center')
plt.tight_layout()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# Count the distinct mouse IDs for each value of 'Sex' and store the result
# in a one-column DataFrame named 'Total Count'.
gender_count = (
    clean_df.groupby('Sex')['Mouse ID']
    .nunique()
    .to_frame(name='Total Count')
)

# Share of the total for each sex (stored as a fraction of 1)
gender_count['Percentage Split'] = (
    gender_count['Total Count'] / gender_count['Total Count'].sum()
)

# Pie chart of the counts; the first wedge is pulled slightly out of the pie
colors = ['pink', 'blue']
explode = (0.1, 0)
plot = gender_count.plot.pie(
    y='Total Count',
    figsize=(5, 5),
    colors=colors,
    startangle=140,
    explode=explode,
    shadow=True,
    autopct="%1.1f%%",
)
plt.tight_layout()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Number of distinct mice per sex. `nunique` (not `count`) is used so that the
# chart shows the mouse population rather than the number of measurement
# rows, matching the pandas version of this chart.
gender_count = (clean_df.groupby(['Sex'])['Mouse ID'].nunique()).tolist()

# Labels for the sections of the pie chart.
# NOTE(review): relies on groupby returning the groups in sorted order, with
# the female group first -- confirm against the values in the 'Sex' column.
labels = ['Females', 'Males']
plt.title('Male vs Female Mouse Population')

# The values of each section of the pie chart
sizes = gender_count

# Colors of each section of the pie chart
colors = ['#ff9999', '#66b3ff']

# Pull the first section slightly out of the pie; leave the second in place
explode = (0.1, 0)

# Create the pie chart from the values above;
# autopct prints each section's percentage on the chart
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)
plt.tight_layout()

## Quartiles, Outliers and Boxplots

In [None]:
# Preview the first five rows of the cleaned data
clean_df.head(n=5)

In [None]:
# Split the cleaned data into one DataFrame per regimen of interest.
# The subsets are taken from `clean_df`: the previously referenced
# `max_timepoint` is not defined anywhere in this notebook, and the following
# cells compute each mouse's last timepoint from these frames themselves.
capo_df = clean_df.loc[clean_df['Drug Regimen'] == 'Capomulin', :]
rami_df = clean_df.loc[clean_df['Drug Regimen'] == 'Ramicane', :]
infu_df = clean_df.loc[clean_df['Drug Regimen'] == 'Infubinol', :]
ceft_df = clean_df.loc[clean_df['Drug Regimen'] == 'Ceftamin', :]

In [None]:
# Final tumor volume of each mouse for the four regimens of interest
# (Capomulin, Ramicane, Infubinol, Ceftamin) -- this cell handles Capomulin.

# Greatest timepoint recorded for each Capomulin mouse
capo_max = capo_df.groupby('Mouse ID')['Timepoint'].max()
capo_vol = capo_max.to_frame()

# Left-join back onto the cleaned data on (Mouse ID, Timepoint) to pick up
# the row, and therefore the tumor volume, at that last timepoint
capo_merge = capo_vol.merge(clean_df, how='left', on=['Mouse ID', 'Timepoint'])
capo_merge.head()

In [None]:
# Ramicane: greatest timepoint per mouse, left-joined back onto clean_df on
# (Mouse ID, Timepoint) to pick up the tumor volume at that timepoint.
rami_max = rami_df.groupby('Mouse ID')['Timepoint'].max()
rami_vol = rami_max.to_frame()
rami_merge = rami_vol.merge(clean_df, how='left', on=['Mouse ID', 'Timepoint'])

In [None]:
# Infubinol. Merge df with original df (clean_df) to find tumor volume.
# Greatest timepoint recorded for each Infubinol mouse
infu_max = infu_df.groupby('Mouse ID')['Timepoint'].max()
infu_vol = pd.DataFrame(infu_max)
# Merge the Infubinol frame (`infu_vol`); merging `rami_vol` here would
# silently produce a copy of the Ramicane result.
infu_merge = pd.merge(infu_vol, clean_df, on=['Mouse ID', 'Timepoint'], how='left')

In [None]:
# Ceftamin: greatest timepoint per mouse, left-joined back onto clean_df on
# (Mouse ID, Timepoint) to pick up the tumor volume at that timepoint.
ceft_max = ceft_df.groupby('Mouse ID')['Timepoint'].max()
ceft_vol = ceft_max.to_frame()
ceft_merge = ceft_vol.merge(clean_df, how='left', on=['Mouse ID', 'Timepoint'])

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
# (not implemented yet)

# Create empty list to fill with tumor vol data (for plotting)
# (not implemented yet)

# Calculate the IQR and quantitatively determine if there are any potential outliers.
# Capomulin: 25th, 50th and 75th percentiles of the numeric columns.
# `numeric_only=True` is required because capo_merge also holds string
# columns, on which DataFrame.quantile raises a TypeError in pandas >= 2.0.
quartile = capo_merge.quantile([0.25, 0.5, 0.75], numeric_only=True)
quartile

# Remaining steps (not implemented yet):
#   - Locate the rows which contain mice on each drug and get the tumor volumes
#   - Add subset
#   - Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
