## Observations and Insights

## Dependencies and starter code

In [105]:
%matplotlib notebook

In [106]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
import os

# Study data files
file_to_load1 = os.path.join("data", "Mouse_metadata.csv")
file_to_load2 = os.path.join("data", "Study_results.csv")

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(file_to_load1)
study_results = pd.read_csv(file_to_load2)

# Combine the data into a single dataset
mouseStudyResults = pd.merge(mouse_metadata, study_results, how='outer', on="Mouse ID")
mouseStudyResults.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


## Summary statistics

In [107]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM
#   of the tumor volume for each regimen

groupedDrugs = mouseStudyResults.groupby('Drug Regimen')['Tumor Volume (mm3)']

drugs = pd.DataFrame({
    "Mean Tumor Volume" : groupedDrugs.mean()
})

drugs["Median Tumor Volume"] = mouseStudyResults.groupby('Drug Regimen')['Tumor Volume (mm3)'].median()
drugs["Tumor Volume Variance"] = mouseStudyResults.groupby('Drug Regimen')['Tumor Volume (mm3)'].var()
drugs["Tumor Size Standard Deviation"] = mouseStudyResults.groupby('Drug Regimen')['Tumor Volume (mm3)'].std()

instancesDrug = mouseStudyResults.groupby("Drug Regimen")["Drug Regimen"].count()

drugs["Standard Error of the Mean"] = drugs["Tumor Volume Variance"]/instancesDrug

drugs

Unnamed: 0_level_0,Mean Tumor Volume,Median Tumor Volume,Tumor Volume Variance,Tumor Size Standard Deviation,Standard Error of the Mean
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.108469
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.220731
Infubinol,52.884795,51.820584,43.128684,6.567243,0.242296
Ketapril,55.235638,53.698743,68.553577,8.279709,0.364647
Naftisol,54.331565,52.509285,66.173479,8.134708,0.355771
Placebo,54.033581,52.288934,61.168083,7.821003,0.337945
Propriva,52.322552,50.854632,42.35107,6.50777,0.26305
Ramicane,40.216745,40.673236,23.486704,4.846308,0.103012
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.328456
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.266667


## Bar plots

In [110]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas
#eachDrug = drugs['Drugs']
x_axis = np.arange(len(drugs))

drugs.plot(kind="bar", x=drugs["Drug Regimen"], y=instancesDrug)

AttributeError: 'numpy.ndarray' object has no attribute 'plot'

In [109]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot
drugs = mouseStudyResults['Drug Regimen'].unique()
dataPtsPerDrug = instancesDrug
x_axis = np.arange(len(drugs)) 

plt.bar(x_axis, dataPtsPerDrug, color="b", align="center")

tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, drugs, rotation=45)
plt.title("Amount of Data Collected per Drug")
plt.ylabel("Sample Size")

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Sample Size')

## Pie plots

In [5]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [6]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, outliers and boxplots

In [7]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [8]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [9]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [10]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [11]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen