## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Read the mouse data and the study results
mouse_metadata_df = pd.read_csv(mouse_metadata_path)
study_results_df = pd.read_csv(study_results_path)
#mouse_metadata
mouse_metadata_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [2]:
#study_results
study_results_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [3]:
# Combine the data into a single dataset
combined_data_df=pd.merge(mouse_metadata_df,study_results_df, on = "Mouse ID")
combined_data_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [4]:
# Checking the number of mice.
combined_data_df["Mouse ID"].count()

1893

In [5]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
grouped_combined_data_df=combined_data_df.groupby("Mouse ID")
mouse_timepoint_comined_df = grouped_combined_data_df["Timepoint"].value_counts()
mouse_timepoint_comined_df.head()

Mouse ID  Timepoint
a203      0            1
          5            1
          10           1
          15           1
          20           1
Name: Timepoint, dtype: int64

In [6]:
# Optional: Get all the data for the duplicate mouse ID. 
combined_data_df["Mouse ID"].value_counts()



g989    13
w151    10
k403    10
a251    10
s337    10
        ..
x336     1
h428     1
o848     1
x226     1
b447     1
Name: Mouse ID, Length: 249, dtype: int64

In [7]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
combined_data_df.groupby(["Mouse ID"]).dropna['g989']

AttributeError: 'DataFrameGroupBy' object has no attribute 'dropna'

In [None]:
# Checking the number of mice in the clean DataFrame.
combined_data_df['Mouse ID'].count()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

regimen_grouped = combined_data_df.groupby(["Drug Regimen"])
regimen_grouped


#create computation for the mean of each regimen
regimen_mean = regimen_grouped["Tumor Volume (mm3)"].mean().round(3)

#Create computation for the median of each regimen
regimen_median = regimen_grouped["Tumor Volume (mm3)"].median().round(3)

#Create computation for the variance of each regimen
regimen_variance = regimen_grouped["Tumor Volume (mm3)"].var().round(3)

#create computation for the standard deviation of each regimen
regimen_std = regimen_grouped["Tumor Volume (mm3)"].std().round(3)

#create computation for the SEM
regimen_sem = regimen_grouped["Tumor Volume (mm3)"].sem().round(3)


# Assemble the resulting series into a single summary dataframe.
summary_regimen_grouped_df=pd.DataFrame({'Mean':regimen_mean,'Median':regimen_median,'Variance':regimen_variance,
                                        'Standard deviation':regimen_std,'SEM':regimen_sem})
summary_regimen_grouped_df


In [None]:
# Using the aggregation method, produce the same summary statistics in a single line
drug_group_stats = regimen_grouped.agg(
        
        Tumor_Vol_Mean=('Tumor Volume (mm3)', np.mean),
        
        Tumor_Vol_Median=('Tumor Volume (mm3)', np.median),
        
        Tumor_Vol_Var=('Tumor Volume (mm3)', np.var),
        
        Tumor_Vol_Stdev=('Tumor Volume (mm3)', np.std),
        
        Tumor_Vol_SEM=('Tumor Volume (mm3)', st.sem)).round(3)
drug_group_stats

In [None]:
## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.
grouped_regimen_df=combined_data_df.groupby("Drug Regimen")["Mouse ID"].count()
#grouped_regimen_df=pd.DataFrame("grouped_regimen_df")
grouped_regimen_df

In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.

grouped_regimen_df.plot(kind="bar", figsize=(12,6))

#set chart title
plt.title("")
plt.xlabel("Drug Regimen")
plt.ylabel("count")

#show chart and set layout
plt.show()
plt.tight_layout()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
mouse_gender=combined_data_df.groupby('Sex')["Mouse ID"].count()
mouse_gender


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
labels = 'Male', 'Fmale'
sizes = [958, 935]
explode = (0.09, 0)  # 
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # this ensures that pie is drawn as a circle.

plt.show()

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
treatment_regimes = combined_data_df[combined_data_df["Drug Regimen"].isin(["Capomulin", "Ramicane", "Infubinol", "Ceftamin"])]

treatment_regimes = treatment_regimes.sort_values(["Timepoint"], ascending=True)
treatment_regimes

treatment_regimes_data = treatment_regimes[["Drug Regimen", "Mouse ID", "Timepoint", "Tumor Volume (mm3)"]]

treatment_regimes_data

# Start by getting the last (greatest) timepoint for each mouse
treatment_regimens_sort = combined_data_df.groupby(['Drug Regimen', 'Mouse ID']).last()['Tumor Volume (mm3)']
treatment_regimens_sort_df=pd.DataFrame(treatment_regimens_sort)
treatment_regimens_sort_df

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
merge_df=pd.merge(combined_data_df,treatment_regimens_sort_df,on = "Tumor Volume (mm3)")
merge_df.head()


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,0,45.0,0
2,k403,Ramicane,Male,21,16,0,45.0,0
3,k403,Ramicane,Male,21,16,0,45.0,0
4,k403,Ramicane,Male,21,16,0,45.0,0


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)
tumor_vol_data []

# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
