In [37]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np


#study data files
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "Resources/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)
print(mouse_metadata.columns)
print(study_results.columns)




#combine the data into a single DataFrame
data = pd.merge(study_results,mouse_metadata,how="left",on = ["Mouse ID"]) 


#Display the data table for preview
data.head()

Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)'], dtype='object')
Index(['Mouse ID', 'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites'], dtype='object')


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [38]:
#checking the number of mice
number_of_mice = mouse_metadata['Mouse ID'].nunique()
number_of_mice


249

In [39]:
# Our data should be uniquely identified by Mouse ID and Timepoint
# Get the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
unique_duplicate_ids = duplicate_mice['Mouse ID'].unique()
unique_duplicate_ids

array(['g989'], dtype=object)

In [40]:
# Our data should be uniquely identified by Mouse ID and Timepoint
# Get the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
duplicate_mice = data[data.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)]
duplicate_mice


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
107,g989,0,45.0,0,Propriva,Female,21,26
137,g989,0,45.0,0,Propriva,Female,21,26
329,g989,5,48.786801,0,Propriva,Female,21,26
360,g989,5,47.570392,0,Propriva,Female,21,26
620,g989,10,51.745156,0,Propriva,Female,21,26
681,g989,10,49.880528,0,Propriva,Female,21,26
815,g989,15,51.325852,1,Propriva,Female,21,26
869,g989,15,53.44202,0,Propriva,Female,21,26
950,g989,20,55.326122,1,Propriva,Female,21,26
1111,g989,20,54.65765,1,Propriva,Female,21,26


In [41]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = data.drop_duplicates(subset=['Mouse ID'], keep='first')
clean_df


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
245,t565,0,45.0,0,Capomulin,Female,20,17
246,i557,0,45.0,0,Capomulin,Female,1,24
247,m957,0,45.0,0,Capomulin,Female,3,19
248,f966,0,45.0,0,Capomulin,Male,16,17


In [44]:
# Checking the number of mice in the clean DataFrame.
clean_dataframe = clean_df['Mouse ID'].nunique()
clean_dataframe

249

<H1>Summary Statistics</H1>

In [53]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
data = {
    'Regimen': ['Capomulin', 'ceftamin','Infubinol','Ketapril','Naftisol','Placebo','Propriva','Ramicane','Stelasyn','Zoniferol'],
    'Tumor Volume': [45.0, 46.0,47.0,49.0, 30.0, 32.0, 40.0, 41.0, 35.0, 36.0]
}
df = pd.DataFrame(data)

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
# mean, median, variance, standard deviation, and SEM of the tumor volume.
# Assemble the resulting series into a single summary DataFrame.
summary_stats = df.groupby('Regimen')['Tumor Volume'].agg(
    mean='mean',
    median='median',
    variance='var',
    std_dev='std',
    SEM=lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
).reset_index()

summary_df = pd.DataFrame(summary_stats)
summary_df

Unnamed: 0,Regimen,mean,median,variance,std_dev,SEM
0,Capomulin,45.0,45.0,,,
1,Infubinol,47.0,47.0,,,
2,Ketapril,49.0,49.0,,,
3,Naftisol,30.0,30.0,,,
4,Placebo,32.0,32.0,,,
5,Propriva,40.0,40.0,,,
6,Ramicane,41.0,41.0,,,
7,Stelasyn,35.0,35.0,,,
8,Zoniferol,36.0,36.0,,,
9,ceftamin,46.0,46.0,,,


In [None]:
# A more advanced method to generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen (only one method is required in the solution)

# Using the aggregation method, produce the same summary statistics in a single line
