## Observations and Insights 

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame (metadata first, then results)
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on the shared "Mouse ID" column
mouse_study_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Display the data table for preview
mouse_study_df


EmptyDataError: No columns to parse from file

In [None]:
# Checking the number of mice.
# A mouse can appear in many rows (the duplicate check below keys on
# "Mouse ID" + "Timepoint"), so count distinct IDs instead of listing
# how many rows each ID has.
mouse_study_df["Mouse ID"].nunique()


In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# keep=False flags every occurrence of a repeated pair, not only the later ones.
duplicate_key_columns = ["Mouse ID", "Timepoint"]
is_repeated_pair = mouse_study_df.duplicated(subset=duplicate_key_columns, keep=False)
mouse_study_df.loc[is_repeated_pair, duplicate_key_columns]

In [None]:
# dupe2_df= dupe_df.drop_duplicates()
# dupe2_df

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Show every column of the rows whose (Mouse ID, Timepoint) pair is repeated.
repeated_rows_mask = mouse_study_df.duplicated(
    subset=["Mouse ID", "Timepoint"], keep=False
)
mouse_study_df[repeated_rows_mask]

In [None]:
# Create a clean DataFrame: for each repeated (Mouse ID, Timepoint) pair only
# the first row is kept; the original index labels are left as they were.
# NOTE(review): this removes the repeated rows, not the whole mouse -- confirm
# that is the intended meaning of "dropping the duplicate mouse by its ID".
clean_df = mouse_study_df.drop_duplicates(subset=["Mouse ID", "Timepoint"])
clean_df

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct IDs; value_counts() would instead list rows per mouse.
clean_df["Mouse ID"].nunique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# `.loc` accepts one row indexer and one column indexer, so the two column
# names have to be passed together as a single list.
tumor_df = clean_df.loc[:, ["Drug Regimen", "Tumor Volume (mm3)"]]

# This method is the most straightforward, creating multiple series and putting them all together at the end.
# Group first so that every statistic is computed per regimen.
tumor_by_regimen = tumor_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]
clean_mean = tumor_by_regimen.mean()
clean_median = tumor_by_regimen.median()
clean_variance = tumor_by_regimen.var()
clean_stdev = tumor_by_regimen.std()
clean_sem = tumor_by_regimen.sem()

# Each series is indexed by "Drug Regimen", so they align row-by-row.
clean_summary_df = pd.DataFrame(
    {
        "Mean": clean_mean,
        "Median": clean_median,
        "Variance": clean_variance,
        "Standard Deviation": clean_stdev,
        "SEM": clean_sem,
    }
)

clean_summary_df


In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function.
# Restrict to the tumor-volume column BEFORE aggregating: applying
# mean/var/std/sem to every column also hits text columns such as
# "Mouse ID" and "Sex", which recent pandas versions reject instead of
# silently dropping.
clean_data_group = clean_df.groupby("Drug Regimen")[["Tumor Volume (mm3)"]].agg(
    ["mean", "median", "var", "std", "sem"]
)
# The result has two-level columns (column name, statistic); take the
# statistics for the tumor volume.
clean_data_group_tumor = clean_data_group["Tumor Volume (mm3)"]
clean_data_group_tumor

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.
# The count is the number of non-null "Mouse ID" rows per regimen.
mice_drug_df = clean_df.groupby("Drug Regimen")
mice_count = mice_drug_df["Mouse ID"].count()
total_mice_drug_df = pd.DataFrame({"Number of Mice": mice_count})

# The regimen names are the DataFrame index, so they become the x ticks.
drug_mice_chart = total_mice_drug_df.plot(
    kind="bar",
    y="Number of Mice",
    legend=False,
    title="Total Number of Mice per Treatment",
)
drug_mice_chart.set_xlabel("Drug Regimen")
drug_mice_chart.set_ylabel("Total Number of Mice")

# Adjust the layout before showing: once plt.show() has displayed the figure,
# a later tight_layout() no longer affects it.
plt.tight_layout()
plt.show()

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
# Derive counts and labels from the data instead of hard-coding them, so the
# chart cannot drift out of sync with clean_df. groupby sorts the regimen
# names, so bars and labels stay paired.
regimen_counts = clean_df.groupby("Drug Regimen")["Mouse ID"].count()
mice_count = regimen_counts.tolist()
x_axis = np.arange(len(mice_count))
plt.bar(x_axis, mice_count, color='r', alpha=0.5, align="center")
tick_locations = list(x_axis)
plt.xticks(tick_locations, regimen_counts.index, rotation='vertical')
plt.title("Total Number of Mice per Treatment")
plt.xlabel("Drug Regimen")
plt.ylabel("Total Number of Mice")

# Make room for the vertical tick labels, then render the figure.
plt.tight_layout()
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# Row counts per sex in the cleaned data.
male_mice = clean_df.loc[clean_df["Sex"] == "Male"]
male_count = len(male_mice)
female_mice = clean_df.loc[clean_df["Sex"] == "Female"]
female_count = len(female_mice)
colors = ["green", "purple"]

# gender_df must exist before it is plotted. Use one ROW per sex (a single
# "Sex" column) so both values become wedges of the same pie; with one
# column per sex, subplots=True would draw two separate one-wedge pies.
gender_df = pd.DataFrame(
    {"Sex": [male_count, female_count]},
    index=["Male", "Female"],
)
pie_plot = gender_df.plot.pie(subplots=True, figsize=(5, 5), colors=colors)


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Split the cleaned rows by sex and count each subset.
is_male = clean_df["Sex"] == "Male"
is_female = clean_df["Sex"] == "Female"
male_mice, female_mice = clean_df.loc[is_male], clean_df.loc[is_female]
male_count, female_count = len(male_mice), len(female_mice)

# Wedge sizes, labels and colours are listed in the same order.
Labels = ["Male", "Female"]
Gender = [male_count, female_count]
colors = ["green", "purple"]

plt.pie(
    Gender,
    labels=Labels,
    colors=colors,
    autopct="%1.1f%%",
    shadow=True,
    startangle=90,
)
plt.title("Male vs Female Distribution")
# Equal axis scaling draws the pie as a circle.
plt.axis("equal")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
mouse_df = clean_df.groupby("Mouse ID")
greatest_timepoint = mouse_df["Timepoint"].max()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint.
# The join must use BOTH "Mouse ID" and "Timepoint": joining on "Mouse ID"
# alone keeps every timepoint of every mouse and yields Timepoint_x /
# Timepoint_y columns instead of selecting the final measurement.
# reset_index() turns the "Mouse ID" index into a column so it can be a key.
tumor_df = pd.merge(
    greatest_timepoint.reset_index(),
    clean_df,
    on=["Mouse ID", "Timepoint"],
    how="left",
)

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
Treatment = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
Tumor_volume = []

# Take each mouse's row at its greatest timepoint directly from clean_df so
# this cell does not depend on how an earlier merge was built.
final_rows = clean_df.loc[clean_df.groupby("Mouse ID")["Timepoint"].idxmax()]

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for treatment in Treatment:
    # Locate the rows which contain mice on each drug and get the tumor volumes
    final_volumes = final_rows.loc[
        final_rows["Drug Regimen"] == treatment, "Tumor Volume (mm3)"
    ]

    # add subset
    Tumor_volume.append(final_volumes)

    # Determine outliers using upper and lower bounds (1.5 * IQR rule)
    lower_q, upper_q = final_volumes.quantile([0.25, 0.75])
    iqr = upper_q - lower_q
    lower_bound = lower_q - 1.5 * iqr
    upper_bound = upper_q + 1.5 * iqr
    outliers = final_volumes[
        (final_volumes < lower_bound) | (final_volumes > upper_bound)
    ]
    print(
        f"{treatment}: IQR = {iqr:.2f}, "
        f"bounds = ({lower_bound:.2f}, {upper_bound:.2f}), "
        f"potential outliers = {outliers.tolist()}"
    )

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
