# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Relative paths of the two CSV files used by the study.
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load both files into DataFrames, in the same order as the paths above.
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)


In [None]:
# mouse_metadata.head()
# mouse_metadata.info()

In [None]:
# study_results.head()
# study_results.info()

In [None]:
# Left-join the study results onto the mouse metadata, matching on "Mouse ID",
# so every mouse listed in the metadata is kept in the combined table.
df = mouse_metadata.merge(study_results, how="left", on="Mouse ID")
# Preview the first rows of the combined table.
df.head()

In [None]:
# Count the distinct mouse IDs present in the combined data.
mouse_ids = df["Mouse ID"]
mouse_ids.nunique()

In [None]:
# Each row is expected to be identified by the (Mouse ID, Timepoint) pair.
# Display the table indexed by that pair; `df` itself is left unchanged
# because the result is not assigned back.
key_columns = ["Mouse ID", "Timepoint"]
df.set_index(key_columns)


In [None]:
# Find every row whose (Mouse ID, Timepoint) pair occurs more than once.
# keep=False flags all occurrences of a repeated pair, not just the later ones.
is_repeated = df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
dup_mice = df.loc[is_repeated]
print(dup_mice)


In [None]:
# Optional: Get all the data for the duplicate mouse ID. 


In [None]:
# Create a clean DataFrame by dropping every mouse that has duplicated
# (Mouse ID, Timepoint) rows. The offending IDs are derived from the data
# instead of being hard-coded (the original filter named mouse "g989"), so the
# cleaning step stays correct if the input files change.
duplicate_rows = df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_mouse_ids = df.loc[duplicate_rows, "Mouse ID"].unique()
# Remove ALL rows for those mice, not only the duplicated timepoints.
clean_df = df.loc[~df["Mouse ID"].isin(duplicate_mouse_ids)]

In [None]:
# Count the distinct mouse IDs that remain after cleaning.
updated_mice_count = clean_df.loc[:, "Mouse ID"].nunique()
updated_mice_count

In [None]:
# clean_df.head(25)
# clean_df.info

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation, and standard error of the mean.
# Each statistic is computed as its own Series and the Series are then
# assembled into a single summary DataFrame.

# Group once and reuse the grouped tumor-volume column for every statistic.
tumor_by_regimen = clean_df.groupby(["Drug Regimen"])["Tumor Volume (mm3)"]

tumor_mean = tumor_by_regimen.mean()
tumor_median = tumor_by_regimen.median()
tumor_var = tumor_by_regimen.var()
tumor_std = tumor_by_regimen.std()
tumor_sem = tumor_by_regimen.sem()

# Column label -> statistic; insertion order sets the column order.
data = {
    "Mean Tumor Volume": tumor_mean,
    "Median Tumor Volume": tumor_median,
    "Tumor Volume Variance": tumor_var,
    "Tumor Volume Std. Dev.": tumor_std,
    "Tumor Volume Std. Err.": tumor_sem,
}

# reset_index turns the "Drug Regimen" index back into a regular column.
summ1 = pd.DataFrame(data).reset_index()
summ1

In [None]:
# Alternative summary table built with a single aggregation call: the same
# five statistics (mean, median, variance, standard deviation, SEM) of tumor
# volume for each drug regimen. Only one of the two methods is required.
# Equivalent form applied to the selected column only:
# summ = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].agg(["mean", "median", "var", "std", "sem"])

# Column -> list of statistics to compute for that column.
tumor_agg = {"Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]}

regimen_groups = clean_df.groupby(["Drug Regimen"])
summ2 = regimen_groups.agg(tumor_agg).reset_index()
summ2

## Bar and Pie Charts

In [None]:
# Bar plot (via the pandas plotting API) of the total number of rows
# (Mouse ID / Timepoint observations) recorded for each drug regimen.
regimen_counts = clean_df["Drug Regimen"].value_counts()
regimen_counts.plot(
    kind="bar",
    xlabel="Drug Regimen",
    ylabel="# of Observed Mouse Timepoints",
)
# Call show(): the bare name `plt.show` only references the function and
# renders nothing.
plt.show()

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
# data = clean_df["Drug Regimen"].value_counts()
# x = list(data.index)
# y = list(data.values)
# # plot

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
# clean_df["Sex"].value_counts() # this is the data they want us to use


In [None]:
# Same approach as the bar charts: for the Pandas version, call .plot(kind="pie") on the value counts and pass a few formatting options.
# For the pyplot version: data = clean_df["Sex"].value_counts(), x = data.index, y = data.values, then pass those into plt.pie.

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot


## Quartiles, Outliers and Boxplots

In [None]:
import seaborn as sns

In [None]:
# Box plot of tumor volume, one box per drug regimen, across the cleaned data.
sns.boxplot(
    x="Drug Regimen",
    y="Tumor Volume (mm3)",
    data=clean_df,
)

In [None]:
# sns.violinplot(data=clean_df, x="Drug Regimen", y="Tumor Volume (mm3)", hue="Sex", split=True)
# plt.xticks(rotation=90)
# plt.show

In [None]:
# sns.violinplot(data=clean_df, y="Drug Regimen", x="Tumor Volume (mm3)", hue="Sex", split=True)
# plt.xticks(rotation=90)
# plt.show

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
# Start by getting the last (greatest) timepoint for each mouse
# Merge this group df with the original DataFrame to get the tumor volume at the last timepoint

################### USE SEABORN
# What we want here is a grouped box plot: one box plot per treatment (drug),
# i.e. four box plots placed side by side on the same axes.
# That is awkward to do directly in matplotlib, so the approach is to reshape
# the data set into the four groups of final tumor volumes and then plot those
# groups next to each other.
# This layout (multiple box plots side by side) is known as a grouped box plot.


In [None]:
# Put treatments into a list for for loop (and later for plot labels)
# Create empty list to fill with tumor vol data (for plotting)
# Calculate the IQR and quantitatively determine if there are any potential outliers.
# Locate the rows which contain mice on each drug and get the tumor volumes
# add subset 
# Determine outliers using upper and lower bounds


In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model 
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
