# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [1]:
%matplotlib notebook

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame
mouse_metadata_df = pd.DataFrame(mouse_metadata)
study_results_df = pd.DataFrame(study_results)
merge_data = pd.merge(mouse_metadata_df,study_results_df,how="left",on="Mouse ID")

# Display the data table for preview
merge_data



Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [2]:
# Checking the number of mice.
mouse = merge_data["Mouse ID"].value_counts()
mouse_count = len(mouse)
mouse_count

249

In [3]:
# Our data should be uniquely identified by Mouse ID and Timepoint
# Get the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mice = merge_data.loc[merge_data.duplicated(subset=['Mouse ID','Timepoint']),'Mouse ID'].unique()
duplicate_mice

array(['g989'], dtype=object)

In [4]:
# Optional: Get all the data for the duplicate mouse ID. 


In [4]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
new_df = merge_data[merge_data['Mouse ID'].isin(duplicate_mice)==False]
new_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [5]:
# Checking the number of mice in the clean DataFrame.
new_df_ID = new_df['Mouse ID'].value_counts()
new_df_count = len(new_df_ID)
new_df_count

248

## Summary Statistics

In [6]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary DataFrame.
new_df_mean = new_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].mean()
new_df_mean
new_df_median = new_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].median()
new_df_median
new_df_var = new_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].var()
new_df_var
new_df_sd = new_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].std()
new_df_sd
new_df_sem = new_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].sem()
new_df_sem


Drug Regimen
Capomulin    0.329346
Ceftamin     0.469821
Infubinol    0.492236
Ketapril     0.603860
Naftisol     0.596466
Placebo      0.581331
Propriva     0.544332
Ramicane     0.320955
Stelasyn     0.573111
Zoniferol    0.516398
Name: Tumor Volume (mm3), dtype: float64

In [7]:
df = pd.DataFrame({
    "Mean" : new_df_mean,
    "Median" : new_df_median,
    "Variance" :new_df_var,
    "Standard Deviation" : new_df_sd,
    "SEM" : new_df_sem
})
df

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [9]:
# A more advanced method to generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen (only one method is required in the solution)

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using Pandas.
#https://www.educative.io/answers/how-to-make-bar-graphs-using-pandas
total_rows = merge_data.groupby(['Drug Regimen'])['Mouse ID'].count()
total_rows
bar_graph = total_rows.plot.bar(x='Drug Regimen',y=total_rows, color='g', align='center', fontsize=8)
plt.xlabel('Drug Regimen')
plt.ylabel('Mice Count')
plt.title('Mouse Regimen')
# Make the x-axis labels tight so they do not get cut off
#https://stackoverflow.com/questions/60658321/how-to-stop-the-x-axis-labels-from-getting-cut-off-from-the-bottom-of-the-bar-gr
plt.tight_layout()

plt.savefig("./Figures/Regiment_Treatment.png")

total_rows

<IPython.core.display.Javascript object>

Drug Regimen
Capomulin    230
Ceftamin     178
Infubinol    178
Ketapril     188
Naftisol     186
Placebo      181
Propriva     161
Ramicane     228
Stelasyn     181
Zoniferol    182
Name: Mouse ID, dtype: int64

In [13]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
array = (merge_data.groupby(['Drug Regimen'])['Mouse ID'].count()).tolist() 

x_axis = np.arange(len(array))

plt.bar(x_axis,array,color='b',alpha=0.8, align='center')

tick_locations = [value for value in x_axis]
plt.xticks(tick_locations,array)
# Set the limits of the x axis
plt.xlim(-0.75, len(x_axis)-0.25)
# Set the limits of the y axis
plt.ylim(0, max(array)+10)

# Give the chart a title, x label, and y label
plt.title("Drug Regimen Treatment")
plt.xlabel("Drug Regimen")
plt.ylabel("Mouse Count")
plt.savefig("./Figures/Regiment_Treatment_pyplot.png")

print(array)
print(x_axis)


<IPython.core.display.Javascript object>

[230, 178, 178, 188, 186, 181, 161, 228, 181, 182]
[0 1 2 3 4 5 6 7 8 9]


In [14]:
# # Generate a pie plot showing the distribution of female versus male mice using Pandas
#  Group by gender and get the number to plot
gender = merge_data.groupby(["Mouse ID","Sex"])
gender_df = pd.DataFrame(gender.size())

mouse_gender = pd.DataFrame(gender_df.groupby(["Sex"]).count())
mouse_gender.columns = ["Count"]

colors = ["blue", 'red']
explode = (0.1, 0)
plot = mouse_gender.plot.pie(y='Count',figsize=(15,10), explode = explode, colors = colors, autopct="%1.1f%%",shadow=True,startangle=140)

plt.title("Distribution of Gender")
plt.axis("equal")

print(mouse_gender)

plt.savefig("./Figures/Pie.png")

<IPython.core.display.Javascript object>

        Count
Sex          
Female    124
Male      125


In [15]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

genders = ['Male','Female']
sizes = [124,125]
color = ["blue","red"]
explode = (0.1,0)

plt.pie(sizes,explode=explode,labels=genders,colors=color,autopct="%1.1f%%",shadow=True,startangle=140)
plt.axis("equal")
# Give the chart a title, x label, and y label
plt.title("Distribution of Gender")

plt.savefig("./Figures/Pie_py.png")

<IPython.core.display.Javascript object>

In [16]:
sizes

[124, 125]

## Quartiles, Outliers and Boxplots

In [17]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
Capomulin = merge_data.loc[merge_data['Drug Regimen'] == "Capomulin"]
Ramicane = merge_data.loc[merge_data['Drug Regimen'] == "Ramicane"]
Infubinol = merge_data.loc[merge_data['Drug Regimen'] == "Infubinol"]
Ceftamin = merge_data.loc[merge_data['Drug Regimen'] == "Ceftamin"]


# Start by getting the last (greatest) timepoint for each mouse
Capomulin_last = Capomulin.groupby('Mouse ID')['Timepoint'].max()
Ramicane_last = Ramicane.groupby('Mouse ID')['Timepoint'].max()
Infubinol_last = Infubinol.groupby('Mouse ID')['Timepoint'].max()
Ceftamin_last = Ceftamin.groupby('Mouse ID')['Timepoint'].max()

# Merge this group df with the original DataFrame to get the tumor volume at the last timepoint
Capomulin_df = pd.DataFrame(Capomulin_last)
Ramicane_df = pd.DataFrame(Ramicane_last)
Infubinol_df = pd.DataFrame(Infubinol_last)
Ceftamin_df = pd.DataFrame(Ceftamin_last)

Capomulin_merge = pd.merge(Capomulin_df,merge_data, on=('Mouse ID','Timepoint'),how="left")
Ramicane_merge = pd.merge(Ramicane_df,merge_data, on=('Mouse ID','Timepoint'),how="left")
Infubinol_merge = pd.merge(Infubinol_df,merge_data, on=('Mouse ID','Timepoint'),how="left")
Ceftamin_merge = pd.merge(Ceftamin_df,merge_data, on=('Mouse ID','Timepoint'),how="left")

Capomulin_merge

Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
0,b128,45,Capomulin,Female,9,22,38.982878,2
1,b742,45,Capomulin,Male,7,21,38.939633,0
2,f966,20,Capomulin,Male,16,17,30.485985,0
3,g288,45,Capomulin,Male,3,19,37.074024,1
4,g316,45,Capomulin,Female,22,22,40.15922,2
5,i557,45,Capomulin,Female,1,24,47.685963,1
6,i738,45,Capomulin,Female,23,20,37.311846,2
7,j119,45,Capomulin,Female,7,23,38.125164,1
8,j246,35,Capomulin,Female,21,21,38.753265,1
9,l509,45,Capomulin,Male,17,21,41.483008,3


In [18]:
   
    # Locate the rows which contain mice on each drug and get the tumor volumes
Capomulin_t = Capomulin_merge['Tumor Volume (mm3)']
Ramicane_t = Ramicane_merge['Tumor Volume (mm3)']
Infubinol_t = Infubinol_merge['Tumor Volume (mm3)']
Ceftamin_t = Ceftamin_merge['Tumor Volume (mm3)']

    
    # Determine outliers using upper and lower bounds
#Capomulin Q
Capomulin_quartiles = Capomulin_t.quantile([.25,.5,.75])
capo_lowerq = Capomulin_quartiles[.25]
capo_upperq = Capomulin_quartiles[.75]
Capomulin_iqr = capo_upperq-capo_lowerq
#Ramicane Q
Ramicane_quartiles = Ramicane_t.quantile([.25,.5,.75])
rami_lowerq = Ramicane_quartiles[.25]
rami_upperq = Ramicane_quartiles[.75]
Ramicane_iqr = rami_upperq-rami_lowerq
#Infubinol
Infubinol_quartiles = Infubinol_t.quantile([.25,.5,.75])
infu_lowerq = Infubinol_quartiles[.25]
infu_upperq = Infubinol_quartiles[.75]
Infubinol_iqr = infu_upperq-infu_lowerq
#Ceftamin
Ceftamin_quartiles = Ceftamin_t.quantile([.25,.5,.75])
ceft_lowerq = Ceftamin_quartiles[.25]
ceft_upperq = Ceftamin_quartiles[.75]
Ceftamin_iqr = ceft_upperq-ceft_lowerq

#Capomulin Print
print(f"The lower quartile of Capomulin drug regimen is: {capo_lowerq}")
print(f"The upper quartile of Capomulin drug regimen is: {capo_upperq}")
print(f"The interquartile range of Capomulin drug regimen is: {Capomulin_iqr}")
print(f"The median of Capomulin drug regimen is: {Capomulin_quartiles[.5]}")

capo_lower_bound = capo_lowerq - (1.5*Capomulin_iqr)
capo_upper_bound = capo_upperq + (1.5*Capomulin_iqr)
print(f"Values below {capo_lower_bound} could be outliers.")
print(f"Values above {capo_lower_bound} could be outliers.")
print("\n")
print("---------------------------------------------")
print("\n")

#Ramicane Print
print(f"The lower quartile of Ramicane drug regimen is: {rami_lowerq}")
print(f"The upper quartile of Ramicane drug regimen is: {rami_upperq}")
print(f"The interquartile range of Ramicane drug regimen is: {Ramicane_iqr}")
print(f"The median of Ramicane drug regimen is: {Ramicane_quartiles[.5]}")

rami_lower_bound = rami_lowerq - (1.5*Ramicane_iqr)
rami_upper_bound = rami_upperq + (1.5*Ramicane_iqr)
print(f"Values below {rami_lower_bound} could be outliers.")
print(f"Values above {rami_lower_bound} could be outliers.")
print("\n")
print("---------------------------------------------")
print("\n")

#Infubinol Print
print(f"The lower quartile of Infubinol drug regimen is: {infu_lowerq}")
print(f"The upper quartile of Infubinol drug regimen is: {infu_upperq}")
print(f"The interquartile range of Infubinol drug regimen is: {Infubinol_iqr}")
print(f"The median of Infubinol drug regimen is: {Infubinol_quartiles[.5]}")

infu_lower_bound = infu_lowerq - (1.5*Infubinol_iqr)
infu_upper_bound = infu_upperq + (1.5*Infubinol_iqr)
print(f"Values below {infu_lower_bound} could be outliers.")
print(f"Values above {infu_lower_bound} could be outliers.")
print("\n")
print("---------------------------------------------")
print("\n")

#Ceftamin Print
print(f"The lower quartile of Capomulin drug regimen is: {ceft_lowerq}")
print(f"The upper quartile of Capomulin drug regimen is: {ceft_upperq}")
print(f"The interquartile range of Capomulin drug regimen is: {Ceftamin_iqr}")
print(f"The median of Capomulin drug regimen is: {Ceftamin_quartiles[.5]}")

ceft_lower_bound = ceft_lowerq - (1.5*Ceftamin_iqr)
ceft_upper_bound = ceft_upperq + (1.5*Ceftamin_iqr)
print(f"Values below {ceft_lower_bound} could be outliers.")
print(f"Values above {ceft_lower_bound} could be outliers.")





The lower quartile of Capomulin drug regimen is: 32.37735684
The upper quartile of Capomulin drug regimen is: 40.1592203
The interquartile range of Capomulin drug regimen is: 7.781863460000004
The median of Capomulin drug regimen is: 38.1251644
Values below 20.70456164999999 could be outliers.
Values above 20.70456164999999 could be outliers.


---------------------------------------------


The lower quartile of Ramicane drug regimen is: 31.56046955
The upper quartile of Ramicane drug regimen is: 40.65900627
The interquartile range of Ramicane drug regimen is: 9.098536719999998
The median of Ramicane drug regimen is: 36.56165229
Values below 17.912664470000003 could be outliers.
Values above 17.912664470000003 could be outliers.


---------------------------------------------


The lower quartile of Infubinol drug regimen is: 54.04860769
The upper quartile of Infubinol drug regimen is: 65.52574285
The interquartile range of Infubinol drug regimen is: 11.477135160000003
The median of I

In [19]:
# Generate a box plot that shows the distrubution of the tumor volume for each treatment group.
fig1,ax1 = plt.subplots()
ax1.set_title('Tumors')

ax1.boxplot([Capomulin_t,Ramicane_t,Infubinol_t,Ceftamin_t])
plt.show()
plt.savefig("./Figures/Boxplot.png")

<IPython.core.display.Javascript object>

## Line and Scatter Plots

In [18]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin


In [36]:
Capomulin_df = new_df.loc[new_df['Drug Regimen'] =='Capomulin',:]

s_185 = Capomulin_df.loc[Capomulin_df['Mouse ID'] == 's185',:]

timepoint = s_185['Timepoint']
tumor_volume = s_185['Tumor Volume (mm3)']


fig1, ax1 = plt.subplots()
plt.title('Capomulin treatmeant of mouse s185')
plt.plot(timepoint,tumor_volume,linewidth=2,color="royalblue", label="Fahreneit")
plt.xlabel('Timepoint (days)')
plt.ylabel('Tumor Volume (mm3)')
plt.savefig('./Figures/Line Plot.png')

Capomulin_df.head()


<IPython.core.display.Javascript object>

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
10,s185,Capomulin,Female,3,17,0,45.0,0
11,s185,Capomulin,Female,3,17,5,43.878496,0
12,s185,Capomulin,Female,3,17,10,37.614948,0
13,s185,Capomulin,Female,3,17,15,38.177232,0
14,s185,Capomulin,Female,3,17,20,36.866876,0


In [65]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen
# Generate the average of Mouse ID using groupby
avg_tumor = Capomulin_df.groupby(['Mouse ID','Weight (g)'])['Tumor Volume (mm3)'].mean().reset_index()
# Rename the column
avg_tumor.rename(columns={'Tumor Volume (mm3)': 'Avg Tumor Volume (mm3)'}, inplace=True)
avg_tumor
fig1, ax1 = plt.subplots()
plt.scatter(avg_tumor['Weight (g)'],avg_tumor['Avg Tumor Volume (mm3)'], color="darkblue")
plt.xlabel('Weight (g)')
plt.ylabel('Averag Tumor Volume (mm3)')
plt.title('Mouse Weight Vs Average Observed Tumor')
plt.savefig("./Figures/Scatter Plot.png")

<IPython.core.display.Javascript object>

## Correlation and Regression

In [71]:
# Calculate the correlation coefficient and a linear regression model 
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
correl = st.pearsonr(avg_tumor['Weight (g)'],avg_tumor['Avg Tumor Volume (mm3)'])

(slope, intercept,rvalue, pvalue, stderr)=st.linregress(avg_tumor["Weight (g)"],avg_tumor["Avg Tumor Volume (mm3)"])
regress_values=avg_tumor["Weight (g)"]* slope + intercept
line_eq= f"y = {round(slope, 2)} x + {round(intercept, 2)}"

plt.scatter(avg_tumor["Weight (g)"],avg_tumor["Avg Tumor Volume (mm3)"],color='royalblue')
plt.plot(avg_tumor["Weight (g)"], regress_values, color='red')
plt.annotate(line_eq,(20,36), fontsize=14)
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Weight vs Tumor Volume for Capomulin")
plt.show()
plt.savefig("./Figures/Correlation Coefficient.png")

<IPython.core.display.Javascript object>