## Observations and Insights 

In [1]:
%matplotlib widget

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [3]:
mouse_metadata.shape

(249, 5)

In [4]:
study_results = pd.read_csv(study_results_path)
study_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [5]:
study_results.shape

(1893, 4)

In [6]:
# Combine the data into a single dataset
combined_df = pd.merge(mouse_metadata,study_results,left_on='Mouse ID',right_on='Mouse ID',how='left')

# Display the data table for preview
combined_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [7]:
# Checking the number of mice.
pd.value_counts(combined_df['Mouse ID'])

g989    13
e662    10
b879    10
o795    10
g288    10
        ..
v199     1
h428     1
x226     1
u153     1
f932     1
Name: Mouse ID, Length: 249, dtype: int64

In [8]:
combined_df['Mouse ID'].unique()

array(['k403', 's185', 'x401', 'm601', 'g791', 's508', 'f966', 'm546',
       'z578', 'j913', 'u364', 'n364', 'y793', 'r554', 'm957', 'c758',
       't565', 'a644', 'i177', 'j989', 'i738', 'a520', 'w914', 'r811',
       'g288', 'i334', 'q610', 'd251', 'l897', 'c458', 'b742', 'b128',
       'j246', 'a411', 'j119', 'w150', 'v923', 'g316', 's710', 'l509',
       'r944', 'e662', 'u196', 'q597', 'a444', 'i557', 'r921', 'w678',
       'y449', 'a203', 'a251', 'a262', 'a275', 'a366', 'a401', 'a457',
       'a492', 'a577', 'a685', 'a699', 'a788', 'a818', 'a897', 'a963',
       'b313', 'b447', 'b487', 'b559', 'b759', 'b879', 'c139', 'c264',
       'c282', 'c302', 'c326', 'c402', 'c559', 'c580', 'c757', 'c766',
       'c819', 'c832', 'c895', 'c927', 'd133', 'd164', 'd474', 'e213',
       'e227', 'e291', 'e476', 'e584', 'f129', 'f234', 'f278', 'f345',
       'f394', 'f436', 'f545', 'f932', 'f993', 'g107', 'g296', 'g497',
       'g558', 'g570', 'g867', 'g989', 'h246', 'h333', 'h428', 'h531',
      

In [9]:
#Count for the entire dataframe
combined_df.apply(pd.value_counts).fillna(0)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,0.0,0.0,0.0,0.0,0.0,250.0,0.0,793.0
1,0.0,0.0,0.0,60.0,0.0,0.0,0.0,590.0
2,0.0,0.0,0.0,87.0,0.0,0.0,0.0,274.0
3,0.0,0.0,0.0,107.0,0.0,0.0,0.0,148.0
4,0.0,0.0,0.0,74.0,0.0,0.0,0.0,88.0
...,...,...,...,...,...,...,...,...
z435,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
z578,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
z581,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
z795,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_df = combined_df[combined_df.duplicated(['Mouse ID','Timepoint'])]
duplicate_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [11]:
# Optional: Get all the data for the duplicate mouse ID. 
all_duplicated = combined_df[combined_df.duplicated(['Mouse ID'])]
all_duplicated

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [12]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = combined_df.drop_duplicates(['Mouse ID','Timepoint']) 
clean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [13]:
# Checking the number of mice in the clean DataFrame.
pd.value_counts(clean_df['Mouse ID'])


x773    10
p136    10
o795    10
g288    10
i334    10
        ..
v199     1
x226     1
h428     1
u153     1
f932     1
Name: Mouse ID, Length: 249, dtype: int64

In [14]:
clean_df.shape

(1888, 8)

## Summary Statistics


In [15]:
#Change names of Headers
clean_df = clean_df.rename(columns = {"Drug Regimen":"Drug_Regimen"})
clean_df = clean_df.rename(columns = {"Tumor Volume (mm3)":"Tumor_Volume"})
clean_df = clean_df.rename(columns = {"Mouse ID":"Mouse_ID"})
clean_df = clean_df.rename(columns = {"Metastatic Sites":"Metastatic_Sites"})
print (clean_df)

     Mouse_ID Drug_Regimen   Sex  Age_months  Weight (g)  Timepoint  \
0        k403     Ramicane  Male          21          16          0   
1        k403     Ramicane  Male          21          16          5   
2        k403     Ramicane  Male          21          16         10   
3        k403     Ramicane  Male          21          16         15   
4        k403     Ramicane  Male          21          16         20   
...       ...          ...   ...         ...         ...        ...   
1888     z969     Naftisol  Male           9          30         25   
1889     z969     Naftisol  Male           9          30         30   
1890     z969     Naftisol  Male           9          30         35   
1891     z969     Naftisol  Male           9          30         40   
1892     z969     Naftisol  Male           9          30         45   

      Tumor_Volume  Metastatic_Sites  
0        45.000000                 0  
1        38.825898                 0  
2        35.014271            

In [16]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straighforward, creating multiple series and putting them all together at the end.
pd.value_counts(clean_df['Drug_Regimen'])

Capomulin    230
Ramicane     228
Ketapril     188
Naftisol     186
Zoniferol    182
Placebo      181
Stelasyn     181
Infubinol    178
Ceftamin     178
Propriva     156
Name: Drug_Regimen, dtype: int64

In [17]:
type(clean_df['Drug_Regimen'])

pandas.core.series.Series

In [18]:
#Series mean
clean_df[clean_df.Drug_Regimen=='Zoniferol'].Tumor_Volume.mean() 

53.236506551593415

In [19]:
clean_df[clean_df.Drug_Regimen=='Capomulin'].Tumor_Volume.mean() 

40.67574114100001

In [20]:
clean_df[clean_df.Drug_Regimen=='Ceftamin'].Tumor_Volume.mean()    

52.59117180960677

In [21]:
clean_df[clean_df.Drug_Regimen=='Placebo'].Tumor_Volume.mean() 

54.03358078635358

In [22]:
clean_df[clean_df.Drug_Regimen=='Infubinol'].Tumor_Volume.mean() 

52.88479510859551

In [23]:
clean_df[clean_df.Drug_Regimen=='Ketapril'].Tumor_Volume.mean() 

55.23563764047869

In [24]:
clean_df[clean_df.Drug_Regimen=='Ramicane'].Tumor_Volume.mean() 

40.2167450667105

In [25]:
clean_df[clean_df.Drug_Regimen=='Naftisol'].Tumor_Volume.mean() 

54.331564658333306

In [26]:
clean_df[clean_df.Drug_Regimen=='Propriva'].Tumor_Volume.mean() 

52.39346338487179

In [27]:
clean_df[clean_df.Drug_Regimen=='Stelasyn'].Tumor_Volume.mean() 

54.23314911988949

In [28]:
#Series median
clean_df[clean_df.Drug_Regimen=='Zoniferol'].Tumor_Volume.median()

51.818479325

In [29]:
clean_df[clean_df.Drug_Regimen=='Capomulin'].Tumor_Volume.median()

41.557808879999996

In [30]:
clean_df[clean_df.Drug_Regimen=='Ceftamin'].Tumor_Volume.median()    

51.77615728000001

In [31]:
clean_df[clean_df.Drug_Regimen=='Placebo'].Tumor_Volume.median() 

52.28893409

In [32]:
clean_df[clean_df.Drug_Regimen=='Infubinol'].Tumor_Volume.median()

51.82058438

In [33]:
clean_df[clean_df.Drug_Regimen=='Ketapril'].Tumor_Volume.median() 

53.698742644999996

In [34]:
clean_df[clean_df.Drug_Regimen=='Ramicane'].Tumor_Volume.median()

40.67323554

In [35]:
clean_df[clean_df.Drug_Regimen=='Naftisol'].Tumor_Volume.median() 

52.509284609999995

In [36]:
clean_df[clean_df.Drug_Regimen=='Propriva'].Tumor_Volume.median() 

50.909964985

In [37]:
clean_df[clean_df.Drug_Regimen=='Stelasyn'].Tumor_Volume.median() 

52.43173664

In [38]:
#Series var
clean_df[clean_df.Drug_Regimen=='Zoniferol'].Tumor_Volume.var()

48.53335538938606

In [39]:
clean_df[clean_df.Drug_Regimen=='Capomulin'].Tumor_Volume.var()

24.947764120254856

In [40]:
clean_df[clean_df.Drug_Regimen=='Ceftamin'].Tumor_Volume.var()    

39.2901772732786

In [41]:
clean_df[clean_df.Drug_Regimen=='Placebo'].Tumor_Volume.var() 

61.16808293669701

In [42]:
clean_df[clean_df.Drug_Regimen=='Infubinol'].Tumor_Volume.var()

43.12868412883606

In [43]:
clean_df[clean_df.Drug_Regimen=='Ketapril'].Tumor_Volume.var() 

68.55357711244596

In [44]:
clean_df[clean_df.Drug_Regimen=='Ramicane'].Tumor_Volume.var()

23.486703952095255

In [45]:
clean_df[clean_df.Drug_Regimen=='Naftisol'].Tumor_Volume.var() 

66.17347898736509

In [46]:
clean_df[clean_df.Drug_Regimen=='Propriva'].Tumor_Volume.var() 

43.138803497801035

In [47]:
clean_df[clean_df.Drug_Regimen=='Stelasyn'].Tumor_Volume.var() 

59.45056167336598

In [48]:
#Series std
clean_df[clean_df.Drug_Regimen=='Zoniferol'].Tumor_Volume.std()

6.966588504381901

In [49]:
clean_df[clean_df.Drug_Regimen=='Capomulin'].Tumor_Volume.std()

4.9947736805840215

In [50]:
clean_df[clean_df.Drug_Regimen=='Ceftamin'].Tumor_Volume.std()    

6.268187718414199

In [51]:
clean_df[clean_df.Drug_Regimen=='Placebo'].Tumor_Volume.std() 

7.821002681031187

In [52]:
clean_df[clean_df.Drug_Regimen=='Infubinol'].Tumor_Volume.std()

6.5672432670669405

In [53]:
clean_df[clean_df.Drug_Regimen=='Ketapril'].Tumor_Volume.std() 

8.279708757706757

In [54]:
clean_df[clean_df.Drug_Regimen=='Ramicane'].Tumor_Volume.std()

4.846308280753016

In [55]:
clean_df[clean_df.Drug_Regimen=='Naftisol'].Tumor_Volume.std() 

8.13470829147334

In [56]:
clean_df[clean_df.Drug_Regimen=='Propriva'].Tumor_Volume.std() 

6.56801366455651

In [57]:
clean_df[clean_df.Drug_Regimen=='Stelasyn'].Tumor_Volume.std() 

7.7104190335782645

In [58]:
#Series sem
clean_df[clean_df.Drug_Regimen=='Zoniferol'].Tumor_Volume.sem()

0.5163978968332167

In [59]:
clean_df[clean_df.Drug_Regimen=='Capomulin'].Tumor_Volume.sem()

0.32934562340083096

In [60]:
clean_df[clean_df.Drug_Regimen=='Ceftamin'].Tumor_Volume.sem()    

0.469820532752611

In [61]:
clean_df[clean_df.Drug_Regimen=='Placebo'].Tumor_Volume.sem() 

0.5813305510593875

In [62]:
clean_df[clean_df.Drug_Regimen=='Infubinol'].Tumor_Volume.sem()

0.4922356938011383

In [63]:
clean_df[clean_df.Drug_Regimen=='Ketapril'].Tumor_Volume.sem() 

0.6038598237739696

In [64]:
clean_df[clean_df.Drug_Regimen=='Ramicane'].Tumor_Volume.sem()

0.3209546065084816

In [65]:
clean_df[clean_df.Drug_Regimen=='Naftisol'].Tumor_Volume.sem() 

0.5964657512424236

In [66]:
clean_df[clean_df.Drug_Regimen=='Propriva'].Tumor_Volume.sem() 

0.5258619511360094

In [67]:
clean_df[clean_df.Drug_Regimen=='Stelasyn'].Tumor_Volume.sem() 

0.5731109332771458

In [68]:
#Generate Summary Table with Calculated Values
Summary_Table1 = clean_df.pivot_table(values ='Tumor_Volume', index = 'Drug_Regimen', aggfunc = ['mean', 'median', 'var', 'std', 'sem'])
Summary_Table1

Unnamed: 0_level_0,mean,median,var,std,sem
Unnamed: 0_level_1,Tumor_Volume,Tumor_Volume,Tumor_Volume,Tumor_Volume,Tumor_Volume
Drug_Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [69]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method produces everything in a single groupby function
clean_df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892],
           dtype='int64', length=1888)

In [70]:
mean = clean_df.groupby('Drug_Regimen').Tumor_Volume.mean()
mean

Drug_Regimen
Capomulin    40.675741
Ceftamin     52.591172
Infubinol    52.884795
Ketapril     55.235638
Naftisol     54.331565
Placebo      54.033581
Propriva     52.393463
Ramicane     40.216745
Stelasyn     54.233149
Zoniferol    53.236507
Name: Tumor_Volume, dtype: float64

In [71]:
median=clean_df.groupby('Drug_Regimen').Tumor_Volume.median()
median

Drug_Regimen
Capomulin    41.557809
Ceftamin     51.776157
Infubinol    51.820584
Ketapril     53.698743
Naftisol     52.509285
Placebo      52.288934
Propriva     50.909965
Ramicane     40.673236
Stelasyn     52.431737
Zoniferol    51.818479
Name: Tumor_Volume, dtype: float64

In [72]:
var = clean_df.groupby('Drug_Regimen').Tumor_Volume.var()
var

Drug_Regimen
Capomulin    24.947764
Ceftamin     39.290177
Infubinol    43.128684
Ketapril     68.553577
Naftisol     66.173479
Placebo      61.168083
Propriva     43.138803
Ramicane     23.486704
Stelasyn     59.450562
Zoniferol    48.533355
Name: Tumor_Volume, dtype: float64

In [73]:
std = clean_df.groupby('Drug_Regimen').Tumor_Volume.std()
std

Drug_Regimen
Capomulin    4.994774
Ceftamin     6.268188
Infubinol    6.567243
Ketapril     8.279709
Naftisol     8.134708
Placebo      7.821003
Propriva     6.568014
Ramicane     4.846308
Stelasyn     7.710419
Zoniferol    6.966589
Name: Tumor_Volume, dtype: float64

In [74]:
sem = clean_df.groupby('Drug_Regimen').Tumor_Volume.sem()
sem

Drug_Regimen
Capomulin    0.329346
Ceftamin     0.469821
Infubinol    0.492236
Ketapril     0.603860
Naftisol     0.596466
Placebo      0.581331
Propriva     0.525862
Ramicane     0.320955
Stelasyn     0.573111
Zoniferol    0.516398
Name: Tumor_Volume, dtype: float64

In [75]:
#Generate Summary Table with Calculated Values
Summary_Table2 = clean_df.pivot_table(values ='Tumor_Volume', index = 'Drug_Regimen', aggfunc = ['mean', 'median', 'var', 'std', 'sem'])
Summary_Table2

Unnamed: 0_level_0,mean,median,var,std,sem
Unnamed: 0_level_1,Tumor_Volume,Tumor_Volume,Tumor_Volume,Tumor_Volume,Tumor_Volume
Drug_Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [76]:
#Count the total number of mice during the treatment
pd.value_counts(clean_df['Drug_Regimen'])

Capomulin    230
Ramicane     228
Ketapril     188
Naftisol     186
Zoniferol    182
Placebo      181
Stelasyn     181
Infubinol    178
Ceftamin     178
Propriva     156
Name: Drug_Regimen, dtype: int64

In [77]:
#Create a list
Treatment =["Capomulin", "Ramicane", "Ketapril","Naftisol","Zoniferol", "Placebo", "Stelasyn", "Infubinol","Ceftamin","Propriva"]
Number_mice =[230,228,188,186,182,181,181,178,178,156]

In [78]:
#Create a dictionary
data_dict ={'Treatment':pd.Series(Treatment),'Number_mice':pd.Series(Number_mice)}

In [79]:
#Create a pandas DataFrame from Dictionary
dframe=pd.DataFrame(data_dict)
dframe

Unnamed: 0,Treatment,Number_mice
0,Capomulin,230
1,Ramicane,228
2,Ketapril,188
3,Naftisol,186
4,Zoniferol,182
5,Placebo,181
6,Stelasyn,181
7,Infubinol,178
8,Ceftamin,178
9,Propriva,156


In [80]:
dframe.index

RangeIndex(start=0, stop=10, step=1)

In [81]:
# Set the index to be "Treatment" so they will be used as labels
dframe = dframe.set_index("Treatment")

dframe.head()

Unnamed: 0_level_0,Number_mice
Treatment,Unnamed: 1_level_1
Capomulin,230
Ramicane,228
Ketapril,188
Naftisol,186
Zoniferol,182


In [82]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas.
# Use DataFrame.plot() in order to create a bar chart of the data
dframe.plot(kind="bar", figsize=(10,8))

# Set a title for the chart
plt.title("Number of Mice vs Treatment")

plt.show()
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [83]:
plt.savefig("../Images/Bar1.png")

In [84]:
# Matplotlib
# Set x axis and tick locations
Treatment =["Capomulin", "Ramicane", "Ketapril","Naftisol","Zoniferol", "Placebo", "Stelasyn", "Infubinol","Ceftamin","Propriva"]
Number_mice =[230,228,188,186,182,181,181,178,178,156]
x_axis = np.arange(len(Number_mice))

In [85]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
plt.figure(figsize=(12,10))
plt.bar(x_axis, Number_mice, color="b", align="center")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<BarContainer object of 10 artists>

In [86]:
# Create the ticks for our bar chart's x axis
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, Treatment, rotation="vertical")

([<matplotlib.axis.XTick at 0x23dbfdae828>,
  <matplotlib.axis.XTick at 0x23dbfdae160>,
  <matplotlib.axis.XTick at 0x23dbfda2e48>,
  <matplotlib.axis.XTick at 0x23dbfdee4e0>,
  <matplotlib.axis.XTick at 0x23dbfdee9e8>,
  <matplotlib.axis.XTick at 0x23dbfdeee48>,
  <matplotlib.axis.XTick at 0x23dbfdf7358>,
  <matplotlib.axis.XTick at 0x23dbfdf7828>,
  <matplotlib.axis.XTick at 0x23dbfdf7cf8>,
  <matplotlib.axis.XTick at 0x23dbfe00208>],
 <a list of 10 Text xticklabel objects>)

In [87]:
#Set the limits of the x axis
plt.xlim(-0.75, len(x_axis))

(-0.75, 10)

In [88]:
#Set the limits for y axis
plt.ylim(0, max(Number_mice)+10)

(0, 240)

In [89]:
#Give the chart a tittle, x label, and y label
plt.title("Number of Mice vs Treatment")
plt.xlabel("Treatment")
plt.ylabel("Number of Mice")

Text(0, 0.5, 'Number of Mice')

In [90]:
# Save an image of the chart and print it to the screen
plt.savefig("../Images/Bar2.png")
plt.show()

In [91]:
# Pie Charts

In [92]:
#Count the total number of female and male mice during the treatment
pd.value_counts(clean_df['Sex'])

Male      958
Female    930
Name: Sex, dtype: int64

In [93]:
# Create a list
Mice_sex = ["Female", "Male"]
Mice_number = [930,958]

In [94]:
#Create a dictionary
data_dictionary ={'Mice_sex':pd.Series(Mice_sex),'Mice_number':pd.Series(Mice_number)}

In [95]:
#Create a pandas DataFrame from Dictionary
dataframe=pd.DataFrame(data_dictionary)
dataframe

Unnamed: 0,Mice_sex,Mice_number
0,Female,930
1,Male,958


In [96]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
dataframe = pd.DataFrame({'Mice_number': [930, 958]},
                  index=['Female', 'Male'])
plot = dataframe.plot.pie(y='Mice_number', figsize=(5, 5), autopct="%1.1f%%")


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [97]:
# Set a title for the chart
plt.title("Pie Chart")

plt.show()
plt.tight_layout()

In [98]:
plt.savefig("../Images/Pie1.png")

In [99]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
sex = ["Female", "Male"]
Mice_number = [930,958]
colors = ["blue","orange"]
explode = (0.1,0)

In [100]:
# Tell matplotlib to create a pie chart based upon the above data
plt.figure(figsize=(5,5))
plt.pie(Mice_number, labels=sex, colors=colors, autopct="%1.1f%%")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

([<matplotlib.patches.Wedge at 0x23dc080d828>,
  <matplotlib.patches.Wedge at 0x23dc080def0>],
 [Text(0.02562299800632446, 1.099701533132135, 'Female'),
  Text(-0.025623100967812958, -1.099701530733132, 'Male')],
 [Text(0.01397618073072243, 0.5998371998902553, '49.3%'),
  Text(-0.01397623689153434, -0.5998371985817083, '50.7%')])

In [101]:
# Create axes which are equal so we have a perfect circle
plt.axis("equal")

(-1.1053783437113598,
 1.100256111605303,
 -1.103265408136953,
 1.1062425414441643)

In [102]:
#Give the chart a tittle
plt.title("Pie Chart")
plt.legend()

<matplotlib.legend.Legend at 0x23dc07c5908>

In [103]:
# Save an image of our chart and print the final product to the screen
plt.savefig("../Images/Pie2.png")
plt.show()

## Quartiles, Outliers and Boxplots

In [104]:
clean_df

Unnamed: 0,Mouse_ID,Drug_Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor_Volume,Metastatic_Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [105]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
new_data=clean_df.drop_duplicates(subset='Mouse_ID', keep="last")
new_data

Unnamed: 0,Mouse_ID,Drug_Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor_Volume,Metastatic_Sites
9,k403,Ramicane,Male,21,16,45,22.050126,1
19,s185,Capomulin,Female,3,17,45,23.343598,1
29,x401,Capomulin,Female,16,15,45,28.484033,0
39,m601,Capomulin,Male,22,17,45,28.430964,1
49,g791,Ramicane,Male,11,16,45,29.128472,1
...,...,...,...,...,...,...,...,...
1859,z314,Stelasyn,Female,21,28,5,45.934712,0
1862,z435,Propriva,Female,12,26,10,48.710661,0
1872,z581,Infubinol,Female,24,25,45,62.754451,3
1882,z795,Naftisol,Female,13,29,45,65.741070,3


In [106]:
# Change header 
new_data = new_data.rename(columns = {"Mouse_ID":"Mouse ID"})
new_data

Unnamed: 0,Mouse ID,Drug_Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor_Volume,Metastatic_Sites
9,k403,Ramicane,Male,21,16,45,22.050126,1
19,s185,Capomulin,Female,3,17,45,23.343598,1
29,x401,Capomulin,Female,16,15,45,28.484033,0
39,m601,Capomulin,Male,22,17,45,28.430964,1
49,g791,Ramicane,Male,11,16,45,29.128472,1
...,...,...,...,...,...,...,...,...
1859,z314,Stelasyn,Female,21,28,5,45.934712,0
1862,z435,Propriva,Female,12,26,10,48.710661,0
1872,z581,Infubinol,Female,24,25,45,62.754451,3
1882,z795,Naftisol,Female,13,29,45,65.741070,3


In [107]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
calculation = new_data.groupby(['Drug_Regimen', 'Mouse ID']).Tumor_Volume.max()
calculation

Drug_Regimen  Mouse ID
Capomulin     b128        38.982878
              b742        38.939633
              f966        30.485985
              g288        37.074024
              g316        40.159220
                            ...    
Zoniferol     q633        70.827796
              s337        62.109651
              w140        47.717952
              w575        68.401286
              x613        48.077646
Name: Tumor_Volume, Length: 249, dtype: float64

In [108]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
calculation.index

MultiIndex([('Capomulin', 'b128'),
            ('Capomulin', 'b742'),
            ('Capomulin', 'f966'),
            ('Capomulin', 'g288'),
            ('Capomulin', 'g316'),
            ('Capomulin', 'i557'),
            ('Capomulin', 'i738'),
            ('Capomulin', 'j119'),
            ('Capomulin', 'j246'),
            ('Capomulin', 'l509'),
            ...
            ('Zoniferol', 'm331'),
            ('Zoniferol', 'n967'),
            ('Zoniferol', 'o926'),
            ('Zoniferol', 'p136'),
            ('Zoniferol', 'q511'),
            ('Zoniferol', 'q633'),
            ('Zoniferol', 's337'),
            ('Zoniferol', 'w140'),
            ('Zoniferol', 'w575'),
            ('Zoniferol', 'x613')],
           names=['Drug_Regimen', 'Mouse ID'], length=249)

In [109]:
capomulin_df=calculation.loc['Capomulin']
capomulin_df = pd.DataFrame(capomulin_df)
capomulin_df

Unnamed: 0_level_0,Tumor_Volume
Mouse ID,Unnamed: 1_level_1
b128,38.982878
b742,38.939633
f966,30.485985
g288,37.074024
g316,40.15922
i557,47.685963
i738,37.311846
j119,38.125164
j246,38.753265
l509,41.483008


In [110]:
ramicane_df=calculation.loc['Ramicane']
ramicane_df = pd.DataFrame(ramicane_df)
ramicane_df

Unnamed: 0_level_0,Tumor_Volume
Mouse ID,Unnamed: 1_level_1
a411,38.407618
a444,43.047543
a520,38.810366
a644,32.978522
c458,38.342008
c758,33.397653
d251,37.311236
e662,40.659006
g791,29.128472
i177,33.562402


In [111]:
infubinol_df=calculation.loc['Infubinol']
infubinol_df = pd.DataFrame(infubinol_df)
infubinol_df

Unnamed: 0_level_0,Tumor_Volume
Mouse ID,Unnamed: 1_level_1
a203,67.973419
a251,65.525743
a577,57.031862
a685,66.083066
c139,72.226731
c326,36.321346
c895,60.969711
e476,62.435404
f345,60.918767
i386,67.289621


In [112]:
ceftamin_df=calculation.loc['Ceftamin']
ceftamin_df = pd.DataFrame(ceftamin_df)
ceftamin_df

Unnamed: 0_level_0,Tumor_Volume
Mouse ID,Unnamed: 1_level_1
a275,62.999356
b447,45.0
b487,56.057749
b759,55.742829
f436,48.722078
h531,47.784682
j296,61.849023
k210,68.923185
l471,67.748662
l490,57.918381


In [113]:
# Put treatments into a list for for loop (and later for plot labels)
Treatment_list=['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']  

In [114]:
# Create empty list to fill with tumor vol data (for plotting)
Tumor_Volume_data=[]

In [115]:
# Calculate the IQR and quantitatively determine if there are any potential outliers. 
Capomulin = capomulin_df['Tumor_Volume']
Capomulin_quartiles = Capomulin.quantile([.25,.5,.75])
Capomulin_lowerq = Capomulin_quartiles[0.25]
Capomulin_upperq = Capomulin_quartiles[0.75]
Capomulin_iqr = Capomulin_upperq-Capomulin_lowerq

print(f"The lower quartile of Capomulin is: {Capomulin_lowerq}")
print(f"The upper quartile of Capomulin is: {Capomulin_upperq}")
print(f"The interquartile range of Capomulin is: {Capomulin_iqr}")
print(f"The the median of Capomulin is: {Capomulin_quartiles[0.5]} ")

Capomulin_lower_bound = Capomulin_lowerq - (1.5*Capomulin_iqr)
Capomulin_upper_bound = Capomulin_upperq + (1.5*Capomulin_iqr)
print(f"Values below {Capomulin_lower_bound} could be outliers.")
print(f"Values above {Capomulin_upper_bound} could be outliers.")

The lower quartile of Capomulin is: 32.37735684
The upper quartile of Capomulin is: 40.1592203
The interquartile range of Capomulin is: 7.781863460000004
The the median of Capomulin is: 38.125164399999996 
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.


In [116]:
# Calculate the IQR and quantitatively determine if there are any potential outliers. 
Ramicane = ramicane_df['Tumor_Volume']
Ramicane_quartiles = Ramicane.quantile([.25,.5,.75])
Ramicane_lowerq = Ramicane_quartiles[0.25]
Ramicane_upperq = Ramicane_quartiles[0.75]
Ramicane_iqr = Ramicane_upperq-Ramicane_lowerq

print(f"The lower quartile of Ramicane is: {Ramicane_lowerq}")
print(f"The upper quartile of Ramicane is: {Ramicane_upperq}")
print(f"The interquartile range of Ramicane is: {Ramicane_iqr}")
print(f"The the median of Ramicane is: {Ramicane_quartiles[0.5]} ")

Ramicane_lower_bound = Ramicane_lowerq - (1.5*Ramicane_iqr)
Ramicane_upper_bound = Ramicane_upperq + (1.5*Ramicane_iqr)
print(f"Values below {Ramicane_lower_bound} could be outliers.")
print(f"Values above {Ramicane_upper_bound} could be outliers.")

The lower quartile of Ramicane is: 31.56046955
The upper quartile of Ramicane is: 40.65900627
The interquartile range of Ramicane is: 9.098536719999998
The the median of Ramicane is: 36.56165229 
Values below 17.912664470000003 could be outliers.
Values above 54.30681135 could be outliers.


In [117]:
# Calculate the IQR and quantitatively determine if there are any potential outliers. 
Infubinol = infubinol_df['Tumor_Volume']
Infubinol_quartiles = Infubinol.quantile([.25,.5,.75])
Infubinol_lowerq = Infubinol_quartiles[0.25]
Infubinol_upperq = Infubinol_quartiles[0.75]
Infubinol_iqr = Infubinol_upperq-Infubinol_lowerq

print(f"The lower quartile of Infubinol is: {Infubinol_lowerq}")
print(f"The upper quartile of Infubinol is: {Infubinol_upperq}")
print(f"The interquartile range of Infubinol is: {Infubinol_iqr}")
print(f"The the median of Infubinol is: {Infubinol_quartiles[0.5]} ")

Infubinol_lower_bound = Infubinol_lowerq - (1.5*Infubinol_iqr)
Infubinol_upper_bound = Infubinol_upperq + (1.5*Infubinol_iqr)
print(f"Values below {Infubinol_lower_bound} could be outliers.")
print(f"Values above {Infubinol_upper_bound} could be outliers.")

The lower quartile of Infubinol is: 54.04860769
The upper quartile of Infubinol is: 65.52574285
The interquartile range of Infubinol is: 11.477135160000003
The the median of Infubinol is: 60.16518046 
Values below 36.83290494999999 could be outliers.
Values above 82.74144559000001 could be outliers.


In [118]:
# Calculate the IQR and quantitatively determine if there are any potential outliers. 
Ceftamin = ceftamin_df['Tumor_Volume']
Ceftamin_quartiles = Ceftamin.quantile([.25,.5,.75])
Ceftamin_lowerq = Ceftamin_quartiles[0.25]
Ceftamin_upperq = Ceftamin_quartiles[0.75]
Ceftamin_iqr = Ceftamin_upperq-Ceftamin_lowerq

print(f"The lower quartile of Ceftamin is: {Ceftamin_lowerq}")
print(f"The upper quartile of Ceftamin is: {Ceftamin_upperq}")
print(f"The interquartile range of Ceftamin is: {Ceftamin_iqr}")
print(f"The the median of Ceftamin is: {Ceftamin_quartiles[0.5]} ")

Ceftamin_lower_bound = Ceftamin_lowerq - (1.5*Ceftamin_iqr)
Ceftamin_upper_bound = Ceftamin_upperq + (1.5*Ceftamin_iqr)
print(f"Values below {Ceftamin_lower_bound} could be outliers.")
print(f"Values above {Ceftamin_upper_bound} could be outliers.")

The lower quartile of Ceftamin is: 48.72207785
The upper quartile of Ceftamin is: 64.29983003
The interquartile range of Ceftamin is: 15.577752179999997
The the median of Ceftamin is: 59.85195552 
Values below 25.355449580000002 could be outliers.
Values above 87.66645829999999 could be outliers.


In [119]:
# Add column to each dataframe
capomulin_final_df=capomulin_df['Drug_Regimen']="Capomulin"
capomulin_final_df

'Capomulin'

In [120]:
# Add column to each dataframe
ramicane_final_df= ramicane_df['Drug_Regimen']="Ramicane"
ramicane_final_df

'Ramicane'

In [121]:
# Add column to each dataframe
infubinol_final_df = infubinol_df['Drug_Regimen']="Infubinol"
infubinol_final_df

'Infubinol'

In [122]:
# Add column to each dataframe
ceftamin_final_df=ceftamin_df['Drug_Regimen']="Ceftamin"
ceftamin_final_df

'Ceftamin'

In [123]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
# Capomulin, Ramicane, Infubinol, and Ceftamin


In [124]:
fig1, ax1 = plt.subplots()
ax1.set_title('Capomulin Tumor Volume (mm3)')
ax1.boxplot(capomulin_df)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [None]:
fig1, ax1 = plt.subplots()
ax1.set_title('Ramicane Tumor Volume (mm3)')
ax1.boxplot(ramicane_df)

In [None]:
fig1, ax1 = plt.subplots()
ax1.set_title('Infubinol Tumor Volume (mm3)')
ax1.boxplot(infubinol_df)

In [None]:
fig1, ax1 = plt.subplots()
ax1.set_title('Ceftamine Tumor Volume (mm3)')
ax1.boxplot(ceftamine_df)


## Line and Scatter Plots

In [None]:
clean_df

In [None]:
# Create a dataframe for Capomulin data
capomulin_data = clean_df.loc[clean_df['Drug_Regimen'] == 'Capomulin']

# View the dataframe
capomulin_data

In [None]:
# Groupby timepoint and then get the mean of the tumor volume and 
# sem just for kicks (since no scientist would plot a line graph without error bars)

capomulin_time = capomulin_data.groupby(['Timepoint']).agg(\
        # Get the mean of the 'Tumor Volume (mm3)' column\
        Tumor_Vol_Mean=('Tumor_Volume', np.mean),\
        # Get the standard error of measurement (SEM) of the 'Tumor Volume (mm3)' column\
        Tumor_Vol_SEM=('Tumor_Volume', st.sem)\
).round(3)

# View the groupby dataframe 
capomulin_time.head(20)

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
time = list(capomulin_time.index.values)

In [None]:
# Use errorbar plot from matplotlib
plt.errorbar(time, 
             capomulin_time['Tumor_Vol_Mean'],
             yerr=capomulin_time['Tumor_Vol_SEM'],
             label="Time Series of Tumor Volume for Capomulin",
             fmt="bs--", 
             linewidth=3)


In [None]:
# Add the descriptive title, x labels and y labels
plt.title("Time Series of Tumor Volume for Capomulin")
plt.xlabel("Time in days")
plt.ylabel("Tumor Volume (mm3)")

In [None]:

# Set x and y limits 
plt.xlim(min(time)-max(time)*0.05, max(time)*1.05)
plt.ylim(min(capomulin_time['Tumor_Vol_Mean'])*0.95, max(capomulin_time['Tumor_Vol_Mean'])*1.05)
plt.rcParams["figure.figsize"] = [9,8]


plt.show()


In [None]:
# Save the figure
plt.savefig("../Images/LineChart.png")

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
capomulin_data.head()

In [None]:
# Groupby Mouse ID using .agg() method 
capomulin_mouse_id = capomulin_data.groupby(['Mouse_ID']).agg(\
        #Mean of the 'Tumor Volume (mm3)' column\
        Mouse_weight=('Weight (g)', np.mean),\
        # SEM of the 'Tumor Volume (mm3)' column\
        Tumor_vol_mean=('Tumor_Volume', np.mean)\
).round(3)
capomulin_mouse_id.head()

In [None]:
#scatter plot
plt.scatter(
    capomulin_mouse_id['Mouse_weight'],
    capomulin_mouse_id['Tumor_vol_mean'],
    marker='o',
    facecolors='blue',
    edgecolors='black',
    s=capomulin_mouse_id['Tumor_vol_mean'],
    alpha=.75)

In [None]:
# Create a title, x label, and y label 
plt.title("Mouse weight vs. Avg. Tumor Volume")
plt.xlabel("Mouse weight (g)")
plt.ylabel("Tumor Volume (mm3)")
# Save the figure
plt.savefig("../Images/ScatterWeightTumorVol.png")

plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
correlation = st.pearsonr(capomulin_mouse_id['Mouse_weight'],capomulin_mouse_id['Tumor_vol_mean'])
print(f"The correlation between mouse weight and average tumor volume is {round(correlation[0],2)}")


In [None]:
# Print out the r-squared value along with the plot.
x_values = capomulin_mouse_id['Mouse_weight']
y_values = capomulin_mouse_id['Tumor_vol_mean']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
#line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
line_eq = f'y = {str(round(slope,2))}x + {str(round(intercept,2))}'
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(17,37),fontsize=15,color="black")
plt.title("Mouse weight vs. Avg. Tumor Volume")
plt.xlabel("Mouse weight (g)")
plt.ylabel("Tumor Volume (mm3)")


# Save the figure
plt.savefig("../Images/ScatterRegression.png")

plt.show()

In [None]:
print(f"The r-squared is: {rvalue}")
print(f"The equation of the regression line is: {line_eq}")