# In [None]:  (Jupyter cell marker left over from the notebook export)
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files (relative to the working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables on their shared "Mouse ID" column
# (DataFrame.merge defaults to an inner join, same as pd.merge)
merged_data = mouse_metadata.merge(study_results, on="Mouse ID")

# Report how many distinct mice appear in the merged table
mouse_id_column = merged_data['Mouse ID']
unique_mice = mouse_id_column.nunique()
print(f"Number of unique mice IDs: {unique_mice}")

# Check for duplicate time points.
# keep=False flags *every* row of a repeated (Mouse ID, Timepoint) pair, so
# both conflicting records are shown instead of only the later one.
duplicate_mask = merged_data.duplicated(['Mouse ID', 'Timepoint'], keep=False)
duplicate_timepoints = merged_data[duplicate_mask]
duplicate_mouse_ids = duplicate_timepoints['Mouse ID'].unique()
print("Mouse ID with duplicate time points:")
print(duplicate_timepoints)

# Create a new DataFrame with the affected mice removed entirely.
# When a mouse has two records for the same timepoint there is no way to tell
# which one is correct, so all of its rows are dropped. (drop_duplicates would
# keep the first record of each pair, which never removes a mouse and leaves
# the "updated" count below identical to the original count.)
cleaned_data = merged_data[~merged_data['Mouse ID'].isin(duplicate_mouse_ids)]

# Display the updated number of unique mice IDs
updated_unique_mice = cleaned_data['Mouse ID'].nunique()
print(f"Updated number of unique mice IDs: {updated_unique_mice}")

# Summary statistics of tumor volume for each drug regimen:
# mean, median, variance, standard deviation and standard error of the mean
tumor_volume_by_regimen = cleaned_data.groupby('Drug Regimen')['Tumor Volume (mm3)']
statistic_names = ['mean', 'median', 'var', 'std', 'sem']
summary_statistics = tumor_volume_by_regimen.agg(statistic_names)
print(summary_statistics)

# Bar Charts
# Count distinct mice per regimen. value_counts() on the regimen column would
# count rows (one per mouse per timepoint), which does not match the
# "Number of Mice" titles and axis label used below.
mice_per_regimen = (
    cleaned_data.groupby('Drug Regimen')['Mouse ID']
    .nunique()
    .sort_values(ascending=False)
)

# Pandas version; axes labelled to match the Matplotlib version below
bar_chart_pandas = mice_per_regimen.plot(kind='bar', title='Number of Mice per Drug Regimen (Pandas)')
bar_chart_pandas.set_xlabel('Drug Regimen')
bar_chart_pandas.set_ylabel('Number of Mice')
plt.show()

# Matplotlib version of the same chart
bar_chart_matplotlib = mice_per_regimen
plt.bar(bar_chart_matplotlib.index, bar_chart_matplotlib.values)
plt.title('Number of Mice per Drug Regimen (Matplotlib)')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Mice')
plt.xticks(rotation=45)
plt.show()

# Pie Charts
# Keep one row per mouse so each animal is counted once; counting the raw rows
# would weight a mouse by its number of recorded timepoints.
# NOTE(review): assumes 'Sex' is constant for a given Mouse ID - confirm.
sex_counts = cleaned_data.drop_duplicates('Mouse ID')['Sex'].value_counts()

# Pandas version
pie_chart_pandas = sex_counts.plot.pie(autopct='%1.1f%%', startangle=90, title='Distribution of Mice by Sex (Pandas)')
# pandas labels the y-axis of a pie chart with the Series name; clear it
pie_chart_pandas.set_ylabel('')
plt.show()

# Matplotlib version of the same chart
pie_chart_matplotlib = sex_counts
plt.pie(pie_chart_matplotlib, labels=pie_chart_matplotlib.index, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Mice by Sex (Matplotlib)')
plt.show()

# Final tumor volume of each mouse:
# 1) find the greatest timepoint recorded for every mouse,
# 2) join back to the cleaned data to pull the full row at that timepoint.
last_timepoints = (
    cleaned_data.groupby('Mouse ID')['Timepoint']
    .max()
    .reset_index()
)
final_tumor_volume = last_timepoints.merge(cleaned_data, on=['Mouse ID', 'Timepoint'])

# List of treatment names
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Final tumor volumes for each treatment, in the same order as `treatments`
tumor_volume_data = [
    final_tumor_volume.loc[final_tumor_volume['Drug Regimen'] == treatment, 'Tumor Volume (mm3)']
    for treatment in treatments
]

# Determine outliers with the 1.5 * IQR rule: a value is a potential outlier
# when it lies more than 1.5 interquartile ranges outside the quartiles.
for treatment, volumes in zip(treatments, tumor_volume_data):
    lower_quartile = volumes.quantile(0.25)
    upper_quartile = volumes.quantile(0.75)
    iqr = upper_quartile - lower_quartile
    lower_bound = lower_quartile - 1.5 * iqr
    upper_bound = upper_quartile + 1.5 * iqr
    outliers = volumes[(volumes < lower_bound) | (volumes > upper_bound)]
    print(f"Potential outliers for {treatment}: {outliers}")

# Box Plot
# Outlier markers are drawn as red squares.
plt.boxplot(tumor_volume_data, flierprops={'markerfacecolor': 'r', 'marker': 's'})
# Label the boxes through the x ticks rather than boxplot's `labels` keyword,
# which is deprecated since Matplotlib 3.9 (renamed `tick_labels`); this form
# works on both old and new releases. Boxes sit at positions 1..N by default.
plt.xticks(range(1, len(treatments) + 1), treatments)
plt.title('Final Tumor Volume for Top Treatment Regimens')
plt.xlabel('Drug Regimen')
plt.ylabel('Final Tumor Volume (mm3)')
plt.show()

# Line Plot: tumor volume over time for one Capomulin-treated mouse (s185)
on_capomulin = cleaned_data['Drug Regimen'] == 'Capomulin'
is_mouse_s185 = cleaned_data['Mouse ID'] == 's185'
capomulin_mouse = cleaned_data[on_capomulin & is_mouse_s185]

timepoints = capomulin_mouse['Timepoint']
tumor_volumes = capomulin_mouse['Tumor Volume (mm3)']
plt.plot(timepoints, tumor_volumes, marker='o')
plt.title('Capomulin Treatment: Tumor Volume vs Timepoint for Mouse s185')
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

# Scatter Plot: per-mouse weight against per-mouse mean tumor volume (Capomulin only)
capomulin_data = cleaned_data[cleaned_data['Drug Regimen'] == 'Capomulin']

# Average both measurements for every mouse in a single groupby pass
per_mouse_means = capomulin_data.groupby('Mouse ID')[['Tumor Volume (mm3)', 'Weight (g)']].mean()
average_tumor_volume = per_mouse_means['Tumor Volume (mm3)']
mouse_weight = per_mouse_means['Weight (g)']

plt.scatter(mouse_weight, average_tumor_volume)
plt.title('Capomulin Treatment: Mouse Weight vs Average Tumor Volume')
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()

# Pearson correlation between mouse weight and average tumor volume;
# the first element of the result is the correlation coefficient.
correlation = st.pearsonr(mouse_weight, average_tumor_volume)
print(f"Correlation between mouse weight and average tumor volume: {correlation[0]}")

# Least-squares fit of average tumor volume as a linear function of weight
slope, intercept, r_value, p_value, std_err = st.linregress(mouse_weight, average_tumor_volume)

# Tumor volume predicted by the fitted line at each observed weight
fitted_tumor_volume = slope * mouse_weight + intercept

# Draw the observations and overlay the fitted line in red
plt.scatter(mouse_weight, average_tumor_volume)
plt.plot(mouse_weight, fitted_tumor_volume, color='red')
plt.title('Capomulin Treatment: Linear Regression Model')
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.show()
