In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the chick-weight measurements from the dataset's CSV file.
chick_weight_csv = '../input/weight-vs-age-of-chicks-on-different-diets/ChickWeight.csv'
df = pd.read_csv(chick_weight_csv)

# Initial analysis:

In [None]:
# Number of rows and columns in the loaded frame.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Linear fit of weight against time, one regression line per diet.
# The grid has a single facet, so its only axes receives the title.
trend_grid = sns.lmplot(x='Time', y='weight', hue='Diet', data=df)
trend_grid.axes.flat[0].set_title('weight x time')

# Are the initial and final weights balanced across the diet groups?

In [None]:
# Keep only the measurements taken at the earliest time point.
first_time = df['Time'].min()
initial_df = df.loc[df['Time'].eq(first_time)]
initial_df.head()

In [None]:
# Number of rows (one per chick at the first time point) for each diet.
sns.countplot(data=initial_df, x='Diet')
# Open question: why are there more chickens on diet 1?

In [None]:
# Spread of the starting weights within each diet group.
sns.boxplot(data=initial_df, x='Diet', y='weight')

In [None]:
# Histogram of the starting weights, coloured by diet, drawn on the
# current axes so that the title and the bars share one figure.
initial_hist_ax = plt.gca()
initial_hist_ax.set_title('Initial weight distribution')
sns.histplot(x='weight', hue='Diet', data=initial_df, ax=initial_hist_ax)

In [None]:
# Summary statistics of the starting weight for each diet.
initial_df.groupby('Diet')['weight'].describe()

## What can we conclude:
* Every diet group has the same minimum weight, and the maximum weights are very close
* The chickens on diet 1 seem to be heavier on average
* Diet 1 has 20 chickens while each of the other diets has only 10

## How did the chickens end up at the end of the diet?

In [None]:
# Keep only the measurements taken at the latest time point.
last_time = df['Time'].max()
final_df = df.loc[df['Time'].eq(last_time)]
final_df.head()

In [None]:
# Number of rows per diet at the last time point, drawn on the current
# axes so that the title and the bars share one figure.
final_count_ax = plt.gca()
final_count_ax.set_title('Some chicken are missing')
sns.countplot(x='Diet', data=final_df, ax=final_count_ax)

In [None]:
# Spread of the final weights within each diet group.
sns.boxplot(data=final_df, x='Diet', y='weight')

In [None]:
# Histogram of the final weights, coloured by diet, drawn on the
# current axes so that the title and the bars share one figure.
final_hist_ax = plt.gca()
final_hist_ax.set_title('Final weight distribution')
sns.histplot(x='weight', hue='Diet', data=final_df, ax=final_hist_ax)

The number of chickens on the final day dropped in groups 1 and 4.

# The problem of the missing chickens:
  * Some chickens are missing on the last day (what happened to them? did they die? did the diet influence that?)
  * Is this disappearance biasing our analysis?

In [None]:
# Group the measurements by chick. Despite the name, this is a GroupBy
# object (the result of df.groupby), not a DataFrame.
chick_df = df.groupby('Chick')

In [None]:
# Average number of recorded time points per chick.
records_per_chick = chick_df['Time'].count()
records_per_chick.mean()

If every chicken had been measured at all 12 time points, the mean of the count would be 12.

As we can see, some chickens appear fewer than 12 times in the dataset.

We need to remove these chickens and keep only the ones that were measured during the whole period.

In [None]:
# Flag, in group order, whether each chick has exactly 12 measurements.
# Converting to a bare array discards the Chick labels, so the resulting
# frame is indexed by position and has a single column named 0.
has_all_records = chick_df['Time'].count() == 12
present = pd.DataFrame(has_all_records.to_numpy())

In [None]:
# IDs of the chicks that do not have one measurement per distinct time point.
# The IDs are read directly from the group labels rather than being rebuilt
# as "row position + 1", which is only correct when the IDs are exactly
# 1..n with no gaps and sort in that order. The expected number of records
# is derived from the data instead of being hard-coded.
records_per_chick = chick_df['Time'].count()
expected_records = df['Time'].nunique()
missing_chicken = records_per_chick[records_per_chick != expected_records].index.to_numpy()
missing_chicken

In [None]:
# Plain-list copy of the elements of missing_chicken.
missing_list = [*missing_chicken]

In [None]:
# True for every row whose chick is NOT listed in missing_chicken.
# A vectorised membership test replaces the Python-level lambda that was
# evaluated once per row.
df['Present'] = ~df['Chick'].isin(missing_chicken)

In [None]:
# Rows flagged as present. The explicit copy makes this an independent
# frame, so modifying it later does not trigger SettingWithCopyWarning.
present_chicken = df[df['Present']].copy()

In [None]:
# Sanity check: count the values of the 'Present' flag in the filtered frame
# to confirm that only present chickens are listed.
present_chicken['Present'].value_counts()

In [None]:
# The helper flag is no longer needed. Rebinding to the result (instead of
# dropping in place on a filtered slice) avoids SettingWithCopyWarning, and
# errors='ignore' keeps the cell safe to re-run.
present_chicken = present_chicken.drop(columns='Present', errors='ignore')

# Now we can analyse without bias:

In [None]:
# Linear fit of weight against time per diet, on the filtered rows only.
# The grid has a single facet, so its only axes receives the title.
present_trend_grid = sns.lmplot(x='Time', y='weight', hue='Diet', data=present_chicken)
present_trend_grid.axes.flat[0].set_title('weight x time')

In [None]:
# Box plots of weight at each time point, one panel per diet, two panels per row.
sns.catplot(data=present_chicken, x="Time", y="weight", col="Diet", col_wrap=2, kind='box')

In [None]:
# Snapshots of the filtered rows at the earliest and at the latest time point.
# 'Time' is constant within each snapshot, so it is dropped immediately.
# Chaining .drop() returns new frames instead of mutating filtered slices in
# place, which can trigger SettingWithCopyWarning.
time_col = present_chicken['Time']
first_day = present_chicken[time_col == time_col.min()].drop(columns='Time')
last_day = present_chicken[time_col == time_col.max()].drop(columns='Time')

In [None]:
# Inspect which columns remain before trimming and renaming them.
first_day.columns

In [None]:
# Keep only the chick ID and its starting weight. Renaming by column name
# (instead of overwriting .columns positionally) cannot mislabel a column if
# the column order ever changes, and errors='ignore' makes the cell safe to
# re-run.
first_day = (
    first_day
    .drop(columns=['Unnamed: 0', 'Diet'], errors='ignore')
    .rename(columns={'weight': 'initial_weight'})
)

In [None]:
# Keep the final weight, the chick ID and the diet. Renaming by column name
# (instead of overwriting .columns positionally) cannot mislabel a column if
# the column order ever changes, and errors='ignore' makes the cell safe to
# re-run.
last_day = (
    last_day
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .rename(columns={'weight': 'final_weight'})
)

In [None]:
# Align the two snapshots side by side, one row per chick.
initial_by_chick = first_day.set_index('Chick')
final_by_chick = last_day.set_index('Chick')
comparing = pd.concat([initial_by_chick, final_by_chick], axis=1)

In [None]:
# Weight difference per chick between the last and the first snapshot.
comparing['weight_variation'] = comparing['final_weight'].sub(comparing['initial_weight'])

In [None]:
# Summary statistics of the weight variation for each diet.
comparing.groupby('Diet')['weight_variation'].describe()

# Results:
* Diets 3 and 4 seem to be the best ones, so we are going to compare them
* Diet 3 seems to be better on average; diet 4 is also good and has a higher minimum and a lower standard deviation, which is also desirable

Concluding: the better choice is **diet 3**, since chickens are sold by **weight** and this diet performed better on average, but diet 4 is also acceptable.