# Brain weight in humans Regression

Going to take the following approach:

1. Problem definition
2. Data
3. Evaluation
4. Features
5. Modelling
6. Model Evaluation
7. Experimentation / Improvements

# 1. Problem Definition

How can we use various Python-based machine learning models and the given parameters to predict the weight of a human brain?

# 2. Data

Data from: https://www.kaggle.com/anubhabswain/brain-weight-in-humans

## Context

This dataset was compiled using a medical study conducted on a group of people.

## Content

This dataset shows a few variations of head sizes and brain masses; it also contains additional gender and age group columns.


# 3. Evaluation

As this is a regression problem, we will use the root mean square error (RMSE) for evaluating the model.

# 4. Features

## Inputs / Features

    1. Gender - 1 represents Male , 2 represents Female
    2. Age Range - 1 represents >18 years of age , 2 represents <18 years of age
    3. Head Size(cm^3) - Head volume in cubic centimetres

## Output / Label
    4. Brain Weight(grams) - Mass of brains in grams

## Standard Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Reading the DataSet

In [None]:
# Local
# df = pd.read_csv('Data/dataset.csv')

# Kaggle
df = pd.read_csv('/kaggle/input/brain-weight-in-humans/dataset.csv')
df.head()

## Data Exploration

In [None]:
df

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
plt.figure(figsize=(20,10))
plt.title('Histogram of Brain Weight(grams)')
sns.histplot(data=df, x='Brain Weight(grams)', kde=True, bins=15);

In [None]:
plt.figure(figsize=(20,10))
plt.title('Histogram of Brain Weight(grams), colored by Gender')
sns.histplot(data=df, x='Brain Weight(grams)',hue='Gender', kde=True, bins=15);

In [None]:
plt.figure(figsize=(20,10))
plt.title('Plot of Head Size(cm^3) vs Brain Weight(grams), colored by Gender')
sns.scatterplot(data=df, x='Brain Weight(grams)',y='Head Size(cm^3)', hue='Gender',s=150);

In [None]:
plt.figure(figsize=(20,10))
plt.title('Plot of Head Size(cm^3) vs Brain Weight(grams), colored by Age Group')
sns.scatterplot(data=df, x='Brain Weight(grams)',y='Head Size(cm^3)', hue='Age Range',s=150);

## Outlier detection

In [None]:
plt.figure(figsize=(20,10))
plt.title('Boxplot of Brain Weight(grams)')
sns.boxplot(data=df, x='Brain Weight(grams)');

In [None]:
plt.figure(figsize=(20,10))
plt.title('Boxplot of Head Size(cm^3)')
sns.boxplot(data=df, x='Head Size(cm^3)');

In [None]:
df[(df['Brain Weight(grams)']<1000) | (df['Brain Weight(grams)'] > 1570)]

In [None]:
df[df['Head Size(cm^3)'] > 4600]

The boxplots show that there are outliers in this dataset.
As we are not domain experts in this field, we will keep them in the dataset, since the data still appears to follow a linear relationship.

# 5. Modelling

In [None]:
X = df.drop('Brain Weight(grams)', axis=1)
y = df['Brain Weight(grams)']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Import Models

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

## Baseline Models and Scores

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and evaluate it on the test split.

    Args:
        models: dict mapping a display name to an estimator exposing
            ``fit``, ``score`` and ``predict``. The estimators are fitted
            in place.
        X_train, X_test: training / test feature matrices.
        y_train, y_test: training / test targets.

    Returns:
        A 3-tuple of single-column DataFrames indexed by model name, each
        sorted ascending by its column: the estimator's own ``score``
        (column ``'Score'``), the root mean squared error (column
        ``'RSME'``) and the R2 score (column ``'R2'``).
    """
    # Reset NumPy's global RNG before any fitting happens.
    np.random.seed(42)

    def _ranked(values, label):
        # One row per model, a single column named `label`, ascending order.
        return pd.DataFrame(values, index=[label]).transpose().sort_values(label)

    score_by_name, rmse_by_name, r2_by_name = {}, {}, {}

    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        score_by_name[label] = estimator.score(X_test, y_test)

        predictions = estimator.predict(X_test)
        rmse_by_name[label] = np.sqrt(mean_squared_error(y_test, predictions))
        r2_by_name[label] = r2_score(y_test, predictions)

    # NOTE: the 'RSME' spelling is kept as-is because callers select the
    # column by that exact name.
    return (
        _ranked(score_by_name, 'Score'),
        _ranked(rmse_by_name, 'RSME'),
        _ranked(r2_by_name, 'R2'),
    )

In [None]:
# Baseline candidates, all with default hyperparameters (the boosting
# libraries only get an explicit objective / silenced logging).
models = {
    # Regularised linear models
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    # Instance-based and kernel models
    'KNeighborsRegressor': KNeighborsRegressor(),
    'SVR': SVR(),
    # Single tree and scikit-learn ensembles
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    # Third-party gradient-boosting libraries
    'XGBRegressor': XGBRegressor(objective='reg:squarederror'),
    'XGBRFRegressor': XGBRFRegressor(objective='reg:squarederror'),
    'CatBoostRegressor': CatBoostRegressor(verbose=0),
}

In [None]:
model_scores_baseline, model_rsme_baseline, model_r2_baseline = fit_and_score(models, X_train, X_test, y_train, y_test)

In [None]:
model_scores_baseline

In [None]:
model_rsme_baseline.sort_values('RSME', ascending=False)

In [None]:
model_r2_baseline

We will use the Ridge model and tune its hyperparameters to see how it performs.

## Grid CV search

In [None]:
from sklearn.model_selection import GridSearchCV
from warnings import filterwarnings

In [None]:
filterwarnings('ignore')

In [None]:
def gridsearch_cv_scores(models, params, X_train, X_test, y_train, y_test):
    """Grid-search each model with 5-fold CV and report its test-set score.

    Args:
        models: dict mapping a display name to an estimator.
        params: dict mapping the same names to the ``param_grid`` to search
            for that estimator (an empty dict searches only the defaults).
        X_train, y_train: data used for the cross-validated search.
        X_test, y_test: held-out data used for the reported score.

    Returns:
        A 2-tuple ``(scores, best_params)``. ``scores`` is a single-column
        DataFrame indexed by model name, sorted ascending, whose column is
        named after the scoring metric (``'neg_root_mean_squared_error'``).
        ``best_params`` maps each model name to the search's
        ``best_params_``.
    """
    # Reset NumPy's global RNG before any fitting happens.
    np.random.seed(42)

    # Single source of truth for the metric: the same string drives the
    # search AND labels the result column. The column used to be labelled
    # 'neg_mean_squared_error', which misreported the RMSE values as MSE.
    scoring = 'neg_root_mean_squared_error'

    model_gs_scores = {}
    model_gs_best_param = {}

    for name, model in models.items():
        gs_model = GridSearchCV(model,
                                param_grid=params[name],
                                scoring=scoring,
                                n_jobs=-1,
                                cv=5,
                                verbose=2)

        gs_model.fit(X_train, y_train)

        # With `scoring` set, GridSearchCV.score evaluates that scorer on the
        # given data, so this is the negated RMSE on the test split.
        model_gs_scores[name] = gs_model.score(X_test, y_test)
        model_gs_best_param[name] = gs_model.best_params_

    model_gs_scores = pd.DataFrame(model_gs_scores, index=[scoring])
    model_gs_scores = model_gs_scores.transpose().sort_values(scoring)

    return model_gs_scores, model_gs_best_param

### Baseline Grid Search CV

In [None]:
models = {'Ridge': Ridge()}
params = {'Ridge':{}}

In [None]:
model_gs_scores_base, model_gs_best_param_base =  gridsearch_cv_scores(models, params, X_train, X_test, y_train, y_test)

In [None]:
model_gs_scores_base

### GS model 1

In [None]:
models = {'Ridge': Ridge()}

# NOTE: `normalize` is deliberately not part of the grid. The parameter was
# removed from Ridge in scikit-learn 1.2, so including it makes every fit in
# the search fail on current versions. The features are already standardised
# with StandardScaler before modelling.
params = {'Ridge': {'alpha': np.arange(0, 1, 0.01),
                    'fit_intercept': [True, False],
                    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
          }

In [None]:
model_gs_scores_1, model_gs_best_param_1 =  gridsearch_cv_scores(models, params, X_train, X_test, y_train, y_test)

In [None]:
model_gs_scores_1

In [None]:
model_gs_best_param_1

Since there is no improvement from the grid search CV, we will use the base model and evaluate it.

# 6. Model Evaluation

In [None]:
model = Ridge()
model.fit(X_train, y_train)
y_preds = model.predict(X_test)

## Feature Importance

In [None]:
feat_importance = pd.DataFrame(model.coef_, index=X.columns)

In [None]:
feat_importance

In [None]:
plt.figure(figsize=(20,10))
plt.title('Feature Importance')
sns.barplot(data= feat_importance.sort_values(0).T);

## Metrics Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
r2 = r2_score(y_test,y_preds)
mae = mean_absolute_error(y_test, y_preds)
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

In [None]:
print(f'R2 Score: {r2}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Square Error: {mse}')
print(f'Root Mean Square Error: {rmse}')