# 0) Dependencies

In [39]:
import sys
from pathlib import Path

# data processing libraries
import pandas as pd
import numpy as np
import datetime
import re

# modeling 
import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# estimators (add further algorithms here)
from sklearn.linear_model import LinearRegression


# metrics 
from scipy.optimize import least_squares
from sklearn.metrics import mean_absolute_error

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm


# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [10, 5] #change size of plot
import seaborn as sns
import plotly.express as px

# 1) Data Acquisition

A quick look at:
1. Data Structure / Data Info
2. Categorical Variables
3. Numerical Variables


In [None]:
# Load the raw dataset. The path is relative, so the CSV file must be in the
# notebook's working directory.
life_expectancy_data = pd.read_csv("Life Expectancy Data.csv")

## 1.1) A quick look at the data structure

In [21]:
# Work on a copy so the frame returned by read_csv is left unmodified.
df = life_expectancy_data.copy()

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Show the current column labels.
df.columns

In [None]:
# Normalise the column labels: trim and collapse whitespace, then title-case.
# Labels that are acronyms keep their capitalisation instead of being title-cased.
ACRONYM_COLUMNS = ('BMI', 'HIV/AIDS', 'GDP')


def clean_column_name(name):
    """Return a tidied version of a raw column label.

    Leading and trailing whitespace is removed and every internal run of
    whitespace, however long, is collapsed to a single space. The result is
    title-cased unless it is one of ``ACRONYM_COLUMNS``, in which case it is
    returned unchanged.

    Parameters
    ----------
    name : str
        Raw column label.

    Returns
    -------
    str
        The cleaned label.
    """
    # str.split() with no separator splits on any whitespace run and drops
    # empty pieces, so joining with one space both trims and collapses.
    collapsed = " ".join(name.split())
    if collapsed in ACRONYM_COLUMNS:
        return collapsed
    return collapsed.title()


trimed_col = [clean_column_name(col) for col in df.columns]
df.columns = trimed_col

# According to the data dictionary, the first thinness column represents the
# rate of thinness among people aged 10-19, so the label is corrected from
# '1-19' to '10-19'. A label that is not present is simply left untouched.
df = df.rename(columns={'Thinness 1-19 Years': 'Thinness 10-19 Years'})
df.columns

In [None]:
# Print a per-column summary (dtype and non-null count) of the frame.
df.info()

## 1.2) A quick look at the distribution of the categorical variables

In [None]:
# Frequency of each distinct value in the Country column.
df['Country'].value_counts()

In [None]:
# Frequency of each distinct value in the Status column.
df['Status'].value_counts()

## 1.3) A quick look at the distribution of numerical variables

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [40]:
# Directory, relative to the working directory, where figures are written.
IMAGES_PATH = Path("images", "life_expectancy")
# Create it up front together with any missing parents; an already existing
# directory is not an error.
IMAGES_PATH.mkdir(exist_ok=True, parents=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        Used both as the file suffix and as the ``format`` passed to ``savefig``.
    resolution : int, default 300
        Passed to ``savefig`` as ``dpi``.
    """
    if tight_layout:
        plt.tight_layout()
    target = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [None]:
# Global matplotlib text sizes, applied group by group through plt.rc.
text_sizes = {
    'font': {'size': 14},
    'axes': {'labelsize': 14, 'titlesize': 14},
    'legend': {'fontsize': 14},
    'xtick': {'labelsize': 10},
    'ytick': {'labelsize': 10},
}
for rc_group, rc_settings in text_sizes.items():
    plt.rc(rc_group, **rc_settings)

# Histograms of the frame's columns with 50 bins each, then save and display.
df.hist(figsize=(12, 8), bins=50)
save_fig("attribute_histogram_plots")
plt.show()

## 1.4) Sample a test set (kept unseen until the final evaluation)

### 1.4.1) Method 1: Randomly split the data using np.random.permutation

In [65]:
def shuffle_and_split_data(data, test_ratio):
    """Randomly partition ``data`` into a train part and a test part.

    Row positions are shuffled with ``np.random.permutation``, so the result
    depends on NumPy's global random state; seed it beforehand to get a
    repeatable split.

    Parameters
    ----------
    data : object supporting ``len()`` and ``.iloc`` (e.g. a DataFrame)
        The rows to split.
    test_ratio : float
        Fraction of rows assigned to the test part. The resulting row count
        is truncated by ``int()``.

    Returns
    -------
    tuple
        ``(train, test)``, both selected from ``data`` with ``.iloc``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    # The first n_test shuffled positions form the test part, the rest the train part.
    test_part = data.iloc[order[:n_test]]
    train_part = data.iloc[order[n_test:]]
    return train_part, test_part

In [70]:
# Seed NumPy's global generator so the shuffle below is repeatable.
np.random.seed(123)

train_set, test_set = shuffle_and_split_data(data=df, test_ratio=0.2)

# Sizes of the two splits, train first.
print(f"{len(train_set)} {len(test_set)}")

2351 587


In [None]:
# Count missing values in each column of the training split.
train_set.isnull().sum()

In [None]:
# Count missing values in each column of the test split.
test_set.isnull().sum()

### 1.4.2) Method 2: Using a hash (CRC-32 checksum) of each row's identifier

In [49]:
from zlib import crc32

def is_id_in_test_set(identifier, test_ratio):
    """Decide test-set membership from a CRC-32 checksum of the identifier.

    The identifier is converted to ``np.int64`` and checksummed with
    ``zlib.crc32``. It belongs to the test set when that checksum is below
    ``test_ratio * 2**32``, so the decision depends only on the identifier
    and the ratio, not on any random state.

    Parameters
    ----------
    identifier : value convertible to ``np.int64``
    test_ratio : float
        Fraction of the 32-bit checksum range mapped to the test set.

    Returns
    -------
    bool
    """
    checksum = crc32(np.int64(identifier))
    threshold = test_ratio * 2**32
    return checksum < threshold

def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into train and test parts using a per-row identifier.

    ``is_id_in_test_set`` is applied to every value of ``data[id_column]``;
    rows for which it returns true form the test part, all others the train
    part.

    Parameters
    ----------
    data : DataFrame-like
        Must support column selection, ``.apply`` on the column and ``.loc``
        with a boolean mask.
    test_ratio : float
        Forwarded to ``is_id_in_test_set``.
    id_column : str
        Name of the column holding the identifiers.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    test_mask = data[id_column].apply(
        lambda id_: is_id_in_test_set(id_, test_ratio)
    )
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [60]:
# reset_index() turns the current row index into a regular `index` column,
# which then serves as the row identifier for the hash-based split.
df_with_id = df.reset_index()
train_set, test_set = split_data_with_id_hash(
    data=df_with_id,
    test_ratio=0.2,
    id_column="index",
)

In [61]:
# Sizes of the two splits, train first.
print(len(train_set),len(test_set))

2348 590


In [None]:
# Count missing values in each column of the training split.
train_set.isnull().sum()

In [None]:
# Count missing values in each column of the test split.
test_set.isnull().sum()

### 1.4.3) Method 3: Using sklearn model selection

In [56]:
# train_test_split is already imported in the dependencies cell at the top of
# the notebook, so it is not re-imported here.
# test_size=0.2 holds out 20% of the rows; the fixed random_state makes the
# shuffled split repeatable across runs.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=123)

In [57]:
# Sizes of the two splits, train first.
print(len(train_set),len(test_set))

2350 588


In [None]:
# Count missing values in each column of the training split.
train_set.isnull().sum()

In [None]:
# Count missing values in each column of the test split.
test_set.isnull().sum()

# 2) Data Exploration

In this step, we conduct the following:
- Data Visualization
- Missing Data (%)
- Quantile Statistics
- Descriptive Statistics
- Correlation
- Attribute Combinations

# 3) Data Cleaning

1. Dealing with Missing data
2. Cleaning data
3. Outlier Detection
4. Data Preprocessing
5. Feature Selection
6. Feature engineering
7. Feature Scaling
8. Clustering
9. Imbalanced data

# 4) Modelling

1. Training
2. Cross-Validation
3. Fine tuning
4. Evaluate on test set