## 1. Import Libraries

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
import random

## 2. Load Data

In [None]:
# Location of the life-expectancy CSV, relative to the notebook's working directory.
data_path = "../input/life-expectancy-who/Life Expectancy Data.csv"
df = pd.read_csv(data_path)

## 3. Understanding the data

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()  

In [None]:
# Preview the first five rows (head() default) of the raw data.
df.head()

In [None]:
# Split the column labels by dtype: numeric columns vs. everything else.
numeric_part = df.select_dtypes(include=np.number)
num_col = numeric_part.columns
print("Numerical columns: \n", num_col)

non_numeric_part = df.select_dtypes(exclude=np.number)
cat_col = non_numeric_part.columns
print("Categorical columns: \n", cat_col)

## 4. Data Pre-processing

In [None]:
# Strip leading/trailing whitespace from every column label so that later
# lookups by clean name (e.g. 'Life expectancy') match.
df = df.rename(columns=str.strip)

In [None]:
# Encode the non-numeric 'Status' column as integer class codes so it can be
# used as a regression feature.
label_encoder = preprocessing.LabelEncoder()
status_codes = label_encoder.fit_transform(df['Status'])
df['Status'] = status_codes

df.head()

In [None]:
# Missing-value count per column, followed by the (rows, columns) shape.
missing_counts = df.isna().sum()
print(missing_counts)

frame_shape = df.shape
print(frame_shape)

In [None]:
# Replace missing values with the column mean for every column except the
# non-numeric 'Country' label.
#
# The filled column is assigned back explicitly instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place call operates on a
# temporary selection, is deprecated in pandas 2.x, and leaves `df` unchanged
# under Copy-on-Write, which would let NaNs reach the model.
#
# NOTE(review): this also imputes the target ('Life expectancy') and computes
# the means before the train/test split; confirm that is intended.
for col in df.columns.drop('Country'):
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# Preview the first five rows again after the imputation step.
df.head()

In [None]:
# Re-check the per-column missing-value counts after imputation.
remaining_missing = df.isna().sum()
print(remaining_missing)

## 5. Exploratory Data Analysis

In [None]:
# Check the distribution of the target variable (Life expectancy) for
# spread and outliers.
plt.figure(figsize=(8, 8), dpi=80)
# Pass the series by keyword: positional data arguments are deprecated in
# seaborn, and newer releases interpret the first positional argument as
# `data`, which can change the plot orientation. `x=` pins a horizontal box.
sns.boxplot(x=df['Life expectancy'])
plt.title('Life expectancy Box Plot')
plt.show()

In [None]:
# Histogram with a KDE overlay of the target variable.
plt.figure(figsize=(8, 8))
plt.title('Life expectancy Distribution Plot')
# `sns.distplot` is deprecated and scheduled for removal; this is the
# documented `histplot` equivalent (density-normalised bars plus a KDE curve
# extended past the data range, with distplot's translucent bar styling).
sns.histplot(
    x=df['Life expectancy'],
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    alpha=.4,
    edgecolor=(1, 1, 1, .4),
)
plt.show()

In [None]:
# Recompute the numeric / non-numeric column lists; 'Status' was integer-encoded
# earlier in the notebook, so the split differs from the first listing.
numeric_part = df.select_dtypes(include=np.number)
num_col = numeric_part.columns
print("Numerical columns: \n", num_col)

non_numeric_part = df.select_dtypes(exclude=np.number)
cat_col = non_numeric_part.columns
print("Categorical columns: \n", cat_col)

In [None]:
# Check for multicollinearity among the features by inspecting the
# correlation matrix of the numeric columns.
corr_matrix = df[num_col].corr()

plt.figure(figsize=(15, 15))
# Diverging palette centred on 0 so positive and negative correlations
# are visually distinct.
p = sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', center=0)

In [None]:
# Pairwise scatter plots to inspect the relationships between the numeric features.
numeric_frame = df[num_col]
ax = sns.pairplot(numeric_frame)

## 6. Model Building

In [None]:
# Train/test split: the target is 'Life expectancy'; 'Country' is dropped from
# the predictors because it is a non-numeric label.
target_column = 'Life expectancy'
y = df[[target_column]]
X = df.drop(columns=[target_column, 'Country'])

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Sanity-check the split sizes: train/test features, then train/test targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
model = LinearRegression()

In [None]:
model.fit(X, y)

In [None]:
r_sq = model.score(X, y)
print('coefficient of determination:', r_sq)

In [None]:
print('intercept:', model.intercept_)

In [None]:
print('slope:', model.coef_)