**Basic computation libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

**Imputer**

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

In [None]:
import statsmodels.api as sm
from scipy import stats
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet,ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Life expectancy dataset

In [None]:
pd.set_option('display.max_columns', 30)
# pd.set_option("max_columns", 2) #Showing only two columns

In [None]:
life_df = pd.read_csv("../input/life-expectancy-who/Life Expectancy Data.csv")

In [None]:
life_df.head(5)

In [None]:
life_df.columns

In [None]:
life_df.shape

In [None]:
life_df.info()

In [None]:
life_df.describe()

In [None]:
life_df.isnull().sum().plot(kind='bar')

# ----------------------------------------------------------------------------
# Did basic testing using different techniques

# Pandas profiling

In [None]:
# profile = ProfileReport(life_df, title = "Life expectancy report")
# profile.to_file("expectancy.html")

# Imputer

**Removing spaces in column name**

In [None]:
# Trim leading/trailing whitespace from every column label.
life_df = life_df.rename(columns=str.strip)

**Listing columns with null values (flagging those with 20% or more)**

In [None]:

# Report which columns contain nulls, flagging those where at least 20% of
# the rows are missing. Null counts are computed once for the whole frame
# instead of being recomputed for every comparison.
twenty_percent = (life_df.shape[0]/100)*20
null_counts = life_df.isnull().sum()
for col, n_null in null_counts.items():
    if n_null >= twenty_percent:
        print("20%-'",col,"'")
    elif n_null > 0:
        print("> 0 %-'", col,"'")

In [None]:
plt.hist(life_df['Adult Mortality'])

**Histogram with a density (KDE) curve overlaid**

In [None]:
# sns.distplot is deprecated; histplot with kde=True and a density-scaled
# histogram draws the equivalent histogram-plus-curve figure.
sns.histplot(life_df['Adult Mortality'], kde=True, stat="density")

In [None]:
# Non-null observations only (used for the box plot) ...
check_expectancy = life_df["Adult Mortality"].dropna()
# ... and an independent copy that still contains the NaNs (input to the imputers).
nan_expectancy = life_df["Adult Mortality"].copy(deep=True)

In [None]:
plt.boxplot(check_expectancy)

**Simple imputer**

In [None]:
# Mean-impute the NaNs; the imputer expects a 2-D (n_samples, 1) array.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
dummy_simple = imputer.fit_transform(nan_expectancy.to_numpy().reshape(-1, 1))

In [None]:
plt.boxplot(dummy_simple)

In [None]:
# sns.distplot is deprecated; use histplot with a KDE overlay instead.
# The imputer output is (n, 1), so flatten it to a single 1-D sample.
sns.histplot(dummy_simple.ravel(), kde=True, stat="density")

**KNN imputer**

In [None]:
# Impute the single 'Adult Mortality' column with a 4-nearest-neighbour imputer.
# NOTE(review): only one feature is passed in, so there is no other column to
# measure neighbour distance on — confirm this is not just mean imputation.
knnimpute = KNNImputer(n_neighbors=4)
dummy_knn = knnimpute.fit_transform(nan_expectancy.values.reshape(-1,1))

In [None]:
# sns.distplot is deprecated; use histplot with a KDE overlay instead.
# The imputer output is (n, 1), so flatten it to a single 1-D sample.
sns.histplot(dummy_knn.ravel(), kde=True, stat="density")

**Checking normal distribution**

In [None]:
# qqplot expects a 1-D sample. With an (n, 1) array the internal sort runs
# along the length-1 axis and leaves the data unsorted, so flatten first.
sm.qqplot(dummy_knn.ravel(), fit=True, line="45")

In [None]:
plt.boxplot(dummy_knn)

In [None]:
# sm.qqplot(dummy_knn, fit=True, line="45")

In [None]:
scale_data = preprocessing.scale(dummy_knn)

In [None]:
# sm.qqplot(scale_data, fit=True, line="45")

**Standard scaler**

In [None]:
standard_scale = preprocessing.StandardScaler()

In [None]:
standard_scale_data = standard_scale.fit_transform(dummy_knn)

In [None]:
# qqplot expects a 1-D sample. With an (n, 1) array the internal sort runs
# along the length-1 axis and leaves the data unsorted, so flatten first.
sm.qqplot(standard_scale_data.ravel(), fit=True, line="45")

In [None]:
standard_scale_data[:10]

In [None]:
plt.boxplot(standard_scale_data)

In [None]:
plt.hist(standard_scale_data)

In [None]:
# sns.distplot is deprecated; use histplot with a KDE overlay instead.
# The scaler output is (n, 1), so flatten it to a single 1-D sample.
sns.histplot(standard_scale_data.ravel(), kde=True, stat="density")

**Converting values to normal or gaussian distribution by Box-Cox technique**

In [None]:
transformer = preprocessing.PowerTransformer(method="box-cox", standardize= True)

In [None]:
dummy_power = transformer.fit_transform(dummy_knn)

In [None]:
# qqplot expects a 1-D sample. With an (n, 1) array the internal sort runs
# along the length-1 axis and leaves the data unsorted, so flatten first.
sm.qqplot(dummy_power.ravel(), fit=True, line="45")

In [None]:
# sns.distplot is deprecated; use histplot with a KDE overlay instead.
# The transformer output is (n, 1), so flatten it to a single 1-D sample.
sns.histplot(dummy_power.ravel(), kde=True, stat="density")

In [None]:
plt.hist(dummy_power)

# ----------------------------------------------------------------------------
# End of testing basic techniques

In [None]:
life_df.info()

In [None]:
# Split the frame by dtype: object (string) columns vs. everything else.
life_df_object = life_df.select_dtypes(include=["object"])
life_df_numeric = life_df.select_dtypes(exclude=["object"])

In [None]:
life_df_numeric.columns

In [None]:
# life_df_numeric = life_df_numeric.loc[:,['Measles', 'under-five deaths']]

In [None]:
# Cast the integer count columns to float.
# Assigning through `.loc[:, col] = ...` sets the values in place on recent
# pandas, which can keep the original integer dtype; `astype` on the frame
# reliably replaces the columns with float64 ones.
life_df_numeric = life_df_numeric.astype({
    'Measles': float,
    'under-five deaths': float,
    'infant deaths': float,
})


In [None]:
life_df_numeric.info()

In [None]:
life_df_numeric.isna().sum()

In [None]:
before_outlier = life_df_numeric.copy()

In [None]:
life_df_numeric.describe()

**Detecting outliers by using the IQR values and Replacing the outliers with the null values**

In [None]:

# Replace IQR outliers with NaN, one column at a time.
# A value counts as an outlier when it lies on or beyond the fences
# q1 - 1.5*IQR / q3 + 1.5*IQR (inclusive, as in the original comparison).
# The result is assigned straight back to the column: the previous chained
# form `df.loc[:][col] = ...` wrote to a temporary object and may never
# reach `life_df_numeric`.
for col in life_df_numeric.columns:
    q1 = life_df_numeric[col].quantile(0.25)
    q3 = life_df_numeric[col].quantile(0.75)
    iqr = q3-q1
    liqr = q1 - (1.5 * iqr)
    hiqr = q3 + (1.5 * iqr)

    is_outlier = (life_df_numeric[col] <= liqr) | (life_df_numeric[col] >= hiqr)
    # Cast to float first so integer columns can hold NaN.
    life_df_numeric[col] = life_df_numeric[col].astype(float).mask(is_outlier)

In [None]:
life_df_numeric

In [None]:
after_outlier = life_df_numeric.copy()

In [None]:
life_df_numeric.isna().sum()

In [None]:
life_df_numeric.describe()

In [None]:
# For every numeric column, print the post-removal min/max next to the IQR
# fences computed from the pre-removal data, to check that the surviving
# values sit inside the fences.
for col in life_df_numeric.columns:
    q1b = before_outlier[col].quantile(0.25)
    q3b = before_outlier[col].quantile(0.75)
    iqrb = q3b-q1b
    liqrb = q1b - (1.5 * iqrb)
    hiqrb = q3b + (1.5 * iqrb)

    print(col,'\nminimum- ',np.min(after_outlier[col]), 'liqr- ',liqrb,'\nmaximum- ', np.max(after_outlier[col]),'hiqr- ', hiqrb)
    print('\n')


**Merging object and non_object data**

In [None]:
# Put the numeric and object columns back side by side (aligned on the row index).
life_df_final = pd.concat([life_df_numeric, life_df_object], axis="columns")

**Filling NULL values using KNN**

In [None]:
# Impute all numeric columns in one call. KNNImputer finds neighbours using
# the features that are present in both rows, so feeding it one column at a
# time gives it nothing to measure distance on.
# NOTE(review): the features are not scaled here, so large-magnitude columns
# dominate the distance — consider scaling before imputing.
numeric_cols = life_df_final.select_dtypes(exclude="object").columns
knnimpute = KNNImputer(n_neighbors=3)
life_df_final[numeric_cols] = knnimpute.fit_transform(life_df_final[numeric_cols])

In [None]:
life_df_final.isna().sum()

In [None]:
life_df_final.head(20)

In [None]:
# After imputation, report per numeric column whether any value still sits
# on or beyond the IQR fences, along with the fences and the observed range.
for col in life_df_final.columns:
    column = life_df_final[col]
    if column.dtypes == 'object':
        continue  # fences are only defined for numeric columns

    q1, q3 = column.quantile(0.25), column.quantile(0.75)
    iqr = q3-q1
    liqr = q1 - (1.5 * iqr)
    hiqr = q3 + (1.5 * iqr)

    has_low = (column <= liqr).any()
    has_high = (column >= hiqr).any()
    print('\n',col,'--low---', has_low)
    print(col,'--high---', has_high)
    print('iqr- ',iqr,'\nminimum- ',np.min(column), 'liqr- ',liqr,'\nmaximum- ', np.max(column),'hiqr- ', hiqr)

In [None]:
life_df_final.isna().sum()

**Histogram plot for all the features in the dataframe**

In [None]:
# Let pandas create the grid of subplots itself. Passing one Axes for many
# histograms makes pandas warn and clear the figure that was just created.
life_df_final.hist(figsize=(10, 10))
fig = plt.gcf()
plt.subplots_adjust(top = 0.99, bottom=0.01, hspace=1.5, wspace=0.4)
plt.show()

# KNN imputer fills in the outlier values that were replaced with nulls

In [None]:
life_df_final.iloc[:,0:5].boxplot()

In [None]:
life_df_final.iloc[:,5:10].boxplot()

In [None]:
Y = life_df_final["Life expectancy"]

In [None]:
Y

In [None]:
X = life_df_final.drop("Life expectancy", axis=1)

In [None]:
X.head(5)

In [None]:
# Label-encode only the categorical (object) columns. Running LabelEncoder
# over every column would also replace the continuous numeric features with
# arbitrary integer codes.
label = preprocessing.LabelEncoder()
X_numeric = X.copy()
for col in X_numeric.select_dtypes(include="object").columns:
    X_numeric[col] = label.fit_transform(X_numeric[col])

In [None]:
X_numeric.head(5)

In [None]:
# Rows that contain a zero in at least one column. `axis` must be passed by
# keyword; the positional form is no longer accepted by current pandas.
X_numeric[(X_numeric == 0).any(axis=1)]

In [None]:
# transformer = preprocessing.PowerTransformer(method="box-cox", standardize= True)
# Earlier Box-Cox attempt, kept for reference:
# transformer = preprocessing.PowerTransformer(method="box-cox", standardize= True)
# Map every feature onto a normal output distribution via its quantiles.
# NOTE(review): the transformer is fit on all rows before the train/test
# split further down, so test rows influence the fitted quantiles — confirm
# this is acceptable.
transformer = preprocessing.QuantileTransformer(output_distribution='normal', random_state=0)
X_transform = transformer.fit_transform(X_numeric)

In [None]:
X_transform = pd.DataFrame(X_transform)

In [None]:
X_transform.columns = X_numeric.columns

In [None]:
X_transform.head()

In [None]:
# Let pandas create the grid of subplots itself. Passing one Axes for many
# histograms makes pandas warn and clear the figure that was just created.
X_transform.hist(figsize=(10, 10))
fig = plt.gcf()
plt.subplots_adjust(top = 0.99, bottom=0.01, hspace=1.5, wspace=1.5)
plt.show()

In [None]:
# Refits the same `transformer` object on the target (reshaped to one column),
# replacing the fit learned from the features; the inverse_transform calls
# further down depend on this target fit.
Y_transform = transformer.fit_transform(Y.values.reshape(-1,1))

In [None]:
plt.hist(Y_transform)

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(X_transform, Y_transform, test_size = 0.3, random_state=101)

# Linear regression

In [None]:
linear_model = LinearRegression()
linear_model.fit(xtrain, ytrain)

In [None]:
linear_model.coef_

In [None]:
ypredict = linear_model.predict(xtest)

In [None]:
linear_model.score(xtest, ytest)

In [None]:
print(mean_absolute_error(ytest, ypredict))

In [None]:
print(mean_squared_error(ytest, ypredict))

In [None]:
print(np.sqrt(mean_squared_error(ypredict, ytest)))

In [None]:
print(r2_score(ytest, ypredict))

In [None]:
ytest_actual = transformer.inverse_transform(ytest)

In [None]:
ypredict_actual = transformer.inverse_transform(ypredict)

In [None]:
# Element-wise membership of the actual values in the predictions.
# np.in1d is deprecated in favour of np.isin; the inputs are flattened so the
# mask stays 1-D as before. The printed shape equals the number of test rows.
print(np.isin(ytest_actual.ravel(), ypredict_actual.ravel()).shape)

In [None]:
# Shape of the array of distinct values that occur in both the actual and the
# predicted arrays (values must be exactly equal to count as shared).
print((np.intersect1d(ytest_actual, ypredict_actual)).shape)

**Reference - https://towardsdatascience.com/ridge-and-lasso-regression-a-complete-guide-with-python-scikit-learn-e20e34bcbf0b**

# Ridge regression

In [None]:
ridge_model = Ridge(alpha=0.01)
ridge_model.fit(xtrain, ytrain)

In [None]:
# Evaluate the ridge model on the held-out split: R^2 first, then MAE and MSE.
ypredict_ridge = ridge_model.predict(xtest)

print(ridge_model.score(xtest, ytest))

for metric in (mean_absolute_error, mean_squared_error):
    print(metric(ytest, ypredict_ridge))

In [None]:
ridge_model = Ridge(alpha=50)
ridge_model.fit(xtrain, ytrain)

In [None]:
# Evaluate the ridge model on the held-out split: R^2 first, then MAE and MSE.
ypredict_ridge = ridge_model.predict(xtest)

print(ridge_model.score(xtest, ytest))

for metric in (mean_absolute_error, mean_squared_error):
    print(metric(ytest, ypredict_ridge))

# Lasso regression

In [None]:
lasso_model = Lasso()
lasso_model.fit(xtrain, ytrain)

In [None]:
ypredict_lasso = lasso_model.predict(xtest)

print(lasso_model.score(xtest, ytest))

print("Number of features used:",np.sum(lasso_model.coef_!=0))

print(mean_absolute_error(ytest, ypredict_lasso))

print(mean_squared_error(ytest, ypredict_lasso))

**Notice the additional parameter defined in the Lasso function – ‘max_iter’. This is the maximum number of iterations the solver is allowed to run if it has not converged earlier. It exists for Ridge as well, but setting it higher than the default value was required in this case.**

In [None]:
# max_iter is an iteration count and must be an integer; the literal 10e5 is
# the float 1000000.0, which scikit-learn's parameter validation rejects.
lasso_model = Lasso(alpha=0.01, max_iter=10**6)
lasso_model.fit(xtrain, ytrain)

In [None]:
ypredict_lasso = lasso_model.predict(xtest)

print(lasso_model.score(xtest, ytest))

print("Number of features used:",np.sum(lasso_model.coef_!=0))

print(mean_absolute_error(ytest, ypredict_lasso))

print(mean_squared_error(ytest, ypredict_lasso))

# ElasticNet and ElasticNetCV

In [None]:
elatic_model = ElasticNet(alpha=0.01)
elatic_model.fit(xtrain, ytrain)

In [None]:
elatic_model.coef_

In [None]:
# Evaluate the ElasticNet model on the held-out split.
ypredict_elastic = elatic_model.predict(xtest)

print(elatic_model.score(xtest, ytest))

# Count the non-zero coefficients of this model (previously read from lasso_model).
print("Number of features used:",np.sum(elatic_model.coef_!=0))

print(mean_absolute_error(ytest, ypredict_elastic))

print(mean_squared_error(ytest, ypredict_elastic))

In [None]:
alphas = [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]
elasticnetcv_model = ElasticNetCV(alphas=alphas, cv=5)
elasticnetcv_model.fit(xtrain, ytrain)

In [None]:
# Evaluate the cross-validated ElasticNet model on the held-out split.
ypredict_elastic = elasticnetcv_model.predict(xtest)

print(elasticnetcv_model.score(xtest, ytest))

print('Best alpha value: ',elasticnetcv_model.alpha_)

print('Intercept: ',elasticnetcv_model.intercept_)

# Coefficient counts come from this model (previously read from lasso_model).
print("Number of features used:",np.sum(elasticnetcv_model.coef_!=0))

print("Number of features not used:",np.sum(elasticnetcv_model.coef_==0))

print(mean_absolute_error(ytest, ypredict_elastic))

print(mean_squared_error(ytest, ypredict_elastic))

In [None]:
lasso_model.coef_

**Reference - https://towardsdatascience.com/feature-selection-using-regularisation-a3678b71e499**