In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skewnorm
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error,r2_score

%matplotlib inline

# Data Description

In [None]:
# Read the world-population table, using the 'Year' column as the row index,
# and preview the first rows.
csv_path = "../input/world-population-by-year/WorldPopulation.csv"
pop_dataset = pd.read_csv(csv_path, index_col='Year')
pop_dataset.head()

In [None]:
# Dimensions of the loaded table as (rows, columns).
pop_dataset.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
pop_dataset.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
pop_dataset.describe()

In [None]:
# Count the missing entries in each column.
missing_per_column = pop_dataset.isnull().sum()
print(missing_per_column)

There are no missing values in the given dataset.

# Data Visualization

In [None]:
# Histogram of every column, one subplot per column on a 3x2 grid.
RP = plt.figure(figsize=(10, 10))
for position, feature in enumerate(pop_dataset.columns, start=1):
    r = RP.add_subplot(3, 2, position)
    # Draw on the subplot that was just created.
    r.hist(pop_dataset[feature])
    r.set_title(feature + " Histogram Plot", color='DarkRed')
RP.tight_layout()

In [None]:
# Distribution of every column (10-bin histogram plus rug marks) with a fitted
# skew-normal curve overlaid, one subplot per column on a 3x2 grid.
RP = plt.figure(figsize=(10, 10))
for position, feature in enumerate(pop_dataset.columns, start=1):
    r = RP.add_subplot(3, 2, position)
    sns.distplot(pop_dataset[feature], rug=True, bins=10, fit=skewnorm, kde=False, ax=r)
    r.set_title(feature + " Distribution", color='DarkRed')
    # These two columns are drawn with a logarithmic y-axis.
    if feature in ('Population', 'Urban'):
        r.set_yscale('log')
RP.tight_layout()

In [None]:
# Distribution of every column (10-bin histogram plus rug marks) with a fitted
# normal curve overlaid, one subplot per column on a 3x2 grid. Comparing this
# with the skew-normal fit shows how far each column departs from normality.
RP = plt.figure(figsize=(10, 10))
for i, feature in enumerate(pop_dataset.columns):
    r = RP.add_subplot(3, 2, i + 1)
    # fit=norm overlays a normal density fitted to the column.
    sns.distplot(pop_dataset[feature], rug=True, bins=10, fit=norm, kde=False, ax=r)
    r.set_title(feature + " Distribution", color='DarkRed')
    # These two columns are drawn with a logarithmic y-axis.
    if feature == 'Population' or feature == 'Urban':
        r.set_yscale('log')
RP.tight_layout()

#### Right skewed distribution:
Population Distribution, Density Distribution, Urban Distribution
#### Left skewed distribution:
Change Percent Distribution, Net Change Distribution
#### Approximately normal distribution:
Urban Percentage Distribution

In [None]:
# Each column plotted against the Year index, one subplot per column on a 3x2 grid.
RP = plt.figure(figsize=(10, 10))
for i, feature in enumerate(pop_dataset.columns):
    r = RP.add_subplot(3, 2, i + 1)
    # x and y are passed by keyword: seaborn >= 0.12 raises a TypeError when
    # they are given positionally; keywords also work on older releases.
    sns.lineplot(x=pop_dataset.index, y=pop_dataset[feature], ax=r)
    r.set_title(feature + " by Year", color='DarkRed')
RP.tight_layout()

In [None]:
# Pairwise correlation between all columns (pandas' default method, Pearson).
pop_correl=pop_dataset.corr()
print(pop_correl)

In [None]:
# Heat map of the correlation matrix `pop_correl`, with each cell annotated by its value.
sns.heatmap(pop_correl,annot=True)

In [None]:
# Heat map of the raw table itself (one row per year, one column per feature),
# drawn on a very tall figure so that every annotated row stays readable.
# NOTE(review): all columns share one colour scale here; if they differ widely in
# magnitude the colours will mostly reflect the largest column - confirm this is intended.
plt.figure(figsize=(5,200))
sns.heatmap(pop_dataset,annot=True,square=True)

In [None]:
# Scatter plot for every pair of columns, with each column's own distribution on the diagonal.
sns.pairplot(pop_dataset)

Population is positively correlated with both the urban population and the urban population percentage.
As expected, density (population per square km of area) increases as the population increases.
Population is almost inversely related to the percentage change in population.

In [None]:
# Box plot of every column, one subplot per column on a 3x2 grid, to spot outliers.
RP = plt.figure(figsize=(10, 10))
for i, feature in enumerate(pop_dataset.columns):
    r = RP.add_subplot(3, 2, i + 1)
    # The values are passed directly as a Series, so matplotlib's `data=` lookup
    # (which only resolves string arguments) is not needed.
    r.boxplot(pop_dataset[feature])
    r.set_title(feature, color='DarkRed')
RP.tight_layout()

From the box plot, the net change in population has an outlier, so that feature is removed for the final model evaluation.

The heat map and pair plot show that urban percent and urban are highly correlated, and the two features serve almost the same practical purpose. Since urban percent is approximately normally distributed, urban is removed from the feature columns to help prevent the model from overfitting.

# Models

In [None]:
# Features: every column except the target ('Population') and the two excluded
# columns ('Urban', 'NetChange'). Target: 'Population'.
features = pop_dataset.drop(['Population', 'Urban', 'NetChange'], axis=1)
Y = pop_dataset['Population']

# Split BEFORE scaling so the test rows do not influence the scaling statistics
# (fitting the scaler on the full table leaks test-set information into training).
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    features, Y, test_size=0.3, random_state=0)

# Standardise with the TRAINING mean and population standard deviation
# (ddof=0, the same convention as sklearn.preprocessing.scale).
train_mean = X_train_raw.mean()
# A constant column has std 0; divide by 1 instead so it is only centred.
train_std = X_train_raw.std(ddof=0).replace(0, 1)

# The Year index and the column names are preserved by the arithmetic below.
X_train = (X_train_raw - train_mean) / train_std
X_test = (X_test_raw - train_mean) / train_std
# Full feature table on the same scale, kept under its original name `X`.
X = (features - train_mean) / train_std

Feature scaling is the process of normalising the range of features in a dataset. Real-world datasets often contain features that are varying in degrees of magnitude, range and units. Therefore, in order for machine learning models to interpret these features on the same scale, we need to perform feature scaling.

### Linear Regression

In [None]:
# Ordinary least-squares baseline: fit on the training split, predict the
# hold-out split and report the hold-out score (R^2) as a percentage.
model_LR = LinearRegression().fit(X_train, Y_train)
Y_pred_LR = model_LR.predict(X_test)
score_LR = model_LR.score(X_test, Y_test)
print('Score:', score_LR*100)

In [None]:
# Hold-out actuals (red points) against the linear-regression predictions (blue line).
plt.scatter(X_test.index, Y_test, color='red', label='Actual')
# x and y are passed by keyword: seaborn >= 0.12 raises a TypeError when they
# are given positionally; keywords also work on older releases.
sns.lineplot(x=X_test.index, y=Y_pred_LR, color='blue', label='Predicted')
plt.title('World Population Prediction: Linear Regression')
plt.xlabel('Year')
plt.ylabel('Population')
plt.legend()
plt.show()

In [None]:
# Hold-out error metrics and fitted coefficients for the linear-regression model.
lr_report = (
    ('Mean Absolute Error:', mean_absolute_error(Y_test, Y_pred_LR)),
    ('Linear Regression Coefficients:', model_LR.coef_),
    ('R2 Score:', r2_score(Y_test, Y_pred_LR)),
    ('Mean Square Error:', mean_squared_error(Y_test, Y_pred_LR)),
)
for label, value in lr_report:
    print(label, value)

### Logistic Regression

In [None]:
# Fit scikit-learn's LogisticRegression on the training split and predict the hold-out split.
# NOTE(review): LogisticRegression is a classifier, while Y (Population) is a
# continuous quantity. Presumably the column is integer-typed, in which case every
# distinct population value is treated as its own class label (a float target would
# make fit() raise) - confirm whether a regression model was intended here.
model_LGR=LogisticRegression()
model_LGR.fit(X_train,Y_train)
Y_pred_LGR=model_LGR.predict(X_test)

In [None]:
# Hold-out actuals (red points) against the logistic-regression predictions (blue line).
plt.scatter(X_test.index, Y_test, color='red', label='Actual')
# x and y are passed by keyword: seaborn >= 0.12 raises a TypeError when they
# are given positionally; keywords also work on older releases.
sns.lineplot(x=X_test.index, y=Y_pred_LGR, color='blue', label='Predicted')
plt.title('World Population Prediction: Logistic Regression')
plt.xlabel('Year')
plt.ylabel('Population')
plt.legend()
plt.show()

In [None]:
# Hold-out error metrics and fitted coefficients for the logistic-regression model.
print('Mean Absolute Error:', mean_absolute_error(Y_test, Y_pred_LGR))
# The label names the model actually being reported (model_LGR).
print('Logistic Regression Coefficients:\n', model_LGR.coef_)
print('R2 Score:', r2_score(Y_test, Y_pred_LGR))
print('Mean Square Error:', mean_squared_error(Y_test, Y_pred_LGR))

### Kernel Ridge

In [None]:
# Kernel ridge regression with an RBF kernel (alpha=0.1, gamma=0.1): fit on the
# training split, predict the hold-out split and report the hold-out score (R^2)
# as a percentage.
model_KR = KernelRidge(kernel='rbf', alpha=0.1, gamma=0.1).fit(X_train, Y_train)
Y_pred_KR = model_KR.predict(X_test)
score_KR = model_KR.score(X_test, Y_test)
print('Score:', score_KR*100)

In [None]:
# Hold-out actuals (red points) against the kernel-ridge predictions (blue line).
plt.scatter(X_test.index, Y_test, color='red', label='Actual')
# x and y are passed by keyword: seaborn >= 0.12 raises a TypeError when they
# are given positionally; keywords also work on older releases.
sns.lineplot(x=X_test.index, y=Y_pred_KR, color='blue', label='Predicted')
plt.title('World Population Prediction: Kernel Ridge')
plt.xlabel('Year')
plt.ylabel('Population')
plt.legend()
plt.show()

In [None]:
# Hold-out error metrics for the kernel-ridge model.
kr_report = (
    ('Mean Absolute Error:', mean_absolute_error(Y_test, Y_pred_KR)),
    ('R2 Score:', r2_score(Y_test, Y_pred_KR)),
    ('Mean Square Error:', mean_squared_error(Y_test, Y_pred_KR)),
)
for label, value in kr_report:
    print(label, value)

### XGBoost Regressor

In [None]:
# Gradient-boosted trees with early stopping.
# Early stopping is monitored on a validation split carved out of the TRAINING
# data, so the test set is used only for the final evaluation; monitoring the
# test set itself would make the reported test score optimistic.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=0)

# early_stopping_rounds is given to the constructor: passing it to fit() has been
# deprecated since XGBoost 1.6 and is rejected by recent releases.
# NOTE(review): requires xgboost >= 1.6 - confirm the environment's version.
model_XGB = XGBRegressor(n_estimators=500, learning_rate=0.05, early_stopping_rounds=5)
model_XGB.fit(X_fit, Y_fit, eval_set=[(X_val, Y_val)], verbose=False)

# Predict the hold-out split and report the hold-out score (R^2) as a percentage.
Y_pred_XGB = model_XGB.predict(X_test)
print('Score:', model_XGB.score(X_test, Y_test)*100)

In [None]:
# Hold-out actuals (red points) against the XGBoost predictions (blue line).
plt.scatter(X_test.index, Y_test, color='red', label='Actual')
# x and y are passed by keyword: seaborn >= 0.12 raises a TypeError when they
# are given positionally; keywords also work on older releases.
sns.lineplot(x=X_test.index, y=Y_pred_XGB, color='blue', label='Predicted')
plt.title('World Population Prediction: XGBoost Regressor')
plt.xlabel('Year')
plt.ylabel('Population')
plt.legend()
plt.show()

In [None]:
# Hold-out error metrics for the XGBoost model.
xgb_report = (
    ('Mean Absolute Error:', mean_absolute_error(Y_test, Y_pred_XGB)),
    ('R2 Score:', r2_score(Y_test, Y_pred_XGB)),
    ('Mean Square Error:', mean_squared_error(Y_test, Y_pred_XGB)),
)
for label, value in xgb_report:
    print(label, value)

### Random Forest Regressor

In [None]:
# Random forest of 100 trees with a fixed seed: fit on the training split,
# predict the hold-out split and report the hold-out score (R^2) as a percentage.
model_RFR = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, Y_train)
Y_pred_RFR = model_RFR.predict(X_test)
score_RFR = model_RFR.score(X_test, Y_test)
print('Score:', score_RFR*100)

In [None]:
# Hold-out actuals (red points) against the random-forest predictions (blue line).
plt.scatter(X_test.index, Y_test, color='red', label='Actual')
# x and y are passed by keyword: seaborn >= 0.12 raises a TypeError when they
# are given positionally; keywords also work on older releases.
sns.lineplot(x=X_test.index, y=Y_pred_RFR, color='blue', label='Predicted')
plt.title('World Population Prediction: Random Forest Regressor')
plt.xlabel('Year')
plt.ylabel('Population')
plt.legend()
plt.show()

In [None]:
# Hold-out error metrics for the random-forest model.
rfr_report = (
    ('Mean Absolute Error:', mean_absolute_error(Y_test, Y_pred_RFR)),
    ('R2 Score:', r2_score(Y_test, Y_pred_RFR)),
    ('Mean Square Error:', mean_squared_error(Y_test, Y_pred_RFR)),
)
for label, value in rfr_report:
    print(label, value)