In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print
# them one per line (nothing is printed when the directory is missing or empty).
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# World Population Analysis

## Variables:
* Year: 1951 to 2020
* Population: World Population
* ChangePerc: Yearly Change in Percentage
* NetChange: Total Yearly Change
* Density: Density in P/Km²
* Urban: Urban Population
* UrbanPerc: Urban Population Percentage

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mn

In [None]:
from warnings import filterwarnings
# Install a blanket 'ignore' filter: every warning raised after this cell is suppressed.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

## Reading and Exploring Data

In [None]:
# Load the world-population CSV from the attached Kaggle dataset into a DataFrame
df = pd.read_csv('/kaggle/input/world-population-by-year/WorldPopulation.csv')
# Show the loaded frame as the cell output
df

## Checking for null values

In [None]:
# Draw missingno's nullity matrix for df; assigning the result to `m` keeps the
# returned object's repr out of the cell output
m = mn.matrix(df)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

### Sorting the DataFrame by Year in ascending order

In [None]:
# Order the rows by Year and replace the old row labels with a fresh 0..n-1 index
# in a single chained expression (the old labels are discarded, not kept as a column).
df = df.sort_values(by='Year').reset_index(drop=True)
df

## Data Visualization

In [None]:
# Year-indexed copy of df, so that a column plotted from it uses Year on the x-axis
temp = df.set_index('Year')

In [None]:
# One panel per column on a 2x3 grid, filled row by row.
# Each entry is (column name, matplotlib format string); None keeps the default colour.
panels = [
    ('Population', None),
    ('ChangePerc', 'r'),
    ('NetChange', 'g'),
    ('Density', None),
    ('Urban', 'r'),
    ('UrbanPerc', 'g'),
]

fig, ax = plt.subplots(2,3,figsize=(12,8))
for axis, (column, fmt) in zip(ax.flat, panels):
    # `temp` is indexed by Year, so each series is drawn against Year
    if fmt is None:
        axis.plot(temp[column])
    else:
        axis.plot(temp[column], fmt)
    axis.set_title(column)

fig.tight_layout()
plt.show()

- Density, Urban Population and Urban Population Percentage have a linear relationship with Population
- Change percentage and Net change have irregular behaviour

## Correlation Heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df;
# assigning the result to `h` keeps the returned object's repr out of the cell output
h = sns.heatmap(df.corr(), annot=True)

# Model Training and Prediction

### Features
1. Year: 1951 to 2020
2. ChangePerc: Yearly Change in Percentage
3. NetChange: Total Yearly Change
4. Density: Density in P/Km²
5. Urban: Urban Population
6. UrbanPerc: Urban Population Percentage

### Target
* Population: World Population

### Four regression models are used
1. Linear Regression
2. Support Vector Regression
3. Random Forest Regression
4. XGB Regression

In [None]:
# Target: world population. Features: every other column of df.
# NOTE(review): Density, Urban and UrbanPerc look like quantities derived from
# Population itself (possible target leakage) - confirm against the dataset before
# reading too much into the scores below.
# NOTE(review): train_test_split shuffles rows, so held-out years sit between training
# years; the scores measure interpolation, not forecasting.
Y = df['Population']
X = df.drop('Population', axis=1)

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Candidate regressors, keyed by the label shown in the score table.
# The two ensembles are given an explicit seed so that repeated "Run All" executions
# produce the same scores; 42 is the seed already used for train_test_split.
models = {
    'Linear Regression': LinearRegression(),
    'Support vector Regression': SVR(),
    'Random Forest Regression': RandomForestRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(random_state=42),
}

In [None]:
def fit_and_score(models,X_train,Y_train,X_test, Y_test):
    """Fit every model on the training data and rank the models by test score.

    Parameters
    ----------
    models : dict
        Maps a display name to an object exposing ``fit(X, y)`` and ``score(X, y)``.
        Each object is fitted in place.
    X_train, Y_train
        Passed unchanged to each model's ``fit``.
    X_test, Y_test
        Passed unchanged to each model's ``score``.

    Returns
    -------
    pandas.DataFrame
        One row per model name and a single ``Score`` column containing
        ``score(X_test, Y_test) * 100``, ordered from highest to lowest.
    """
    results = {}
    for label, estimator in models.items():
        estimator.fit(X_train, Y_train)
        results[label] = estimator.score(X_test, Y_test) * 100

    # A dict of scalars gives one 'Score' row with a column per model;
    # transposing turns that into one row per model.
    table = pd.DataFrame(results, index=['Score']).T
    return table.sort_values('Score', ascending=False)

In [None]:
# Fit all candidate models on the training split and tabulate their test-set
# scores (score * 100), best first
scores = fit_and_score(models,X_train,Y_train,X_test, Y_test)
scores

#### The table above shows that Linear Regression gives the best score (99.97)

## Linear Regression Model

In [None]:
# Train a fresh linear model on the training split (fit returns the fitted estimator),
# then predict the population for the held-out rows
model = LinearRegression().fit(X_train, Y_train)
Y_preds = model.predict(X_test)

### Cross Validation on Regression model

In [None]:
def cv_score(model, X, Y, cv=5):
    """Return the mean cross-validated R² of ``model`` on ``(X, Y)``, in percent.

    Parameters
    ----------
    model
        Estimator forwarded to ``cross_val_score``.
    X, Y
        Features and target forwarded to ``cross_val_score``.
    cv : int, default 5
        Forwarded as the ``cv`` argument of ``cross_val_score``.

    Returns
    -------
    float
        Mean over the folds of ``r2 * 100``.

    Notes
    -----
    Resets NumPy's global random seed to 42 on every call.
    """
    np.random.seed(42)
    fold_percentages = cross_val_score(model, X, Y, cv=cv, scoring='r2') * 100
    return fold_percentages.mean()

In [None]:
# Mean 5-fold cross-validated R² (in percent) of the linear model, computed on the
# training split only
cv_mean = cv_score(model, X_train, Y_train)
cv_mean

#### The Linear Regression model gives a mean cross-validation R² score of 99.98%

### Prediction Visualization

In [None]:
# Compare the linear model's predictions with the true held-out values, with the
# training points shown for context. Year is on the x-axis, population on the y-axis.
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(X_train['Year'],Y_train,label='Training data',s=10)
# Hollow rings mark the true test values so the red prediction dots stay visible inside them
ax.scatter(X_test['Year'],Y_test,s=80,facecolors='none',edgecolors='k',label='Actual (test)')
ax.scatter(X_test['Year'],Y_preds,c='r',label='Prediction')
ax.set_xlabel('Year')
ax.set_ylabel('Population')
ax.set_title('Linear Regression: predicted vs. actual world population')
ax.legend()
plt.show()