<a href="https://colab.research.google.com/github/srikanchana/unemployment-prediction/blob/main/Copy_of_Unemployment_rate_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'countries-of-the-world-2023:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3495122%2F6101670%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240730%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240730T095136Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Daf5cc7d7a890667ad54272dfa5d6455db6237d274289e5b2cf99be1b5448a205d9bf32ea5884fadbc4e70c8e8a5fc88d3aa278ff519faab498bc898327ca45aab8923fbab5a2bf206ebc3d49f6ca521ac42e7b7200feb6d9a3338f93dad3019a6c92653284531191af5b70bc7dc0c3b0667e150975dccf263a926e8f007e012f2d5819d580f56415823e80098f48f2bb0fcd87afc36cd88bb1f58df4b767799ab6257c6ecf3825401c674e641956b30330433fa89c353dcbdfaf7518834c990a144ef3271e8318c3af592bf821aef1d577ce9040d7663626f2a0c6d2b42563161d8f9e9250e5c639361926de7b40ee50c236070580c937b70df4ec21705e31f5'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal colon in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The server may omit Content-Length; fall back to 0 so the
            # progress bar does not raise on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar; stays empty when the total size is unknown.
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; `as tarfile` would rebind
                # the imported `tarfile` module for the rest of the session.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import geopandas as gpd
import re
%matplotlib inline

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation notices are
# hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


from sklearn import metrics, preprocessing
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from plotly.subplots import make_subplots
from sklearn.preprocessing  import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, cross_val_score



Based on the dataset, we will test several different models for predicting the unemployment rate.

In [None]:
# Read the CSV unpacked by the download cell and preview its first rows.
dataset_csv = '/kaggle/input/countries-of-the-world-2023/world-data-2023.csv'
data = pd.read_csv(dataset_csv)
data.head()

# 1. Primary data analysis

Let's look at the dimensions of the data and the data types.

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# Column names, non-null counts and dtypes as read from the CSV
data.info()

## 1.1 Convert data types in features

First, among the features of object type, let's single out those that are measured in percentages, remove the percent sign from their values, and convert them to float.

In [None]:
# Columns still stored as text (object dtype)
object_list = [x for x in data.columns if data[x].dtype == 'object']

# Percentage-valued text columns: those with '%' in their name plus three
# whose names carry no '%'. Restricting the extra names to columns that are
# still object dtype makes the cell safe to re-run: once converted to float a
# column no longer has a `.str` accessor and would raise AttributeError.
percent_without_sign_in_name = ['Out of pocket health expenditure', 'Total tax rate', 'Unemployment rate']
object_list_percent = [x for x in object_list if '%' in x] + [
    x for x in percent_without_sign_in_name if x in object_list
]
print(object_list_percent)

# Remove the '%' sign, trim surrounding whitespace and change the data type to float
for feature in object_list_percent:
    data[feature] = (
        data[feature]
        .str.replace('%', '', regex=False)
        .str.strip()
        .astype('float64')
    )

In [None]:
# Columns that are still object dtype after the percentage conversion
object_list = [name for name, dtype in data.dtypes.items() if dtype == 'object']
object_list

In [None]:
# Text columns holding numbers formatted with thousands separators and/or a
# currency sign
columns_list = ['Density\n(P/Km2)', 'Land Area(Km2)', 'Armed Forces size', 'Co2-Emissions', 'CPI',
           'Gasoline Price', 'GDP', 'Minimum wage', 'Population', 'Urban_population']

# Remove ',' and '$'.
# regex=False is required for '$': when the pattern is interpreted as a regular
# expression (the default before pandas 2.0) '$' is the end-of-string anchor,
# so the literal dollar sign would be left in place and the float conversion
# below would fail.
for feature in data.columns:
    if feature in columns_list:
        data[feature] = (
            data[feature]
            .str.replace(',', '', regex=False)
            .str.replace('$', '', regex=False)
        )

# Replacing data types with int and float.
# 'Int64' (capital I) is the nullable integer dtype, so missing entries are kept.
float_list = ['CPI', 'Gasoline Price', 'Minimum wage']
int_list = ['Density\n(P/Km2)', 'Land Area(Km2)', 'Armed Forces size', 'Co2-Emissions', 'GDP', 'Population', 'Urban_population']
for feature in data.columns:
    if feature in float_list:
        data[feature] = data[feature].astype('float64')
    elif feature in int_list:
        data[feature] = data[feature].astype('Int64')

In [None]:
# Re-inspect dtypes and non-null counts after the conversions above
data.info()

## 1.2 Handling gaps in data

Let's say we set a threshold of missing data of 30%, exceeding which the feature becomes uninformative and should be deleted.

In [None]:
# Share of missing values per column, in percent
cols_null_percent = data.isnull().mean() * 100
# Keep only the columns that actually have gaps, most incomplete first
has_gaps = cols_null_percent > 0
cols_with_null = cols_null_percent[has_gaps].sort_values(ascending=False)
cols = cols_with_null.index.tolist()
display(cols_with_null)

In [None]:
# Heatmap of the boolean "is missing" mask, limited to columns with gaps
colors = ['blue', 'yellow']
cols = cols_with_null.index
fig, ax = plt.subplots(figsize=(10, 4))
sns.heatmap(data[cols].isnull(), cmap=sns.color_palette(colors), ax=ax)
ax.set_title('Heatmap of missing values');

In [None]:
# dtypes and non-null counts of only the columns that contain missing values
data[cols].info()

In [None]:
# Categorical (object dtype) columns, needed for the imputation below
cat_list = [name for name, dtype in data.dtypes.items() if dtype == 'object']
cat_list

In [None]:
# Rows without a recorded unemployment rate cannot be used as regression
# targets: filling them with 0 would invent a "0 % unemployment" label that
# distorts both the rankings in the EDA and the fitted models. Drop them
# before imputing the remaining columns.
data = data.dropna(subset=['Unemployment rate']).reset_index(drop=True)

# Replace missing values in categorical columns with 'unknown'
data[cat_list] = data[cat_list].fillna('unknown')
# Replace the remaining (numeric) missing values with 0
# NOTE(review): the text above describes a 30 % missing-data threshold for
# deleting features; this cell does not apply one — confirm whether that is intended.
data = data.fillna(0)
# Verify that no gaps remain
data.isnull().sum()

# 2. Exploratory data analysis

## 2.1. Let's analyze the target feature 'Unemployment rate'

In [None]:
# Distribution of the target: 20-bin histogram with a marginal box plot
target_hist_options = dict(
    x='Unemployment rate',
    nbins=20,
    text_auto=True,
    marginal='box',
    title='Histogram of the distribution of the feature "Unemployment rate"',
)
fig = px.histogram(data, **target_hist_options)

fig.show("notebook")

## 2.2. Let's look at the distribution of numerical variables

In [None]:
# List of numerical variables
num_list = [x for x in data.columns if data[x].dtype == 'float64' or data[x].dtype == 'Int64']
n = len(num_list)
print(n)

# Size the grid from the number of features instead of a fixed 10x3 layout,
# which silently drops every feature beyond the 30th and leaves empty frames
# when there are fewer. Ten rows still give the original 20x20 figure.
n_cols = 3
n_rows = max(1, -(-n // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 2 * n_rows), facecolor='gray', squeeze=False)
plt.subplots_adjust(hspace=1.0, wspace=0.5)
plt.suptitle('Histograms of numerical variables',
             fontsize=22,
             fontweight='bold')
flat_axes = axes.ravel()
for col, ax in zip(num_list, flat_axes):
    histplot = sns.histplot(data=data,
                            x=col,
                            color='green',
                            ax=ax)
    histplot.set_title(col.upper())
    histplot.set_xlabel('')
    ax.grid()
# Hide the grid cells that did not receive a feature
for unused_ax in flat_axes[n:]:
    unused_ax.set_visible(False)

In [None]:
# One box plot per numerical feature. The grid is sized from `num_list`
# instead of a fixed 10x3 layout, which silently drops every feature beyond
# the 30th and leaves empty frames when there are fewer. Ten rows still give
# the original 20x20 figure.
box_cols = 3
box_rows = max(1, -(-len(num_list) // box_cols))  # ceiling division
fig, axes = plt.subplots(box_rows, box_cols, figsize=(20, 2 * box_rows), facecolor='gray', squeeze=False)
plt.subplots_adjust(hspace=1.0, wspace=0.5)
plt.suptitle('Boxplots of numerical variables',
             fontsize=22,
             fontweight='bold')
flat_axes = axes.ravel()
for col, ax in zip(num_list, flat_axes):
    # `data` is passed by keyword so the call does not depend on the
    # positional parameter order of the installed seaborn version.
    box_ax = sns.boxplot(data=data,
                         x=col,
                         color='blue',
                         ax=ax)
    box_ax.set_title(col.upper())
    box_ax.set_xlabel('')
    ax.grid()
# Hide the grid cells that did not receive a feature
for unused_ax in flat_axes[len(num_list):]:
    unused_ax.set_visible(False)

## 2.3. Let's look at the distribution of categorical variables

In [None]:
# Categorical (object dtype) columns explored in this section
cat_list = [name for name, dtype in data.dtypes.items() if dtype == 'object']
cat_list

### Countries

In [None]:
# (Country, Unemployment rate) pairs with their occurrence count, computed once
country_rate_counts = data.groupby(by='Country', as_index=False)['Unemployment rate'].value_counts()

# Top 20 countries with the highest unemployment rate
country_unemployment_max = country_rate_counts.nlargest(20, 'Unemployment rate')
# Top 20 countries with the lowest unemployment rate
country_unemployment_min = country_rate_counts.nsmallest(20, 'Unemployment rate')

display('Top 20 countries with the highest unemployment rate', country_unemployment_max, )
print('-' * 60)
display('Top 20 countries with the lowest unemployment rate', country_unemployment_min)


In [None]:
# Horizontal bar chart of the 20 highest unemployment rates, one colour per country
country_bar_options = dict(
    x='Unemployment rate',
    y='Country',
    color='Country',
    orientation='h',
    text_auto=True,
    title='Top 20 countries with the highest unemployment rate',
    height=800,
    width=1200,
)
fig = px.bar(country_unemployment_max, **country_bar_options)

fig.show("notebook")

### Largest city

In [None]:
# The 10 highest unemployment rates, labelled by each country's largest city
city_rate_counts = data.groupby(by='Largest city', as_index=False)['Unemployment rate'].value_counts()
largest_city = city_rate_counts.nlargest(10, 'Unemployment rate')

fig = px.bar(
    largest_city,
    x='Unemployment rate',
    y='Largest city',
    color='Largest city',
    orientation='h',
    text_auto=True,
)

fig.show("notebook")

### The dependence of the unemployment rate on the number of urban population

In [None]:
# Urban population (y) against unemployment rate (x), coloured by birth rate
scatter_axes = dict(x='Unemployment rate', y='Urban_population', color='Birth Rate')
fig = px.scatter(data, **scatter_axes)

fig.show("notebook")

# 3. Data preparation

## 3.1. Feature coding

In [None]:
# Categorical columns that will be label-encoded below
cat_list

Label Encoder is a method that is used to convert categorical columns to numeric ones so that they can be fed to machine learning models that only accept numeric data. It assigns each category in the feature an integer between 0 and $n-1$, where $n$ is the number of categories.

In [None]:
# Replace every categorical column by integer codes; a fresh encoder is fitted
# per column, so the codes of different columns are independent.
for column_name in cat_list:
    data[column_name] = LabelEncoder().fit_transform(data[column_name])

In [None]:
# Preview the frame after label encoding
data.head()

## 3.2. Feature correlation analysis

In [None]:
# Annotated heatmap of the pairwise feature correlations
plt.figure(figsize=(32, 28), facecolor='gray')
correlations = data.corr()
sns.heatmap(
    correlations,
    annot=True,
    fmt='.1g',
    cmap='coolwarm',
    linecolor='blue',
    linewidths=1,
)
plt.title('Feature correlation matrix', fontsize=18, fontweight='bold');

To avoid the problems associated with multicollinearity, in each pair of features whose correlation coefficient is greater than or equal to 0.9 or less than or equal to -0.9, we delete one of the features.

In [None]:
# One feature from each highly correlated pair, removed from the frame
multicolleniar = [
    'Birth Rate',
    'Armed Forces size',
    'Co2-Emissions',
    'CPI',
    'Infant mortality',
    'Urban_population',
]
data = data.drop(columns=multicolleniar)

# 4. Modeling

In [None]:
# Target variable
y = data['Unemployment rate']
# Feature matrix: every remaining column
X = data.drop(columns='Unemployment rate')

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Dimensions of the training and test feature matrices
for split_label, split_features in (('Train:', X_train), ('Test:', X_test)):
    print(split_label, split_features.shape)

In [None]:
# The function creates a dictionary with metric values
def get_dict_metrics(y_train, y_train_predict, y_test, y_test_predict):
    dict_metrics = {
                'MAE score Train' : round(metrics.mean_absolute_error(y_train, y_train_predict), 2),
                'MAE score Test' : round(metrics.mean_absolute_error(y_test, y_test_predict), 2),
                'RMSE score Train' : round(np.sqrt(metrics.mean_squared_error(y_train, y_train_predict)), 2),
                'RMSE score Test' : round(np.sqrt(metrics.mean_squared_error(y_test, y_test_predict)), 2),
                'R^2 score Train' : round(metrics.r2_score(y_train, y_train_predict), 2),
                'R^2 score Test' : round(metrics.r2_score(y_test, y_test_predict), 2)
                }
    return dict_metrics

## 4.1. Linear regression

In [None]:
# Baseline: ordinary least squares on all remaining features
model_lr = linear_model.LinearRegression().fit(X_train, y_train)
y_train_predict, y_test_predict = (model_lr.predict(part) for part in (X_train, X_test))
metrics_lr = get_dict_metrics(y_train, y_train_predict, y_test, y_test_predict)
metrics_lr


## 4.2. Random Forest Regressor

In [None]:
# Random forest with hand-picked settings: 100 trees, depth capped at 10
rf_settings = dict(n_estimators=100, max_depth=10, random_state=42)
model_rf = ensemble.RandomForestRegressor(**rf_settings).fit(X_train, y_train)
y_train_predict, y_test_predict = (model_rf.predict(part) for part in (X_train, X_test))
metrics_rf = get_dict_metrics(y_train, y_train_predict, y_test, y_test_predict)
metrics_rf

In [None]:
# Ten most important features according to the fitted random forest
plt.rcParams['figure.figsize'] = (6, 6)
feat_importances = pd.Series(model_rf.feature_importances_, index=X.columns)
top_importances = feat_importances.nlargest(10)
top_importances.plot.barh()
plt.show()

### Hyperparameters tuning with gridsearch

In [None]:
# Hyperparameter grid
param_grid = {'n_estimators' : [50, 100, 150, 200, 300], # Number of trees
              'min_samples_split': [2, 5, 7, 10], # Minimum number of samples to split
              'max_depth': [5, 10, 20, 30, 50], # Maximum tree depth
              'min_samples_leaf' : [1, 2, 3, 4], # Minimum number of objects in a sheet
              'max_features' : ['sqrt', 'log2'] # Maximum number of features that will be used by each of the trees
              }
# Create an object gridsearch
grid_search = GridSearchCV(
    estimator=ensemble.RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1)

# Model traning
%time grid_search.fit(X_train, y_train)
# Best Hyperparameter Values
print('Best Hyperparameter Values:', grid_search.best_params_)
print()
# Prediction and metrics
y_train_predict = grid_search.predict(X_train)
y_test_predict = grid_search.predict(X_test)
metrics_rf_grid = get_dict_metrics(y_train, y_train_predict, y_test, y_test_predict)
metrics_rf_grid


In [None]:
plt.rcParams['figure.figsize'] = (6, 6)
# Plot the importances of the model selected by the grid search above.
# `model_rf` is the un-tuned forest and would only repeat the earlier chart.
# `grid_search` is reassigned by the gradient boosting search further down,
# so run this cell before that one.
tuned_rf = grid_search.best_estimator_
feat_importances = pd.Series(tuned_rf.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.title('Top 10 feature importances (tuned model)')
plt.show()

## 4.3. Gradient boosting

In [None]:
# Create a Gradient Boosting regressor object
model_gbr = ensemble.GradientBoostingRegressor(random_state=42)
# Model traning
%time model_gbr.fit(X_train, y_train)
# Prediction and metrics
y_train_predict = model_gbr.predict(X_train)
y_test_predict = model_gbr.predict(X_test)
metrics_gbr = get_dict_metrics(y_train, y_train_predict, y_test, y_test_predict)
metrics_gbr

### Hyperparameters tuning with gridsearch

In [None]:
# Hyperparameter grid
param_grid = {'n_estimators' : [100, 200, 300, 400, 500], # Number of trees
              'min_samples_split': [2, 5, 7, 10], # Minimum number of samples to split
              'max_depth': [2, 3, 4, 5, 6, 7, 8], # Maximum tree depth
              'min_samples_leaf' : [1, 2, 3, 4], # Minimum number of objects in a sheet
              'max_features' : ['sqrt', 'log2'], # Maximum number of features that will be used by each of the trees
              'learning_rate' : [0,1, 0.3, 0.5, 0.7] # Pace of learning
              }
# Create an object gridsearch
grid_search = GridSearchCV(
    estimator=ensemble.GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1)

# Model traning
%time grid_search.fit(X_train, y_train)



In [None]:
# Winning combination of the gradient boosting search
print('Best Hyperparameter Values:', grid_search.best_params_)
print()
# Score the search object's predictions on both samples
y_train_predict, y_test_predict = (grid_search.predict(part) for part in (X_train, X_test))
metrics_gbr_grid = get_dict_metrics(y_train, y_train_predict, y_test, y_test_predict)
metrics_gbr_grid

In [None]:
# One column of scores per model, one row per metric
lr, rf, rf_grid, gbr, gbr_grid = (
    pd.Series(scores)
    for scores in (metrics_lr, metrics_rf, metrics_rf_grid, metrics_gbr, metrics_gbr_grid)
)

model_names = [
    'Linear regression',
    'Random forest',
    'Random forest grid',
    'Gradient boosting',
    'Gradient boosting grid',
]
# concat numbers the columns 0..4; map each position to its model name and
# turn the metric labels from the index into a regular 'Metric' column
column_names = {'index': 'Metric', **dict(enumerate(model_names))}
metrics_df = (
    pd.concat([lr, rf, rf_grid, gbr, gbr_grid], axis=1)
    .reset_index()
    .rename(columns=column_names)
)
metrics_df

##