In [161]:
import pandas as pd
import re
import numpy as np
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from itertools import permutations
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import RFE
from scipy.stats import kstest
import random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from pprint import pprint
%run plotNotebook.ipynb
sns.set_style('darkgrid')

ERROR:root:File `'plotNotebook.ipynb.py'` not found.


In [162]:
df = pd.read_csv("Country_data.csv", na_values="")
df = df.drop(['Country Name.1', 'Country Code', 'Capital City', 'Latitude of Capital', 'Longitude of Capital'], axis=1)
temp_df = df.copy()

In [163]:
cat_attr = ['Country Name', 'Government Type']
date_attr = ['Date of Founding/Independence']
num_attr = []
for col in list(df.columns):
    if col not in cat_attr and col not in date_attr:
        num_attr.append(col)

In [164]:
#cleaning numerical_attr
def clean_numerical_attr(string):
    num_regex = '([\d,]+(\.[\d]+)?)'
    a = re.search(num_regex, string)
    if a!= None:
        num = float(a.group().replace(',',''))
        return num
    else:
        return np.nan
for col in num_attr:
    try:
        df[col] = df[col].apply(clean_numerical_attr)
    except Exception as e:
        print(col)
        break



In [165]:
#cleaning date attributes
def clean_date_attr(string):
    if string == 'unknown' or string=='none':
        return np.nan
    x = list(map(int,string.split('/')))
    return int(x[-1])
df[date_attr[0]] = df[date_attr[0]].apply(clean_date_attr)


In [166]:
# as 'Date of Founding/Independence' is converted to years since 1143, it can be treated as numerical attribute
num_attr+= date_attr

In [167]:
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputer.fit(df[num_attr])
df.loc[:,num_attr] = knn_imputer.transform(df[num_attr])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 75 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Country Name                                         197 non-null    object 
 1   Government Type                                      197 non-null    object 
 2   Date of Founding/Independence                        197 non-null    float64
 3   Telephones - fixed lines                             197 non-null    float64
 4   Telephones - mobile cellular                         197 non-null    float64
 5   Internet users                                       197 non-null    float64
 6   Broadband - fixed subscriptions                      197 non-null    float64
 7   GDP (purchasing power parity)                        197 non-null    float64
 8   GDP - real growth rate                               197 non-null    f

In [168]:
# normalization of data
scaler = preprocessing.StandardScaler()
unscaled_df = df.copy()
df.loc[:,num_attr] = scaler.fit_transform(df.loc[:,num_attr])

In [169]:
df.head()

Unnamed: 0,Country Name,Government Type,Date of Founding/Independence,Telephones - fixed lines,Telephones - mobile cellular,Internet users,Broadband - fixed subscriptions,GDP (purchasing power parity),GDP - real growth rate,GDP - per capita (PPP),...,Life expectancy at birth,Total fertility rate,Obesity - adult prevalence rate,Children under the age of 5 years underweight,Education expenditures,"Unemployment, youth ages 15-24",Airports,Railways,Roadways,Merchant marine
0,Afghanistan,Islamic republic,0.0946,-0.269395,-0.11159,-0.191951,-0.177938,-0.239865,-0.275012,-0.812767,...,-2.665706,1.939557,-1.290105,1.485647,-0.260294,0.008387,-0.165322,-0.227742,-0.22637,-0.433223
1,Albania,parliamentary democracy,0.05696,-0.262247,-0.255318,-0.243405,-0.168199,-0.253515,-0.067479,-0.364676,...,0.813095,-0.885522,0.194206,-1.255693,-0.319507,1.326846,-0.203578,-0.251482,-0.270511,-0.396679
2,Algeria,republic,0.325819,-0.101538,0.070913,0.000137,-0.067115,-0.011053,-0.520278,-0.249453,...,0.62931,0.034647,0.716463,-1.080714,0.154193,0.755206,-0.053497,-0.119621,-0.127848,-0.356075
3,Andorra,parliamentary democracy,-3.352178,-0.273874,-0.27936,-0.274219,-0.177323,-0.266856,-0.576878,1.231379,...,1.37758,-0.97431,0.55154,0.410109,-0.793207,0.384563,-0.186706,-0.269109,-0.27568,-0.153582
4,Angola,republic,0.395723,-0.267037,-0.186197,-0.233745,-0.175119,-0.189188,-0.312745,-0.607926,...,-1.549864,2.803224,-1.04272,0.78573,-0.674782,2.018345,-0.03486,-0.164468,-0.239064,-0.401093


In [170]:
from collections import defaultdict
from pprint import pprint
gov_type = defaultdict(int)
for type in df['Government Type']:
    gov_type[type] += 1
gov_type = sorted(list(zip(list(gov_type.values()), list(gov_type.keys()))), reverse= True)
pprint(gov_type)

[(75, 'republic'),
 (25, 'parliamentary democracy'),
 (14, 'constitutional monarchy'),
 (9, 'federal republic'),
 (7, 'parliamentary republic'),
 (4, 'constitutional republic'),
 (4, 'constitutional democracy'),
 (4, 'Communist state'),
 (3, 'monarchy'),
 (3, 'constitutional parliamentary democracy'),
 (3, 'constitutional government'),
 (2, 'republic; parliamentary democracy'),
 (2, 'parliamentary democracy and a Commonwealth realm'),
 (2, 'multiparty democracy'),
 (2, 'federal democratic republic'),
 (2, 'democratic republic'),
 (1, 'transitional government '),
 (1, 'theocratic republic'),
 (1, 'secular democracy'),
 (1, 'republican parliamentary democracy'),
 (1,
  'republic; authoritarian presidential rule,  with little power outside the '
  'executive branch'),
 (1, 'republic under transition to multiparty democratic rule'),
 (1, 'republic under an authoritarian regime'),
 (1, 'republic '),
 (1, 'parliamentary monarchy'),
 (1, 'parliamentary government'),
 (1, 'parliamentary democr

In [171]:
majorith_gov_types = [type for _, type in gov_type[:3]]
for i in range(df.shape[0]):
    if df['Government Type'][i] not in majorith_gov_types:
        df['Government Type'][i] = 'other'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [172]:
df[num_attr].corr()['GDP (purchasing power parity)'].sort_values(ascending= False)[20:30]

Population                                0.816158
Market value of publicly traded shares    0.781407
Reserves of foreign exchange and gold     0.775301
Current account balance                   0.742576
Natural gas - consumption                 0.734835
Natural gas - imports                     0.684879
Refined petroleum products - exports      0.613245
Debt - external                           0.600026
Airports                                  0.595120
Refined petroleum products - imports      0.594220
Name: GDP (purchasing power parity), dtype: float64

In [173]:
df[num_attr].corr()['GDP - per capita (PPP)'].sort_values(ascending= False)[:10]

GDP - per capita (PPP)                          1.000000
Life expectancy at birth                        0.651761
Median age                                      0.641719
Refined petroleum products - imports            0.343519
Refined petroleum products - exports            0.333918
Stock of direct foreign investment - abroad     0.329902
Stock of direct foreign investment - at home    0.322280
Debt - external                                 0.312186
Gross national saving                           0.311307
Natural gas - exports                           0.302406
Name: GDP - per capita (PPP), dtype: float64

In [174]:
# to reduce multicorrelation
selected_attributes = set(['Electricity - production', 'Crude oil - imports', 'Exports', 'Population', 
                       'Market value of publicly traded shares', 'Government Type','GDP (purchasing power parity)'])
to_be_dropped = set(df.columns).difference(selected_attributes)
print(len(to_be_dropped), len(df.columns))

68 75


In [175]:
df['Government Type'] = df['Government Type'].astype('category')

In [176]:
df = df.drop(to_be_dropped, axis=1)

In [177]:
df.head()

Unnamed: 0,Government Type,GDP (purchasing power parity),Market value of publicly traded shares,Exports,Electricity - production,Crude oil - imports,Population
0,other,-0.239865,-0.168417,-0.341001,-0.224548,-0.260769,-0.023797
1,parliamentary democracy,-0.253515,-0.169641,-0.340544,-0.213369,-0.260769,-0.252145
2,republic,-0.011053,-0.145635,-0.209638,-0.10067,-0.254655,0.024306
3,parliamentary democracy,-0.266856,-0.170242,-0.343759,-0.226645,-0.260769,-0.273426
4,republic,-0.189188,-0.16779,-0.214723,-0.207594,-0.260769,-0.056634


In [178]:
onehot_enc_gov_type = pd.get_dummies(df['Government Type'], prefix= 'gov_type')

In [179]:
onehot_enc_gov_type.head()

Unnamed: 0,gov_type_constitutional monarchy,gov_type_other,gov_type_parliamentary democracy,gov_type_republic
0,0,1,0,0
1,0,0,1,0
2,0,0,0,1
3,0,0,1,0
4,0,0,0,1


In [180]:
df = df.drop(['Government Type'], axis= 1)
df = df.join(onehot_enc_gov_type)
df.head()

Unnamed: 0,GDP (purchasing power parity),Market value of publicly traded shares,Exports,Electricity - production,Crude oil - imports,Population,gov_type_constitutional monarchy,gov_type_other,gov_type_parliamentary democracy,gov_type_republic
0,-0.239865,-0.168417,-0.341001,-0.224548,-0.260769,-0.023797,0,1,0,0
1,-0.253515,-0.169641,-0.340544,-0.213369,-0.260769,-0.252145,0,0,1,0
2,-0.011053,-0.145635,-0.209638,-0.10067,-0.254655,0.024306,0,0,0,1
3,-0.266856,-0.170242,-0.343759,-0.226645,-0.260769,-0.273426,0,0,1,0
4,-0.189188,-0.16779,-0.214723,-0.207594,-0.260769,-0.056634,0,0,0,1


### Regression

In [181]:
predictor_variables = ['Electricity - production', 'Crude oil - imports', 'Exports', 'Population', 
                       'Market value of publicly traded shares','gov_type_constitutional monarchy','gov_type_other',
                       'gov_type_parliamentary democracy','gov_type_republic']
y_colname = 'GDP (purchasing power parity)'
X_train, X_test, y_train, y_test = train_test_split(df[predictor_variables], df[y_colname], test_size= 0.33, random_state= 12)
lin_model = LinearRegression() 
lin_model.fit(X_train, y_train)
score = lin_model.score(X_train, y_train)
print(f'R^2 = {score}')

R^2 = 0.9959872956869708


In [182]:
test_pred_y = lin_model.predict(X_test)
mean_squared_error(y_test, test_pred_y)


0.00886623210489434

In [183]:
new_predictor_variables = ['Electricity - production', 'Crude oil - imports', 'Exports', 'Population', 
                       'Market value of publicly traded shares']
y_colname = 'GDP (purchasing power parity)'
X_train, X_test, y_train, y_test = train_test_split(df[new_predictor_variables], df[y_colname], test_size= 0.33, random_state= 12)
lin_model = LinearRegression() 
lin_model.fit(X_train, y_train)
score = lin_model.score(X_train, y_train)
print(f'R^2 = {score}')

R^2 = 0.9959214364854129


In [184]:
df.corr()

Unnamed: 0,GDP (purchasing power parity),Market value of publicly traded shares,Exports,Electricity - production,Crude oil - imports,Population,gov_type_constitutional monarchy,gov_type_other,gov_type_parliamentary democracy,gov_type_republic
GDP (purchasing power parity),1.0,0.781407,0.889052,0.986853,0.9248,0.816158,-0.019258,0.183404,-0.084441,-0.118428
Market value of publicly traded shares,0.781407,1.0,0.680342,0.767487,0.841014,0.380136,0.007385,0.134525,-0.061332,-0.098662
Exports,0.889052,0.680342,1.0,0.88216,0.861871,0.629325,0.034334,0.202372,-0.10297,-0.153372
Electricity - production,0.986853,0.767487,0.88216,1.0,0.899225,0.775195,-0.024578,0.174734,-0.07517,-0.113152
Crude oil - imports,0.9248,0.841014,0.861871,0.899225,1.0,0.708624,0.013081,0.14954,-0.082462,-0.10246
Population,0.816158,0.380136,0.629325,0.775195,0.708624,1.0,-0.04468,0.158732,-0.074296,-0.086841
gov_type_constitutional monarchy,-0.019258,0.007385,0.034334,-0.024578,0.013081,-0.04468,1.0,-0.236007,-0.105449,-0.216865
gov_type_other,0.183404,0.134525,0.202372,0.174734,0.14954,0.158732,-0.236007,1.0,-0.325306,-0.669017
gov_type_parliamentary democracy,-0.084441,-0.061332,-0.10297,-0.07517,-0.082462,-0.074296,-0.105449,-0.325306,1.0,-0.298921
gov_type_republic,-0.118428,-0.098662,-0.153372,-0.113152,-0.10246,-0.086841,-0.216865,-0.669017,-0.298921,1.0


In [69]:
corr = df[num_attr].corr()
print(corr['Electricity - production'].sort_values(ascending= False)[:10])
print("******************")
print(corr['Crude oil - imports'].sort_values(ascending= False)[:10])
print("********************")
print(corr['Exports'].sort_values(ascending= False)[:10])
print("********************")
print(corr['Population'].sort_values(ascending= False)[:10])
print("**********")
print(corr['Market value of publicly traded shares'].sort_values(ascending= False)[:10])

Electricity - production                               1.000000
Electricity - consumption                              0.999427
Electricity - installed generating capacity            0.998353
GDP (purchasing power parity)                          0.986853
Carbon dioxide emissions from consumption of energy    0.979609
Telephones - fixed lines                               0.960438
Stock of domestic credit                               0.949370
Broadband - fixed subscriptions                        0.934203
Internet users                                         0.921983
Refined petroleum products - consumption               0.916510
Name: Electricity - production, dtype: float64
******************
Crude oil - imports                            1.000000
Imports                                        0.925556
GDP (purchasing power parity)                  0.924800
Refined petroleum products - consumption       0.918029
Stock of domestic credit                       0.912237
Refined petrol