In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

# Lift pandas' display truncation so wide/long frames render in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [139]:
# Load
# Read the raw CSV into `data`; the cells that follow read and mutate this frame.
# NOTE(review): the file name refers to a breast-cancer dataset, yet the later cells
# use country-level columns ('Country', 'GDP', ...), and this cell's execution count
# (139) is higher than that of every other visible cell. The path was presumably
# edited after the analysis was run -- confirm the intended CSV before a
# Restart & Run All.
data = pd.read_csv('./../../Datasets/wisconsin_breast_cancer.csv')

In [117]:
# Preserve the country labels, then remove the first two columns from `data`.
#
# The original cell dropped columns by position with inplace=True, so running it
# a second time silently removed two *more* columns (and then failed on the
# 'Country' lookup). The guard below makes the cell safe to re-run: the drop only
# happens while 'Country' is still present.
# NOTE(review): assumes 'Country' is one of the first two columns (the other
# presumably being a saved row index) -- confirm against the raw CSV.
if 'Country' in data.columns:
    countries = data[['Country']]

    # Deleting unused columns (rebinding instead of mutating in place)
    data = data.drop(columns=data.columns[:2])
elif 'countries' not in globals():
    # Neither the column nor a previously saved copy exists: wrong input file.
    raise KeyError("'Country' column not found in data; was the expected dataset loaded?")


In [118]:
# Duplicates: flag every row that repeats another row in full, then count the flags.
duplicate_rows = data.duplicated()
duplicate_rows.sum()

0

In [119]:
# Missing values: null count per column, collapsed into one total for the frame.
nulls_per_column = data.isnull().sum()
nulls_per_column.sum()

0

In [120]:
# Label-encode every object-dtype column of `data` in place.
#
# A fresh LabelEncoder is fitted per column and kept in `encoders` (keyed by
# column name). Re-fitting one shared encoder, as before, overwrote its learned
# classes on every iteration, so only the last encoded column could be decoded
# with inverse_transform afterwards.
# NOTE(review): label encoding assigns arbitrary integer codes; for nominal
# columns those codes carry no real ordering even though the models used later
# treat them as numeric -- consider one-hot encoding or dropping such columns.
encoder = LabelEncoder()  # name kept for compatibility; ends up as the last fitted encoder
encoders = {}
for col in data.columns:
    if data[col].dtype == 'object':
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col])
        encoders[col] = encoder

In [121]:
# data.groupby('Official language')['GDP'].agg(['count', 'max', 'min', 'mean']).sort_values(by='count', ascending=False)
# data.groupby('Official language')[['GDP', 'Population']].agg(['count', 'max', 'min', 'mean'])

In [134]:
# Preview the first five rows of `data`.
data.head(n=5)

Unnamed: 0,Density\n(P/Km2),Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,Calling Code,Co2-Emissions,CPI,CPI Change (%),Fertility Rate,Forested Area (%),Gasoline Price,GDP,Gross primary education enrollment (%),Gross tertiary education enrollment (%),Infant mortality,Life expectancy,Maternal mortality ratio,Minimum wage,Official language,Out of pocket health expenditure,Physicians per thousand,Population,Population: Labor force participation (%),Tax revenue (%),Total tax rate,Unemployment rate,Urban_population,Latitude,Longitude
0,60.0,58.1,652230.0,323000.0,32.49,93.0,8672.0,149.9,2.3,4.47,2.1,0.7,19101350000.0,104.0,9.7,47.9,64.5,638.0,0.43,50,78.4,0.28,38041754.0,48.9,9.3,71.4,11.12,9797273.0,33.93911,67.709953
1,105.0,43.1,28748.0,9000.0,11.78,355.0,4536.0,119.05,1.4,1.62,28.1,1.36,15278080000.0,107.0,55.0,7.8,78.5,15.0,1.12,1,56.9,1.2,2854191.0,55.7,18.6,36.6,12.33,1747593.0,41.153332,20.168331
2,18.0,17.4,2381741.0,317000.0,24.28,213.0,150006.0,151.36,2.0,3.02,0.8,0.28,169988200000.0,109.9,51.4,20.1,76.7,112.0,0.95,3,28.1,1.72,43053054.0,41.2,37.2,66.1,11.7,31510100.0,28.033886,1.659626
3,164.0,40.0,468.0,31000.0,7.2,376.0,469.0,125.34,2.3,1.27,34.0,1.51,3154058000.0,106.4,31.2,2.7,73.2,53.0,6.63,10,36.4,3.33,77142.0,62.45,16.3,37.2,5.36,67873.0,42.506285,1.521801
4,26.0,47.5,1246700.0,117000.0,40.73,244.0,34693.0,261.73,17.1,5.52,46.3,0.97,94635420000.0,113.5,9.3,51.6,60.8,241.0,0.71,53,33.4,0.21,31825295.0,77.5,9.2,49.1,6.89,21061025.0,-11.202692,17.873887


In [122]:
# Pairwise correlation between all columns of `data`, rendered as a colour-graded table.
matrix = data.corr()
matrix_heatmap = matrix.style.background_gradient(cmap='viridis')
matrix_heatmap

Unnamed: 0,Density (P/Km2),Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,Calling Code,Co2-Emissions,CPI,CPI Change (%),Fertility Rate,Forested Area (%),Gasoline Price,GDP,Gross primary education enrollment (%),Gross tertiary education enrollment (%),Infant mortality,Life expectancy,Maternal mortality ratio,Minimum wage,Official language,Out of pocket health expenditure,Physicians per thousand,Population,Population: Labor force participation (%),Tax revenue (%),Total tax rate,Unemployment rate,Urban_population,Latitude,Longitude
Density (P/Km2),1.0,-0.033645,-0.053637,-0.023926,-0.147679,0.006136,-0.019218,-0.025382,-0.02964,-0.07434,-0.032696,0.224237,-0.01879,0.003602,0.019196,-0.107245,0.064183,-0.062929,0.249618,-0.054585,-0.092285,0.204563,-0.01787,0.019065,-0.01449,-0.039397,-0.055211,-0.023438,0.064664,0.031936
Agricultural Land( %),-0.033645,1.0,-0.031372,0.040478,0.195054,-0.06676,0.061898,-0.007213,-0.013782,0.176654,-0.434566,0.045731,0.051623,-0.046669,-0.109088,0.203945,-0.240095,0.201359,-0.002432,0.088901,0.131005,-0.035146,0.117165,-0.108046,-0.052135,0.15964,0.067871,0.101364,-0.034182,-0.016128
Land Area(Km2),-0.053637,-0.031372,1.0,0.558671,-0.06363,-0.23026,0.59151,0.025093,0.038532,-0.059989,-0.014244,-0.183163,0.550855,0.006032,0.223394,-0.061995,0.052961,-0.041482,0.14074,0.132014,-0.012785,0.074271,0.446891,-0.006003,-0.166227,0.092846,0.053351,0.547687,0.048496,0.01006
Armed Forces size,-0.023926,0.040478,0.558671,1.0,-0.129806,-0.18249,0.742572,0.044282,0.064723,-0.136183,-0.038531,-0.161453,0.608858,0.049768,0.114691,-0.067275,0.071427,-0.086805,-0.011705,0.166837,0.152065,0.022056,0.87723,-0.077574,-0.189545,0.095963,-0.017489,0.852839,0.125502,0.169243
Birth Rate,-0.147679,0.195054,-0.06363,-0.129806,1.0,0.069782,-0.14956,0.140209,0.105081,0.977829,-0.073472,-0.197649,-0.179423,-0.08045,-0.710592,0.866677,-0.867507,0.761649,-0.431682,-0.077468,0.252097,-0.730514,-0.051323,0.167487,-0.353261,0.196852,-0.029077,-0.1057,-0.489723,0.032407
Calling Code,0.006136,-0.06676,-0.23026,-0.18249,0.069782,1.0,-0.156016,-0.072969,-0.085789,0.025224,-0.096186,-0.180507,-0.182437,-0.035105,-0.224971,-0.031832,-0.033359,-0.131787,-0.174715,-0.088247,0.01279,-0.041659,-0.165742,0.050682,-0.04803,-0.242073,-0.113109,-0.1916,0.012193,0.193684
Co2-Emissions,-0.019218,0.061898,0.59151,0.742572,-0.14956,-0.156016,1.0,-0.014447,-0.00104,-0.136899,-0.027207,-0.068345,0.916978,0.00193,0.157411,-0.118478,0.117827,-0.100474,0.090509,0.128493,-0.028575,0.055886,0.810229,-0.013108,-0.143687,0.06371,0.012696,0.926331,0.1142,0.071017
CPI,-0.025382,-0.007213,0.025093,0.044282,0.140209,-0.072969,-0.014447,1.0,0.894924,0.13767,0.002757,-0.252502,-0.02363,-0.15917,0.003105,0.184591,-0.176028,0.29933,-0.083903,-0.003557,0.15763,-0.056019,-0.001572,0.019967,-0.056198,0.041498,0.126116,-0.005858,-0.076429,-0.029125
CPI Change (%),-0.02964,-0.013782,0.038532,0.064723,0.105081,-0.085789,-0.00104,0.894924,1.0,0.094966,0.013309,-0.258681,-0.007797,-0.124938,0.050153,0.154031,-0.144924,0.222862,-0.086926,0.043776,0.133698,-0.042776,0.00907,-0.016684,-0.077064,0.114516,0.118692,0.010888,-0.089377,-0.080274
Fertility Rate,-0.07434,0.176654,-0.059989,-0.136183,0.977829,0.025224,-0.136899,0.13767,0.094966,1.0,-0.062967,-0.124847,-0.156987,-0.13929,-0.671396,0.851826,-0.846445,0.775334,-0.363421,-0.104825,0.203904,-0.66276,-0.051936,0.154608,-0.35401,0.207008,-0.060654,-0.100484,-0.444566,0.024624


In [123]:
# Absolute correlation of every column with GDP, strongest first.
# Built as one chain: vectorised .abs() replaces the per-element lambda, and
# sort_values returns a new frame instead of mutating in place, so re-running
# the cell always yields the same result. reset_index happens before the sort
# so each row keeps its original column position as the index label.
gdp_corr = (
    data.corrwith(data['GDP'])
    .abs()
    .reset_index()
    .rename(columns={'index': 'features', 0: 'values'})
    .sort_values(by='values', ascending=False)
)
gdp_corr.style.background_gradient(cmap='viridis')

Unnamed: 0,features,values
12,GDP,1.0
6,Co2-Emissions,0.916978
27,Urban_population,0.784549
22,Population,0.632505
3,Armed Forces size,0.608858
2,Land Area(Km2),0.550855
18,Minimum wage,0.218397
14,Gross tertiary education enrollment (%),0.214354
5,Calling Code,0.182437
4,Birth Rate,0.179423


In [124]:
# Target is the GDP column; every remaining column is used as a feature.
# (Candidate reduced feature set: ['Co2-Emissions', 'Urban_population', 'Armed Forces size'])
y = data['GDP']
X = data.drop(columns='GDP')

In [125]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [133]:
def run_classifier(k_):
    """Fit a k-nearest-neighbours regressor and report its R² on the test split.

    Despite the function's name, the estimator is a ``KNeighborsRegressor``
    (the target is continuous). The function reads the module-level
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    k_ : int
        Number of neighbours used for each prediction.

    Returns
    -------
    float
        R² of the predictions on ``X_test``. The score is also printed next to
        ``k_``. (Earlier this function returned an empty-string placeholder,
        which made the computed score unavailable to callers.)
    """
    regressor = KNeighborsRegressor(n_neighbors=k_, metric='euclidean', algorithm='auto')
    regressor.fit(X_train, y_train)

    # NOTE(review): no feature scaling is applied in the visible cells, so the
    # Euclidean distance is presumably dominated by the largest-magnitude
    # columns -- consider standardising the features before fitting.
    r2 = r2_score(y_test, regressor.predict(X_test))
    print(k_, r2)
    return r2


In [130]:
# Sweep odd neighbour counts from 3 to 29. run_classifier is called for its
# printed "k score" line, so a plain loop is used; the previous list
# comprehension only produced a throwaway list that cluttered the cell output.
for k in range(3, 30, 2):
    run_classifier(k)

3 0.4402074604055741
5 0.32876744276195435
7 -0.4226657059344534
9 -0.6023633299931492
11 -0.394514238760425
13 -0.49143273380165575
15 -0.3248984184398558
17 -0.2348449067694267
19 -0.2756147830025728
21 -0.22719627443811619
23 -0.1210715576903687
25 -0.08092671188846201
27 -0.06771910482432264
29 -0.09539626735542472


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [128]:
# Baseline: ordinary least-squares regression on all features, scored with R²
# on the held-out split. fit() returns the estimator, so it can be chained.
classifier = LinearRegression().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

r2_score(y_true=y_test, y_pred=y_pred)

-5.750002246870601

In [129]:
# KNN regressor with scikit-learn's default hyper-parameters, scored with R²
# on the held-out split.
classifier = KNeighborsRegressor().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

r2_score(y_true=y_test, y_pred=y_pred)



0.32876744276195435