In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load the raw Gapminder table from the CSV file in the working directory.
dataoriginal = pd.read_csv('gapminder.csv')

In [5]:
# Keep only the country identifier and the indicator columns used in this analysis.
selected_columns = [
    "country",
    "urbanrate",
    "incomeperperson",
    "internetuserate",
    "alcconsumption",
    "femaleemployrate",
    "lifeexpectancy",
    "employrate",
]
datacopy = dataoriginal.loc[:, selected_columns]
# Work on an independent copy so later edits never touch the loaded frame.
data = datacopy.copy()

In [6]:
# Treat zeros as missing, then drop every row that has at least one missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): the dtype check further down reports every column as object, so
# at this point numeric zeros and blank entries may not be matched and this cell
# may drop nothing — confirm whether it should run after the pd.to_numeric cell.
data = data.replace(0, np.nan)
data = data.dropna()

In [7]:
# Show the working frame as the cell's rich output.
data

Unnamed: 0,country,urbanrate,incomeperperson,internetuserate,alcconsumption,femaleemployrate,lifeexpectancy,employrate
0,Afghanistan,24.04,,3.65412162280064,.03,25.6000003814697,48.673,55.7000007629394
1,Albania,46.72,1914.99655094922,44.9899469578783,7.29,42.0999984741211,76.918,51.4000015258789
2,Algeria,65.22,2231.99333515006,12.5000733055148,.69,31.7000007629394,73.131,50.5
3,Andorra,88.92,21943.3398976022,81,10.17,,,
4,Angola,56.7,1381.00426770244,9.99995388324075,5.57,69.4000015258789,51.093,75.6999969482422
...,...,...,...,...,...,...,...,...
208,Vietnam,27.84,722.807558834445,27.8518215557703,3.91,67.5999984741211,75.181,71
209,West Bank and Gaza,71.9,,36.4227717919075,,11.3000001907349,72.832,32
210,"Yemen, Rep.",30.64,610.3573673206,12.3497504635596,.2,20.2999992370605,65.493,39
211,Zambia,35.42,432.226336974583,10.124986462443,3.56,53.5,49.025,61


In [8]:
# Inspect the dtype pandas assigned to each column.
data.dtypes

country             object
urbanrate           object
incomeperperson     object
internetuserate     object
alcconsumption      object
femaleemployrate    object
lifeexpectancy      object
employrate          object
dtype: object

In [9]:
# Count the null entries in each column.
data.isnull().sum()

country             0
urbanrate           0
incomeperperson     0
internetuserate     0
alcconsumption      0
femaleemployrate    0
lifeexpectancy      0
employrate          0
dtype: int64

In [10]:
# Convert each indicator column to a numeric dtype; entries that cannot be
# parsed become NaN because of errors='coerce'.
numeric_columns = [
    "internetuserate",
    "alcconsumption",
    "incomeperperson",
    "urbanrate",
    "femaleemployrate",
    "lifeexpectancy",
    "employrate",
]
for column_name in numeric_columns:
    data[column_name] = pd.to_numeric(data[column_name], errors="coerce")

In [11]:
# Inspect the column dtypes again (this cell follows the pd.to_numeric conversion).
data.dtypes

country              object
urbanrate           float64
incomeperperson     float64
internetuserate     float64
alcconsumption      float64
femaleemployrate    float64
lifeexpectancy      float64
employrate          float64
dtype: object

In [14]:
# Income per person: counts and proportions of each distinct value, with
# missing values kept as their own row (dropna=False).
print("first values for capita GDP:")
income_series = data["incomeperperson"]
gdp_freq = pd.concat(
    {
        "counts": income_series.value_counts(sort=False, dropna=False),
        "percentages": income_series.value_counts(sort=False, dropna=False, normalize=True),
    },
    axis=1,
)
print(gdp_freq.head(5))

first values for capita GDP:
              counts  percentages
NaN               23     0.107981
8614.120219        1     0.004695
39972.352768       1     0.004695
279.180453         1     0.004695
161.317137         1     0.004695


In [15]:

def _frequency_table(column):
    """Return counts and proportions of each distinct value in data[column].

    Missing values are kept as their own row and the values are left unsorted.
    """
    values = data[column]
    return pd.concat(
        {
            "counts": values.value_counts(sort=False, dropna=False),
            "percentages": values.value_counts(sort=False, dropna=False, normalize=True),
        },
        axis=1,
    )


print("first values for urban rate:")
urbanrate_freq = _frequency_table("urbanrate")
print(urbanrate_freq.head(5))

print("first values for capita GDP:")
gdp_freq = _frequency_table("incomeperperson")
print(gdp_freq.head(5))

print("Internet Usage: ")
internet_freq = _frequency_table("internetuserate")
print(internet_freq.head(5))

print("employ rate: ")
employ_freq = _frequency_table("employrate")
print(employ_freq.head(5))


first values for urban rate:
       counts  percentages
92.0        1     0.004695
100.0       6     0.028169
74.5        1     0.004695
NaN        10     0.046948
73.5        1     0.004695
first values for capita GDP:
              counts  percentages
NaN               23     0.107981
8614.120219        1     0.004695
39972.352768       1     0.004695
279.180453         1     0.004695
161.317137         1     0.004695
Internet Usage: 
           counts  percentages
81.000000       1     0.004695
66.000000       1     0.004695
45.000000       1     0.004695
NaN            21     0.098592
2.100213        1     0.004695
employ rate: 
      counts  percentages
50.5       1     0.004695
NaN       35     0.164319
61.5       3     0.014085
46.0       2     0.009390
64.5       1     0.004695


In [16]:
# Bucket income per person into four equal-width bins and tabulate the labels.
print('Income per person in categories')
income_labels = ['low', 'medium', 'high', 'very high']
data['incomelabel'] = pd.cut(data['incomeperperson'], 4, labels=income_labels)

label_counts = data['incomelabel'].value_counts(sort=False, dropna=False)
label_shares = data['incomelabel'].value_counts(sort=False, dropna=False, normalize=True)
income_freq = pd.concat({'counts': label_counts, 'percentages': label_shares}, axis=1)
print("Frequency distribution - income per person:\n", income_freq)

# List the countries that fall in the two upper bins.
print('Countries with high and very high GDP')
highincome = data[data['incomelabel'].isin(['high', 'very high'])]
top_income_view = highincome[['country', 'incomeperperson', 'incomelabel']]
print(top_income_view.sort_values(by='incomelabel', ascending=False))

Income per person in categories
Frequency distribution - income per person:
            counts  percentages
low           169     0.793427
medium         18     0.084507
high            1     0.004695
very high       2     0.009390
NaN            23     0.107981
Countries with high and very high GDP
           country  incomeperperson incomelabel
109  Liechtenstein     81647.100031   very high
127         Monaco    105147.437697   very high
20         Bermuda     62682.147006        high


In [19]:
# Per-country summaries: group size, then mean / median / row count for
# income per person, urban rate and internet use rate.
# NOTE(review): the earlier comment mentioned a continent column, but this cell
# only groups by 'country'; in the output shown each country has one row, so the
# mean and median equal that country's own value.
# String aggregation names are used instead of np.mean / np.median because
# passing NumPy callables to GroupBy.agg is deprecated in recent pandas.

print('Countries by country')
country_counts = data.groupby('country').size()
print(country_counts)
print('\n')

print('GDP Statistics by country')
gdp_mean = data.groupby('country')['incomeperperson'].agg(['mean', 'median', len])
print(gdp_mean)

print('Urban rate by country')
urban_mean = data.groupby('country')['urbanrate'].agg(['mean', 'median', len])
print(urban_mean)


# The internet-use summary gets its own heading and variable; it previously
# reused the urban-rate heading and overwrote urban_mean.
print('Internet use rate by country')
internet_mean = data.groupby('country')['internetuserate'].agg(['mean', 'median', len])
print(internet_mean)

Countries by country
country
Afghanistan           1
Albania               1
Algeria               1
Andorra               1
Angola                1
                     ..
Vietnam               1
West Bank and Gaza    1
Yemen, Rep.           1
Zambia                1
Zimbabwe              1
Length: 213, dtype: int64


GDP Statistics by country
                            mean        median  len
country                                            
Afghanistan                  NaN           NaN  1.0
Albania              1914.996551   1914.996551  1.0
Algeria              2231.993335   2231.993335  1.0
Andorra             21943.339898  21943.339898  1.0
Angola               1381.004268   1381.004268  1.0
...                          ...           ...  ...
Vietnam               722.807559    722.807559  1.0
West Bank and Gaza           NaN           NaN  1.0
Yemen, Rep.           610.357367    610.357367  1.0
Zambia                432.226337    432.226337  1.0
Zimbabwe              320.771