In [None]:
import numpy as np 
import pandas as pd

# Read the Gapminder alcohol-consumption CSV into a DataFrame and preview
# its first rows.
df = pd.read_csv(
    filepath_or_buffer='../input/alcohol-consumption/gapminder_alcohol.csv',
)
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Show the row(s) whose country column equals 'Poland'.
df.loc[df['country'].eq('Poland')]

# Which nations lead in alcohol consumption?

In [None]:
# Ten rows with the highest alcconsumption values, largest first.
df.nlargest(n=10, columns='alcconsumption')

In [None]:
# Summary statistics of the frame, using pandas' default describe() settings.
df.describe()

In [None]:
# Box plot of the alcconsumption column to eyeball its spread and outliers.
df['alcconsumption'].plot.box()

# Can we observe any relationship between income and alcohol consumption?

In [None]:
# Scatter of income per person (x) against alcohol consumption (y).
df.plot.scatter(x='incomeperperson', y='alcconsumption')

## Let's start preparing the model

In [None]:
# Keep only rows with a usable target. A `>= 0` comparison is False for NaN,
# so records with a missing alcconsumption are dropped here (as would be any
# negative value).
df = df[df['alcconsumption'].ge(0)]

# Target is the alcconsumption column; features are every remaining column
# except the country label.
y = df['alcconsumption']
X = df.drop(labels=['country', 'alcconsumption'], axis='columns')

In [None]:
# train/test split
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=22,
)
# Report the resulting partition sizes.
print('Train: ', X_train.shape)
print('Test: ', X_test.shape)

In [None]:
# imputing missing values
from sklearn.impute import SimpleImputer
# Learn the fill values from the training split only, then apply that same
# fitted imputer to both splits so no information from the test rows is used
# during fitting.
imp = SimpleImputer().fit(X_train)
X_train = imp.transform(X_train)
X_test = imp.transform(X_test)

In [None]:
# Wrap the imputed matrices back into DataFrames and reattach the feature
# names taken from X, so later cells can refer to columns by name.
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [None]:
# Summary statistics of the training features after imputation.
X_train.describe()

## Random forest regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest of 100 trees with a fixed seed for reproducibility.
# max_samples=0.7 and min_samples_leaf=10 constrain the individual trees
# (presumably chosen to curb overfitting on this small dataset).
reg = RandomForestRegressor(n_estimators=100, random_state=22, max_samples=0.7, min_samples_leaf=10)
reg.fit(X_train, y_train)

# score() on a regressor returns the coefficient of determination (R^2).
# Label the two numbers so the output is self-explanatory, in the same way
# the MAE printout labels its train/test values.
print('Train R^2: ', reg.score(X_train, y_train))
print('Test R^2: ', reg.score(X_test, y_test))

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the fitted forest on each split, train first.
train_pred = reg.predict(X_train)
print('Train MAE: ', mean_absolute_error(y_train, train_pred))
test_pred = reg.predict(X_test)
print('Test MAE: ', mean_absolute_error(y_test, test_pred))

## Impact of input variables on the predictions

In [None]:
# shap
import shap
# Build a tree explainer for the fitted forest, compute SHAP values for the
# held-out rows, and draw the summary plot against those same feature values.
explainer = shap.TreeExplainer(reg)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, features=X_test)