In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# which makes it easy to check the dataset path used by read_csv below.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Load the sales CSV into `df`, the raw frame later cells select columns from.
# NOTE(review): the listing above uses the absolute '/kaggle/input' while this
# path is relative ('../input/...'); presumably both point at the same mount
# when the working directory is /kaggle/working - confirm if run elsewhere.
df = pd.read_csv("../input/videogames-sales-dataset/Video_Games_Sales_as_at_22_Dec_2016.csv", encoding='utf-8')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a two-feature classifier's decision regions with the samples on top.

    The function draws on the current matplotlib figure; it does not call
    ``plt.show()`` and returns nothing, so the caller adds labels/legend.

    Parameters
    ----------
    X : array-like of shape (n_samples, 2)
        Feature matrix; column 0 goes on the x axis, column 1 on the y axis.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Class label of every row of ``X``. At most five distinct classes are
        supported because only five markers/colours are defined.
    classifier : object
        Fitted estimator exposing ``predict`` for an ``(n_points, 2)`` array.
    test_idx : sequence of int or None, optional
        Row positions in ``X`` to circle as test examples. ``None`` or an
        empty sequence disables the highlight.
    resolution : float, optional
        Step of the mesh grid on which the classifier is evaluated.
    """
    # Accept DataFrames / column vectors as well as plain ndarrays.
    X = np.asarray(X)
    y = np.asarray(y).ravel()

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface on a grid padded by one unit on every side
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # one scatter call per class so each class gets its own marker and legend entry
    for idx, cl in enumerate(classes):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=cl,
                    edgecolor='black')

    # highlight test examples
    # `is not None` + len() instead of truthiness: a NumPy index array has no
    # unambiguous truth value and would raise in a bare `if test_idx:`.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[test_idx, :]

        # Hollow circles: facecolors='none' is the supported spelling;
        # the former c='' is rejected by current matplotlib with a ValueError.
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolor='black',
                    alpha=1.0,
                    linewidth=1,
                    marker='o',
                    s=100,
                    label='test set')


In [None]:
# Two sales figures are the model inputs; the user score becomes the target.
feature_columns = ["EU_Sales", "NA_Sales"]
marker_column = ["User_Score"]
# Take an explicit copy: the frame is modified in later cells, and modifying a
# bare column selection of `df` triggers pandas' SettingWithCopyWarning.
relevant_data = df[feature_columns + marker_column].copy()
relevant_data.head()

In [None]:
# validating data
# The frame is rebuilt from `df` on every run so the cell is idempotent:
# thresholding an already-binarised 0/1 column again would turn every label into 0.
score_column = marker_column[0]
relevant_data = (
    df[feature_columns + marker_column]
    # Coerce the score to numbers first; anything non-numeric becomes NaN and is
    # dropped together with the genuinely missing values instead of breaking the
    # `>=` comparison. (Presumably the raw column holds placeholders such as
    # "tbd" - verify against the CSV.)
    .assign(**{score_column: lambda frame: pd.to_numeric(frame[score_column], errors="coerce")})
    .dropna(axis=0)
    # Binary target: 1 for a user score of at least 8.5, otherwise 0.
    .assign(**{score_column: lambda frame: (frame[score_column] >= 8.5).astype(int)})
)
relevant_data.head()

In [None]:
# Pairwise scatter plots / histograms of the two sales features and the binary target.
sns.pairplot(data=relevant_data)

In [None]:
# Split the cleaned rows 50/50 into train and test; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    relevant_data,
    test_size=0.5,
    random_state=1,
)

In [None]:
# Single decision tree on the two sales features: Gini impurity, no depth limit, fixed seed.
tree_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    random_state=1,
).fit(train_data[feature_columns], train_data[marker_column])
tree_model

In [None]:
# Accuracy of the tree on the half it was fitted on and on the held-out half,
# followed by the size of the fitted tree.
tree_train_accuracy = tree_model.score(train_data[feature_columns], train_data[marker_column])
tree_test_accuracy = tree_model.score(test_data[feature_columns], test_data[marker_column])
print("mean accuracy on train data = {}".format(tree_train_accuracy))
print("mean accuracy on test data = {}".format(tree_test_accuracy))
print("depth of the tree = {}, number of leaves = {}".format(
    tree_model.get_depth(), tree_model.get_n_leaves()))

In [None]:
# Show the first 50 cleaned rows on top of the tree's decision regions.
plot_subset = relevant_data.iloc[:50]
# The train/test split was random, so positions 25-49 are not "the test set".
# Circle only the plotted rows whose index label really ended up in test_data.
# A plain list is passed so that an empty selection simply disables the highlight.
held_out_positions = np.flatnonzero(plot_subset.index.isin(test_data.index)).tolist()

plot_decision_regions(plot_subset[feature_columns].values,
                      plot_subset[marker_column].values.ravel(),
                      classifier=tree_model,
                      test_idx=held_out_positions)

plt.xlabel('EU Sales')
plt.ylabel('NA Sales')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Random forest on the same two sales features as the single tree.
forest_model = RandomForestClassifier(criterion='gini',  # impurity measure used to pick splits
                                n_estimators=25,         # number of trees in the ensemble
                                bootstrap=True,          # each tree is fitted on a bootstrap sample (drawn with replacement)
                                random_state=1,          # fixed seed for reproducible results
                                n_jobs=2)                # parallelize computation over 2 jobs

# Pass the target as a 1-D array: a single-column DataFrame makes the forest's
# fit() emit a DataConversionWarning ("a column-vector y was passed ...").
forest_model.fit(train_data[feature_columns], train_data[marker_column].values.ravel())

In [None]:
# Accuracy of the forest on the half it was fitted on and on the held-out half.
forest_train_accuracy = forest_model.score(train_data[feature_columns], train_data[marker_column])
forest_test_accuracy = forest_model.score(test_data[feature_columns], test_data[marker_column])
print("mean accuracy on train data = {}".format(forest_train_accuracy))
print("mean accuracy on test data = {}".format(forest_test_accuracy))

In [None]:
# Show the first 50 cleaned rows on top of the forest's decision regions.
plot_subset = relevant_data.iloc[:50]
# The train/test split was random, so positions 25-49 are not "the test set".
# Circle only the plotted rows whose index label really ended up in test_data.
# A plain list is passed so that an empty selection simply disables the highlight.
held_out_positions = np.flatnonzero(plot_subset.index.isin(test_data.index)).tolist()

plot_decision_regions(plot_subset[feature_columns].values,
                      plot_subset[marker_column].values.ravel(),
                      classifier=forest_model,
                      test_idx=held_out_positions)
plt.xlabel('EU Sales')
plt.ylabel('NA Sales')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()