In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# Load the raw sales CSV as-is; cleaning happens in a later cell.
RAW_SALES_CSV = '../input/videogamesales/vgsales.csv'
vgsales_dirty = pd.read_csv(RAW_SALES_CSV)


## Check the dataset for null cells

In [None]:

# Number of missing values in each column of the raw frame.
vgsales_dirty.isna().sum()

In [None]:
# Keep only the rows where both 'Year' and 'Publisher' are present, then discard
# the first two columns. The explicit .copy() makes `vgsales` an independent
# frame, so the column assignment below does not trigger SettingWithCopyWarning.
vgsales = vgsales_dirty.dropna(subset=['Year', 'Publisher']).iloc[:, 2:].copy()

# 'Year' has no NaN left after the dropna above, so it can be cast to integer.
vgsales['Year'] = vgsales['Year'].astype(np.int64)

# Validate the cleaning step. (A bare `vgsales.isnull().any()` in the middle of
# a cell is never displayed, so the check is made explicit instead.)
assert not vgsales.isnull().any().any(), "vgsales still contains missing values"
vgsales.head()

# Plan
1. Analyze the dataset
2. Try to predict the missing release years of games
3. Build a sales prediction model

# Let's look at the data

In [None]:
# Games per platform; bars follow value_counts() order (most titles first).
platform_order = vgsales['Platform'].value_counts().index
sns.catplot(data=vgsales, x="Platform", kind="count", order=platform_order, aspect=3);

In [None]:
 grouped_single = vgsales.groupby(["Year", "Genre"]).agg({'Other_Sales': ['count']}).reset_index()
piv = pd.pivot_table(data=vgsales,
                    index='Year',
                    values='Other_Sales',
                    columns='Genre', aggfunc='count') 
fig, ax = plt.subplots(figsize=(20,20))
ax = sns.heatmap(piv, annot=True, fmt=".0f", square=1,ax=ax)
loc, labels = plt.yticks()
ax.set_yticklabels(rotation=0, labels=labels)
plt.title('Heatmap of genre per year', fontsize = 20) # title with fontsize 20
plt.xlabel('Genre', fontsize = 15) # x-axis label with fontsize 15
plt.ylabel('Year', fontsize = 15) # y-axis label with fontsize 15
plt.show()

In [None]:
# Subset: games published by Disney Interactive Studios on the X360 platform.
is_disney = vgsales['Publisher'] == "Disney Interactive Studios"
is_x360 = vgsales['Platform'] == "X360"
disneyInteractiveStudios = vgsales[is_disney & is_x360]

# Plot each categorical column against release year, then the genre counts.
for category in ("Genre", "Platform"):
    sns.catplot(data=disneyInteractiveStudios, x="Year", y=category, aspect=3, estimator=np.sum)
sns.catplot(data=disneyInteractiveStudios, x="Genre", kind="count", aspect=2);

In [None]:
from pandas.plotting import scatter_matrix
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

# Encode on an explicit copy. `vgsales.iloc[:, :]` is not guaranteed to be
# independent of `vgsales`, so assigning columns on it can warn or write
# through to the cleaned frame that later cells one-hot encode.
transformedCT = vgsales.copy()

# Replace each categorical column (and Year) with integer codes; the encoder
# is re-fitted for every column, so codes are only meaningful within a column.
for column in ("Platform", "Year", "Genre", "Publisher"):
    transformedCT[column] = le.fit_transform(transformedCT[column])

# Pairwise scatter plots of the un-encoded frame.
# NOTE(review): `transformedCT` may have been the intended input here - confirm.
scatter_matrix(vgsales, alpha=0.05, figsize=(20, 20));

In [None]:
# Pairwise Pearson correlation between all label-encoded / numeric columns.
transformedCT.corr(method='pearson')

As we can see, there is no notable correlation between the columns, except among the sales columns themselves. Anyway, let's try to predict the release year and the global sales.

# Try to predict year

Preparing data:

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical columns; drop_first avoids one redundant
# dummy column per category.
df = pd.get_dummies(vgsales, columns=["Genre", "Publisher", "Platform"], drop_first=True)

# Select the target by name rather than by position, so the split between
# features and target does not depend on the column order get_dummies produces.
X = df.drop(columns="Year").values
y = df["Year"].values

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the test set leaks into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Each distinct release year is treated as a class label.
# fit() returns the estimator itself, so construction and training are chained.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Share of test rows whose predicted year matches the true year exactly.
y_pred = classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.21973974957034126 - not the best result, but that was expected.

# Try to predict Global Sales.

I suggest excluding the EU_Sales and JP_Sales columns: if every regional sales column is kept, the prediction is pointless, because we can simply sum all regional sales to get Global_Sales.

In [None]:
from sklearn.preprocessing import StandardScaler

# Target: global sales, selected by name and kept 2-D (n_samples, 1) like the
# original positional slice; it is flattened later when the model is fitted.
y = vgsales[['Global_Sales']].values

# Features: everything except the target and the two excluded regional columns.
# The original built this frame and then overwrote X with dummies of the FULL
# `vgsales`, which left Global_Sales (the target itself) plus EU_Sales and
# JP_Sales among the features - i.e. target leakage.
features = vgsales.drop(columns=['EU_Sales', 'JP_Sales', 'Global_Sales'])
X = pd.get_dummies(features, columns=["Genre", "Publisher", "Platform", "Year"], drop_first=True)

# NOTE(review): the scaler is fitted on all rows before the train/test split in
# the next cell; fitting on the training rows only would avoid leaking test-set
# statistics. Left as-is so the following cells keep working unchanged.
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor; the target is flattened to 1-D for fit().
regressor = SVR(kernel='rbf')
regressor.fit(X_train, np.ravel(y_train))

In [None]:
# Predict global sales for the held-out test rows.
y_pred = regressor.predict(X=X_test)

In [None]:
from sklearn.metrics import explained_variance_score

# Fraction of the variance in the true test targets captured by the predictions.
explained_variance_score(y_true=y_test, y_pred=y_pred)


0.7813348615339388 - not bad