Hello reader,  
  
I'm a data science beginner, and I used this dataset to practice.  
  
The goal I set is: predict sales (starting with global sales, then possibly moving on to the others) based on Platform, Year, Genre, and Publisher.  
I will use the Mean Squared Error as the scoring method.  
  
Feel free to comment if you have any suggestions or advice.  

Thank you !

**Todo**:
* Better plots
* Shrinkage linear regressors (lasso, ridge, LAR ?)
*

## Packages

In [None]:
import pandas as pd
import sklearn as sk
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, mean_squared_error

# Data import and cleaning

In [None]:
# Load the raw sales table from the relative input directory.
csv_path = '../input/vgsales.csv'
df = pd.read_csv(csv_path)

In [None]:
# Order rows by global sales (highest first), then keep only the first --
# i.e. best-selling -- row for each (Name, Platform, Year) combination.
df = (
    df.sort_values(by='Global_Sales', ascending=False)
      .drop_duplicates(subset=['Name', 'Platform', 'Year'], keep='first')
)

In [None]:
# Discard rows whose Year is missing; Year is used as a feature further down.
df = df[df['Year'].notnull()]

In [None]:
# Number of rows remaining after cleaning.
df.shape[0]

# Features and Target

In [None]:
# One-hot encode the three categorical predictors straight from `df`.
# A single get_dummies call keeps `df`'s row order. That matters because the
# target is later read from `df` positionally, whereas the previous chain of
# outer joins is documented to sort the resulting index and could therefore
# reorder the feature rows relative to the target.
# Dummy columns are now named "<column>_<value>" (e.g. "Platform_...").
df_features = pd.get_dummies(df[['Platform', 'Genre', 'Publisher']])

# Append the release year as a plain numeric feature (aligned on the index).
df_features['Year'] = df['Year']

# Use one float dtype for every column so that converting the frame to a NumPy
# array yields a numeric matrix even when pandas emits boolean dummy columns.
df_features = df_features.astype(float)
len(df_features)

In [None]:
# Feature matrix as a plain NumPy array (rows in df_features order).
X = df_features.values

In [None]:
# Target vector: global sales, in the row order of `df`.
y = df['Global_Sales'].values

In [None]:
# Hold out 20% of the rows, with a fixed seed so the split is reproducible.
# NOTE(review): no cell visible in this notebook reads these four arrays;
# all scoring below is cross-validated on the full X, y.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=42)

# Models testing

### Simple Linear Regression

In [None]:
# Linear-regression baseline with default settings.
lr1 = LinearRegression()

In [None]:
# 10-fold cross-validation of the linear baseline on the full data set.
# The scorer returns negated MSE, so values closer to zero are better.
lr1_scores = cross_val_score(
    estimator=lr1, X=X, y=y,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # run the folds on all available cores
    verbose=1,
)

In [None]:
# Show the per-fold negated-MSE scores of the linear baseline.
lr1_scores

### Simple Random Forest

In [None]:
# Small random forest (10 trees) used as a non-linear baseline.
# The seed makes its cross-validation scores repeatable from run to run,
# in line with the fixed seed already used for the train/test split.
rf1 = RandomForestRegressor(n_estimators=10, random_state=42)

In [None]:
# 10-fold cross-validation of the untuned forest on the full data set.
# The scorer returns negated MSE, so values closer to zero are better.
rf1_scores = cross_val_score(
    estimator=rf1, X=X, y=y,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # run the folds on all available cores
    verbose=1,
)

In [None]:
# Show the per-fold negated-MSE scores of the untuned forest.
rf1_scores

### Tuned Random Forest

In [None]:
# Hyper-parameter grid explored for the random forest.
rf_params = {
    'n_estimators': [50, 200, 400],
    'min_samples_leaf': [2, 5, 10],
    # 1.0 means "consider every feature at each split", which is what the
    # former 'auto' option meant for regressors; recent scikit-learn
    # releases no longer accept 'auto'.
    'max_features': [1.0, 'sqrt'],
}

# Exhaustive search over the grid, scored by 10-fold negated MSE.
# The forest is seeded so the selected parameters are repeatable.
rf_gs = GridSearchCV(RandomForestRegressor(random_state=42),
                     param_grid=rf_params,
                     scoring='neg_mean_squared_error',
                     n_jobs=-1,  # evaluate candidates on all available cores
                     verbose=2,
                     cv=10)

I tried more hyperparameters than these; the ones kept here were the most promising.  
Since the dataset is really small, an exhaustive grid search stays affordable and lets us try many different combinations.  

In [None]:
# Run the grid search on the full data set.
# NOTE(review): the same X, y are reused further down to cross-validate the
# tuned model, so that score is likely optimistic -- consider tuning on
# X_train / y_train only.
rf_gs.fit(X, y)

In [None]:
# Show the parameter combination that achieved the best mean CV score.
rf_gs.best_params_

In [None]:
# Keep the forest configured with the best parameters found by the search.
rf2 = rf_gs.best_estimator_

In [None]:
# 10-fold cross-validation of the tuned forest on the full data set.
# The scorer returns negated MSE, so values closer to zero are better.
rf2_scores = cross_val_score(
    estimator=rf2, X=X, y=y,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # run the folds on all available cores
    verbose=1,
)

In [None]:
# Show the per-fold negated-MSE scores of the tuned forest.
rf2_scores

# Score Board

In [None]:
# Gather the per-fold scores of every model: one row per fold, one column
# per model.
model_names = ['simple LR', 'simple RF', 'tuned RF']
model_scores = [lr1_scores, rf1_scores, rf2_scores]
results = pd.DataFrame(np.column_stack(model_scores), columns=model_names)

# Summary statistics (count, mean, std, quartiles) for each model's scores.
results.describe()

In [None]:
# Box plot of the per-fold scores for every model, full value range.
results.boxplot()
plt.show()

In [None]:
# Same box plot with the y-axis clipped to [-30, 0]; presumably this keeps
# the most extreme folds from flattening the boxes.
results.boxplot()
axes = plt.gca()
axes.set_ylim(-30, 0)
plt.show()

In [None]:
# Same box plot again, zoomed further to scores between -0.5 and 0.
results.boxplot()
axes = plt.gca()
axes.set_ylim(-0.5, 0)
plt.show()

## DRAFT

In [None]:
# Median (50th percentile) of the linear baseline's per-fold scores.
np.percentile(a=lr1_scores, q=50)

In [None]:
# Distinct values present in the Year feature.
df_features['Year'].unique()