# Get the dataset
wget https://raw.githubusercontent.com/otacke/udacity-machine-learning-engineer/master/submissions/capstone_project/data/Video_Games_Sales_as_at_22_Dec_2016.csv

## Move it to the proper location (run from the directory where wget saved the file; the GitHub "blob" URL would download an HTML page instead of the CSV, hence the raw URL above):
mv Video_Games_Sales_as_at_22_Dec_2016.csv ~/anaconda3/envs/glmnet/input

## Description of the dataset:
https://www.kaggle.com/gregorut/videogamesales
## Great example
https://www.kaggle.com/ignacioch/predicting-vg-hits-1-million-sales-with-lr-rfc


In [None]:
%matplotlib inline
import sys,scipy,joblib,importlib,pprint,matplotlib.pyplot as plt,warnings,glmnet_python,pandas as pd \
,numpy as np,seaborn as sns, random
from glmnet import glmnet; from glmnetPlot import glmnetPlot
from glmnetPrint import glmnetPrint; from glmnetCoef import glmnetCoef; from glmnetPredict import glmnetPredict
from cvglmnet import cvglmnet; from cvglmnetCoef import cvglmnetCoef
from cvglmnetPredict import cvglmnetPredict;from cvglmnetPlot import cvglmnetPlot; 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import metrics
from math import sqrt

# Loading the dataset as a dataframe

## Use meaningful names for your dataframes

In [None]:
# Read the raw sales CSV from the environment's input folder into a dataframe.
videogames_2016 = pd.read_csv(
    '~/anaconda3/envs/glmnet/input/Video_Games_Sales_as_at_22_Dec_2016.csv',
    encoding="utf-8",
)
# Show the first five rows as the cell output.
videogames_2016.head(n=5)

In [None]:
# Since the total sales are the sum of the regional sales columns in the dataframe, it is best to remove those first.
# The name of the game can be seen as a unique identifier
# (although a marketing strategy could be built on a plumber and his brother(s)), so for now it is dropped from the frame,
# and the Global_Sales column is renamed to target.

In [None]:
# Remove the game name and the regional sales columns, then expose the global
# sales figure under the name 'target'.
cols = ['Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
# index=str maps every row label through str(); columns=... renames the target.
videogames_2016 = (
    videogames_2016
    .drop(columns=cols)
    .rename(index=str, columns={'Global_Sales': 'target'})
)

# check for missing values in dataframe and handle them

In [None]:
# Boolean mask of the missing cells, computed once and reused by all three reports.
missing_mask = videogames_2016.isnull()
# 1) is there any missing value at all?
print('the check for missing values in this dataframe returns:'+str(missing_mask.values.any()))
# 2) total number of missing cells over the whole frame.
print('in this dataframe a total of '+str(missing_mask.sum().sum())+' NaN values are present')
# 3) number of missing cells per column.
print('in the table below the numbers of NaN values per column are listed ''\n'+str(missing_mask.sum()))

## Since this is a tutorial on GLMNET, let's drop all the rows that have any NaN from the original dataframe

In [None]:
# Discard every row that holds at least one missing value, then re-run the
# any-missing check on the reduced frame.
videogames_2016 = videogames_2016.dropna(axis=0, how='any')
print('the check for missing values in this dataframe returns:'+str(videogames_2016.isnull().values.any()))

In [None]:
# Report per column whether it is stored as float64/int64 (reported as numeric)
# or as anything else (reported as string).
numeric_dtypes = (np.float64, np.int64)
for col, col_dtype in videogames_2016.dtypes.items():
    if col_dtype not in numeric_dtypes:
        print('column '+col+' is string type')
        continue
    print('column '+col+' is numeric type')

## Data wrangling: correct data type of column user_score to be of type numeric.

In [None]:
# Cast User_Score to float64 and print its dtype before and after the cast.
user_score_dtype_before = videogames_2016['User_Score'].dtype
print('before the type conversion, the data type of User_Score = '+str(user_score_dtype_before))
videogames_2016['User_Score'] = videogames_2016['User_Score'].astype(np.float64)
print('after the type conversion, the data type of User_Score = '+str(videogames_2016['User_Score'].dtype))

In [None]:
# Print the summary table returned by DataFrame.describe() for the cleaned frame.
print( videogames_2016.describe())

In [None]:
# Categorical columns that are expanded into indicator (dummy) columns.
dummies = ['Platform', 'Genre', 'Publisher', 'Developer', 'Rating']
# drop_first=True drops the first level of every categorical column.
# dtype is pinned to uint8 (the historical pandas default): pandas >= 2.0 would
# otherwise emit bool columns, and mixing bool with the float columns makes
# `.values` return an object array instead of a numeric design matrix.
videogames_2016_d = pd.get_dummies(
    videogames_2016,
    columns=dummies,
    drop_first=True,
    dtype=np.uint8,
)

In [None]:
# Display five randomly sampled rows of the dummy-encoded frame (no seed is set,
# so the rows shown differ between runs).
videogames_2016_d.sample(5)

# Create train- and test set

In [None]:
# Split the encoded frame into predictors (everything except 'target') and the
# response (the 'target' column, kept as a one-column dataframe).
is_target = videogames_2016_d.columns == 'target'
x = videogames_2016_d.loc[:, ~is_target]
y = videogames_2016_d.loc[:, is_target]
# Hold out 40% of the rows as test set; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=34,
)

# Feature scaling

In [None]:
# Numeric predictors are standardised; dummy columns are passed through as-is.
# The column lists are built in dataframe order rather than via set arithmetic:
# set iteration order for strings changes between interpreter sessions, which
# made the feature order (and any table labelled with x_df's columns) vary per run.
scale = [col for col in videogames_2016.columns if col not in dummies and col != 'target']
unscale = [col for col in videogames_2016_d.columns if col not in scale and col != 'target']

# Fit the scaler on the training rows only and reuse its statistics for the test rows.
sc_x = StandardScaler()
scaled_vars_train = pd.DataFrame(sc_x.fit_transform(x_train[scale]), index=x_train.index, columns=scale)
unscaled_vars_train = x_train[unscale]
x_train = scaled_vars_train.join(unscaled_vars_train, how='inner')

scaled_vars_test = pd.DataFrame(sc_x.transform(x_test[scale]), index=x_test.index, columns=scale)
unscaled_vars_test = x_test[unscale]
x_test = scaled_vars_test.join(unscaled_vars_test, how='inner')

# Keep a labelled copy of the training features so the column names stay available.
x_df = x_train.copy()

# Creating numpy arrays for model usage

In [None]:
# Replace each train/test dataframe by the numpy array of a copy of itself.
x_train, y_train, x_test, y_test = (
    frame.copy().values for frame in (x_train, y_train, x_test, y_test)
)

# Baseline model: Linear regression

In [None]:
# Ordinary least squares on all features serves as the baseline.
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
y_pred = lin_reg.predict(x_test)

# R^2 on the data the model was fitted on and on the held-out rows.
r_sq = lin_reg.score(x_train, y_train)
r_sq_t = lin_reg.score(x_test, y_test)

print('Coefficient of determination: Training Set', r_sq)
print('Coefficient of determination: Test Set', r_sq_t)
print(
    'Root Mean Squared Error of baseline model on test set:',
    np.sqrt(mean_squared_error(y_test, y_pred)),
)

# Building regularized regression models using GLMNET

At first, the default level of alpha (1.0) is used, which means LASSO regression is performed. 100 different levels of Lambda are tried out; as Lambda increases, all coefficients of the original Ordinary Least Squares model (OLS, the benchmark model) are eventually shrunk to 0.

In [None]:
# Fit glmnet with its default settings on the training arrays.
fit = glmnet(x=x_train, y=y_train)

# Tall canvas and a larger font so the labelled coefficient paths stay readable.
plt.figure(figsize=(16, 22))
plt.rc('font', size=22)

# Coefficient paths against lambda, with the curves labelled.
glmnetPlot(fit, xvar='lambda', label=True)

# Fit a standard cross-validated GLMNET. (cross-validated LASSO) 

In [None]:
# Cross-validated glmnet fit on the training arrays with a gaussian family and
# mean squared error as the cross-validation measure. alpha is not passed, so the
# library default applies.
cvfit=cvglmnet(x=x_train,y=y_train,family='gaussian',ptype='mse')

In [None]:
# Silence warnings only while the cross-validation curve is drawn. catch_warnings
# restores the previous filter configuration on exit (also when plotting raises),
# instead of overwriting it with a blanket 'default' filter afterwards.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    plt.figure(figsize=(16,12))
    cvglmnetPlot(cvfit)

From the plot above, which level of Lambda generated the absolute lowest cross-validated error? This is the left dashed blue line.

In [None]:
# Read the value stored under 'lambda_min' in the cross-validation result and report it.
lambda_min=cvfit['lambda_min']
print(f'Out of the cross-validated GLMNET procedure the absolute lowest error is obtained with Lambda:{lambda_min}.')

In [None]:
# Read the value stored under 'lambda_1se' in the cross-validation result and report it.
lambda_1se=cvfit['lambda_1se']
print(f'''Out of the cross-validated GLMNET procedure the lowest error within 1 standard error is obtained with
      Lambda:{lambda_1se}.This value yields the most parsimonious model.''')

# Using grid search on cross-validated GLMNET procedures

Note that GLMNET does NOT search for values of alpha. A specific value should be supplied; otherwise the default alpha=1.0 is used. Users who also want to cross-validate alpha should call cv.glmnet with a pre-computed foldid vector and reuse that vector in separate calls to cv.glmnet with different levels of alpha.

In [None]:
# Elastic-net mixing values to compare, from 0.0 to 1.0 in steps of 0.1.
alpha = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# One fold label (0-9) per training row, so every alpha can be cross-validated on
# the same folds. np.random replaces the deprecated scipy.random alias, and the
# generator is seeded (34, the seed used for the train/test split) so the fold
# assignment is reproducible between runs.
foldid = np.random.RandomState(34).choice(10, size=y_train.shape[0], replace=True)

In [None]:
# One cross-validated glmnet per mixing value, keyed 'glm<value>' (e.g. 'glm0.5').
# All fits receive the same foldid vector.
models = {
    f'glm{mixing}': cvglmnet(
        x=x_train,
        y=y_train,
        foldid=foldid,
        family='gaussian',
        ptype='mse',
        alpha=mixing,
        parallel=True,
    )
    for mixing in alpha
}

# Plot training set performances

For the 3 most distinctive levels of alpha, plot all the levels of lambda (100), with the associated error rate from the grid.

In [None]:
# Cross-validated error against log(lambda) for three of the fitted models.
# np.log replaces scipy.log, a deprecated alias in the SciPy root namespace.
plt.figure(figsize=(16,12))
for model_key, line_colour in (('glm0.0', 'r'), ('glm0.5', 'g'), ('glm1.0', 'b')):
    plt.plot(np.log(models[model_key]['lambdau']), models[model_key]['cvm'], line_colour)
plt.xlabel('log(Lambda)')
# The y-axis label is the name stored in the cross-validation result.
plt.ylabel(models['glm1.0']['name'])
# Legend entries follow the plotting order above.
plt.legend(('alpha = 0 or RIDGE', 'alpha=0.5 or Elastic Net', 'alpha=1 or LASSO'), loc='upper left',prop={'size':12})

In [None]:
random.seed(34)

# Test-set predictions per model at s='lambda_1se': the first column of the
# prediction array is wrapped in a Series named 'Prediction'.
predictions = {
    model_name: pd.Series(
        cvglmnetPredict(model, newx=x_test, s='lambda_1se', ptype='link')[:, 0],
        name='Prediction',
    )
    for model_name, model in models.items()
}

In [None]:
random.seed(34)

# Test-set predictions per model at s='lambda_1se', kept as returned by
# cvglmnetPredict (no Series wrapping).
predictions = {
    model_name: cvglmnetPredict(model, newx=x_test, s='lambda_1se')
    for model_name, model in models.items()
}

In [None]:
# Display the cross-validation result stored under key 'glm0.0' (the alpha=0.0 fit).
models['glm0.0']

In [None]:
# Display the cross-validation result stored under key 'glm0.0' (the alpha=0.0 fit).
# NOTE(review): same expression as the preceding cell; one of the two looks redundant.
models['glm0.0']


In [None]:
# Display the test feature array that is passed as newx to the predictions.
x_test

In [None]:
# Mean absolute error on the test set per model: the first column of y_test is
# compared with that model's stored predictions.
accuracies = {
    model_name: mean_absolute_error(pd.Series(y_test[:, 0]), predictions[model_name])
    for model_name in models
}

In [None]:
# Root mean squared error on the test set per model: the first column of y_test
# is compared with that model's stored predictions.
accuracies = {
    model_name: sqrt(
        mean_squared_error(pd.Series(y_test[:, 0]), predictions[model_name])
    )
    for model_name in models
}

In [None]:
# Two-column table (model name, score) built from the accuracies dict, drawn as a
# bar chart with one bar per model.
rmse_table = pd.DataFrame(data=list(accuracies.items()), columns=['model_name', 'RMSE'])
rmse_table.plot.bar(x='model_name', y='RMSE', color='blue', figsize=(12, 6))

In [None]:
# Display the per-model score dictionary.
accuracies

In [None]:
# Pick the entry with the smallest score: best_alpha receives the model key
# (e.g. 'glm0.5') and best_RMSE the matching score.
best_alpha, best_RMSE = min(accuracies.items(), key=lambda entry: entry[1])
print(f'The best model is {best_alpha} and scores a RMSE of {round(best_RMSE,2)} on the test set')

# Show best model details

In [None]:
# Coefficients of the selected model. They are read at s='lambda_1se' because the
# test-set predictions that ranked the models were made at 'lambda_1se'; reading
# them at 'lambda_min' would describe a different model than the one scored.
best_coeffs = cvglmnetCoef(models[best_alpha], s='lambda_1se')
# The first entry of the coefficient array is taken as the intercept.
intercept = best_coeffs[0][0]
# Feature names in the column order of the training frame.
varsnames = list(x_df.columns.values)

In [None]:
# Table of (feature, beta) pairs: 'intercept' followed by the training columns.
# The coefficient array is flattened to plain Python floats in one step; calling
# float() on each size-1 row is deprecated in NumPy >= 1.25.
feature_names = ['intercept'] + list(x_df.columns)
beta_values = np.ravel(best_coeffs).tolist()
model_selected = pd.DataFrame(
    data=list(zip(feature_names, beta_values)),
    columns=['feature', 'beta'])
# Lift the row display limit so the full table is rendered.
pd.set_option('display.max_rows',len(model_selected))
# Show the rows ordered by absolute coefficient size, largest first.
model_selected.reindex(model_selected.beta.abs().sort_values(ascending=False).index)

In [None]:
# Restore pandas' default row display limit after it was raised for the coefficient table.
pd.reset_option('display.max_rows')