In [1]:
import pyspark as sp
from pyspark.sql.types import DoubleType, IntegerType

In [2]:
sp.SparkContext()

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("pyspark-shell") \
    .getOrCreate()

In [4]:
data = spark.read.csv('data/vgsales.csv',inferSchema=True, header=True)

In [5]:
data.count(), len(data.columns)

(16719, 16)

In [6]:
data.show(5)

+--------------------+--------+---------------+------------+---------+--------+--------+--------+-----------+------------+------------+------------+----------+----------+---------+------+
|                Name|Platform|Year_of_Release|       Genre|Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|Critic_Score|Critic_Count|User_Score|User_Count|Developer|Rating|
+--------------------+--------+---------------+------------+---------+--------+--------+--------+-----------+------------+------------+------------+----------+----------+---------+------+
|          Wii Sports|     Wii|           2006|      Sports| Nintendo|   41.36|   28.96|    3.77|       8.45|       82.53|          76|          51|         8|       322| Nintendo|     E|
|   Super Mario Bros.|     NES|           1985|    Platform| Nintendo|   29.08|    3.58|    6.81|       0.77|       40.24|        null|        null|      null|      null|     null|  null|
|      Mario Kart Wii|     Wii|           2008|      Racing|

In [7]:
data.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year_of_Release: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: double (nullable = true)
 |-- EU_Sales: double (nullable = true)
 |-- JP_Sales: double (nullable = true)
 |-- Other_Sales: double (nullable = true)
 |-- Global_Sales: double (nullable = true)
 |-- Critic_Score: integer (nullable = true)
 |-- Critic_Count: integer (nullable = true)
 |-- User_Score: string (nullable = true)
 |-- User_Count: integer (nullable = true)
 |-- Developer: string (nullable = true)
 |-- Rating: string (nullable = true)



In [8]:
data.select("Name","Platform","User_Score","User_Count").show(15, truncate=False)

+---------------------------+--------+----------+----------+
|Name                       |Platform|User_Score|User_Count|
+---------------------------+--------+----------+----------+
|Wii Sports                 |Wii     |8         |322       |
|Super Mario Bros.          |NES     |null      |null      |
|Mario Kart Wii             |Wii     |8.3       |709       |
|Wii Sports Resort          |Wii     |8         |192       |
|Pokemon Red/Pokemon Blue   |GB      |null      |null      |
|Tetris                     |GB      |null      |null      |
|New Super Mario Bros.      |DS      |8.5       |431       |
|Wii Play                   |Wii     |6.6       |129       |
|New Super Mario Bros. Wii  |Wii     |8.4       |594       |
|Duck Hunt                  |NES     |null      |null      |
|Nintendogs                 |DS      |null      |null      |
|Mario Kart DS              |DS      |8.6       |464       |
|Pokemon Gold/Pokemon Silver|GB      |null      |null      |
|Wii Fit                

In [9]:
data.describe(["User_Score","User_Count"]).show()

+-------+------------------+------------------+
|summary|        User_Score|        User_Count|
+-------+------------------+------------------+
|  count|             10015|              7590|
|   mean|7.1250461133070315|162.22990777338603|
| stddev|1.5000060936257986| 561.2823262473789|
|    min|                 0|                 4|
|    max|               tbd|             10665|
+-------+------------------+------------------+



There are some strings in the user_score "tbd"

In [10]:
data.groupBy("Platform") \
.count() \
.orderBy("count", ascending=False) \
.show(10)

+--------+-----+
|Platform|count|
+--------+-----+
|     PS2| 2161|
|      DS| 2152|
|     PS3| 1331|
|     Wii| 1320|
|    X360| 1262|
|     PSP| 1209|
|      PS| 1197|
|      PC|  974|
|      XB|  824|
|     GBA|  822|
+--------+-----+
only showing top 10 rows



Create new dataframe to remove Null values in User_Score

In [11]:
condition1 = (data.User_Score.isNotNull()) | (data.User_Count.isNotNull())
condition2 = data.User_Score != "tbd"
data2 = data.filter(condition1).filter(condition2)

#data2.show(15,False)

In [12]:
data.select("Name","Platform","User_Score","User_Count").filter(data.User_Score == "tbd").show(truncate=False)

+----------------------------------------+--------+----------+----------+
|Name                                    |Platform|User_Score|User_Count|
+----------------------------------------+--------+----------+----------+
|Zumba Fitness                           |Wii     |tbd       |null      |
|Namco Museum: 50th Anniversary          |PS2     |tbd       |null      |
|Zumba Fitness 2                         |Wii     |tbd       |null      |
|uDraw Studio                            |Wii     |tbd       |null      |
|Frogger's Adventures: Temple of the Frog|GBA     |tbd       |null      |
|Just Dance Kids                         |Wii     |tbd       |null      |
|Dance Dance Revolution X2               |PS2     |tbd       |null      |
|The Incredibles                         |GBA     |tbd       |null      |
|Who wants to be a millionaire           |PC      |tbd       |null      |
|Tetris Worlds                           |GBA     |tbd       |null      |
|Imagine: Teacher                     

In [13]:
data2.select("Name","Platform","User_Score","User_Count").describe(["User_Score","User_Count"]).show()

+-------+------------------+------------------+
|summary|        User_Score|        User_Count|
+-------+------------------+------------------+
|  count|              7590|              7590|
|   mean|7.1250461133070315|162.22990777338603|
| stddev|1.5000060936257986| 561.2823262473789|
|    min|                 0|                 4|
|    max|               9.7|             10665|
+-------+------------------+------------------+



In [14]:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType
data2 = data2.withColumn("Year_of_Release", data2["Year_of_Release"].cast(DoubleType()))
data2 = data2.withColumn("User_Score", data2["User_Score"].cast(DoubleType()))
data2 = data2.withColumn("User_Count", data2["User_Count"].cast(DoubleType()))
data2 = data2.withColumn("Critic_Score", data2["Critic_Score"].cast(DoubleType()))

data2 = data2.filter(data2.Year_of_Release. isNotNull())
data2 = data2.filter(data2.User_Score. isNotNull())
data2 = data2.filter(data2.User_Count. isNotNull())
data2 = data2.filter(data2.Critic_Score. isNotNull())

data2.dtypes

[('Name', 'string'),
 ('Platform', 'string'),
 ('Year_of_Release', 'double'),
 ('Genre', 'string'),
 ('Publisher', 'string'),
 ('NA_Sales', 'double'),
 ('EU_Sales', 'double'),
 ('JP_Sales', 'double'),
 ('Other_Sales', 'double'),
 ('Global_Sales', 'double'),
 ('Critic_Score', 'double'),
 ('Critic_Count', 'int'),
 ('User_Score', 'double'),
 ('User_Count', 'double'),
 ('Developer', 'string'),
 ('Rating', 'string')]

## Build a model

For an example of **linear regression**, let’s see if we can predict User_Score from Year_of_Release, Global_Sales, Critic_Score, and User_Count.

First let’s recode all of our predictors to be Doubles (I found that this got rid of some really gnarly errors later on).

#### VectorAssembler
The next step is to get our data into a form that PySpark can create a model with. To do this we use something called a VectorAssembler.

In [15]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
#Input all the features in one vector column
assembler = VectorAssembler(inputCols=['Year_of_Release', 'Global_Sales', 'Critic_Score', 'User_Count'], outputCol = 'predictors')
output = assembler.transform(data2)
#Input vs Output
finalized_data = output.select("predictors","User_Score")
finalized_data.show(5)

+--------------------+----------+
|          predictors|User_Score|
+--------------------+----------+
|[2006.0,82.53,76....|       8.0|
|[2008.0,35.52,82....|       8.3|
|[2009.0,32.77,80....|       8.0|
|[2006.0,29.8,89.0...|       8.5|
|[2006.0,28.92,58....|       6.6|
+--------------------+----------+
only showing top 5 rows



Here we’ve delineated what features we want our model to use as predictors so that VectorAssembler can take those columns and transform them into a single column (named “predictors”) that contains all the data we want to predict with.

In [16]:
predictors = assembler.transform(data2)

predictors.columns

['Name',
 'Platform',
 'Year_of_Release',
 'Genre',
 'Publisher',
 'NA_Sales',
 'EU_Sales',
 'JP_Sales',
 'Other_Sales',
 'Global_Sales',
 'Critic_Score',
 'Critic_Count',
 'User_Score',
 'User_Count',
 'Developer',
 'Rating',
 'predictors']

What VectorAssembler.transform() does is create a new DataFrame with a new column at the end where each row contains a list of all the features we included in the inputCols parameter when we created the assembler.

The final step to getting our data ready to be used in a model is to collect the new predictions column we just made and User_Score (our target variable) by themselves in a DataFrame.

In [17]:
model_data = predictors.select("predictors", "User_Score")

model_data.show(5,truncate=False)

+-------------------------+----------+
|predictors               |User_Score|
+-------------------------+----------+
|[2006.0,82.53,76.0,322.0]|8.0       |
|[2008.0,35.52,82.0,709.0]|8.3       |
|[2009.0,32.77,80.0,192.0]|8.0       |
|[2006.0,29.8,89.0,431.0] |8.5       |
|[2006.0,28.92,58.0,129.0]|6.6       |
+-------------------------+----------+
only showing top 5 rows



In [18]:
from pyspark.ml.regression import LinearRegression
#Split training and testing data
train_data,test_data = finalized_data.randomSplit([0.8,0.2])

lr = LinearRegression(
    featuresCol = 'predictors', 
    labelCol = 'User_Score')

lrModel = lr.fit(train_data)

pred = lrModel.evaluate(test_data)

pred.predictions.show(5)

+--------------------+----------+-----------------+
|          predictors|User_Score|       prediction|
+--------------------+----------+-----------------+
|[1996.0,4.63,91.0...|       8.6|9.286464320388404|
|[1997.0,0.42,91.0...|       7.8|9.317232362497577|
|[1997.0,0.5,66.0,...|       8.2|7.749412368532717|
|[1997.0,0.89,83.0...|       8.2|8.804989628478978|
|[1997.0,9.72,92.0...|       9.2| 8.95105850934172|
+--------------------+----------+-----------------+
only showing top 5 rows



### Model evaluation

In [19]:
from pyspark.ml.evaluation import RegressionEvaluator

eval = RegressionEvaluator(
    labelCol="User_Score", 
    predictionCol="prediction", 
    metricName="rmse")

In [20]:
rmse = eval.evaluate(pred.predictions)
mse = eval.evaluate(pred.predictions, {eval.metricName: "mse"})
mae = eval.evaluate(pred.predictions, {eval.metricName: "mae"})
r2 = eval.evaluate(pred.predictions, {eval.metricName: "r2"})

print("RMSE: " + str(rmse))
print("MSE: " + str(mse))
print("MAE: " + str(mae))
print("R squared: " + str(r2))


RMSE: 1.098788618976735
MSE: 1.2073364291928006
MAE: 0.8338376547660356
R squared: 0.4267778814047075


In [21]:
train_data.describe().show()

+-------+------------------+
|summary|        User_Score|
+-------+------------------+
|  count|              5466|
|   mean| 7.190998902305174|
| stddev|1.4367173844624845|
|    min|               0.5|
|    max|               9.6|
+-------+------------------+



This model is accounting for about 40% of the variation in the data. Can we do better?

In [22]:
lr_predictions = lrModel.transform(test_data)
lr_predictions.select("prediction","User_Score","predictors").show(5)

lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="User_Score",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+-----------------+----------+--------------------+
|       prediction|User_Score|          predictors|
+-----------------+----------+--------------------+
|9.286464320388404|       8.6|[1996.0,4.63,91.0...|
|9.317232362497577|       7.8|[1997.0,0.42,91.0...|
|7.749412368532717|       8.2|[1997.0,0.5,66.0,...|
|8.804989628478978|       8.2|[1997.0,0.89,83.0...|
| 8.95105850934172|       9.2|[1997.0,9.72,92.0...|
+-----------------+----------+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.426778


In [23]:
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)


Coefficients: [-0.07532379723839125,-0.018782217809384848,0.0626767132859347,-0.00020013853607444478]
Intercept: 154.0455647324577
numIterations: 1
objectiveHistory: [0.0]
+--------------------+
|           residuals|
+--------------------+
| -2.4213871994015337|
|  -5.197568619369082|
| -0.7680689855627811|
| -1.8499522850728285|
| -1.4978024441876467|
| -0.8195014550468507|
| 0.14077497726094634|
|  -0.690895011526397|
| -2.0198545590091666|
| -0.2547593031156339|
|-0.24926886617149613|
| -0.7732866875882793|
|-0.00161339990555...|
| -0.6931675646325566|
| -0.9011192608868779|
| 0.09104097285197099|
|0.042627029550283524|
|-0.23043285570268424|
| -0.5420660946358176|
|  0.1691145270658918|
+--------------------+
only showing top 20 rows

RMSE: 1.113400
r2: 0.399326


### Model Evaluation

In [24]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(
    labelCol="User_Score", 
    predictionCol="prediction", 
    metricName="rmse")
# Root Mean Square Error
rmse = eval.evaluate(pred.predictions)
print("RMSE: %.3f" % rmse)
# Mean Square Error
mse = eval.evaluate(pred.predictions, {eval.metricName: "mse"})
print("MSE: %.3f" % mse)
# Mean Absolute Error
mae = eval.evaluate(pred.predictions, {eval.metricName: "mae"})
print("MAE: %.3f" % mae)
# r2 - coefficient of determination
r2 = eval.evaluate(pred.predictions, {eval.metricName: "r2"})
print("r2: %.3f" %r2)

RMSE: 1.099
MSE: 1.207
MAE: 0.834
r2: 0.427


---

### Gradient-boosted tree regression

In [25]:
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor(featuresCol = 'predictors', labelCol = 'User_Score', maxIter=10)
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)
gbt_predictions.select('prediction', 'User_Score', 'predictors').show(5)

+-----------------+----------+--------------------+
|       prediction|User_Score|          predictors|
+-----------------+----------+--------------------+
|8.646823334993483|       8.6|[1996.0,4.63,91.0...|
|8.072095189472904|       7.8|[1997.0,0.42,91.0...|
|7.209238675790539|       8.2|[1997.0,0.5,66.0,...|
|8.343811832861526|       8.2|[1997.0,0.89,83.0...|
|8.595960222631577|       9.2|[1997.0,9.72,92.0...|
+-----------------+----------+--------------------+
only showing top 5 rows



In [26]:
gbt_evaluator = RegressionEvaluator(
    labelCol="User_Score", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbt_evaluator = RegressionEvaluator(
    labelCol="User_Score", predictionCol="prediction", metricName="r2")
rsquared = gbt_evaluator.evaluate(gbt_predictions)
print("R squared on test data = %g" % rsquared)

Root Mean Squared Error (RMSE) on test data = 1.10039
R squared on test data = 0.425106


In [27]:
gbt_predictions = gbt_model.transform(test_data)
lr_predictions.select("prediction","User_Score","predictors").show(5)

from pyspark.ml.evaluation import RegressionEvaluator
gbt_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="User_Score",metricName="r2")
print("R Squared (R2) on test data = %g" % gbt_evaluator.evaluate(gbt_predictions))

+-----------------+----------+--------------------+
|       prediction|User_Score|          predictors|
+-----------------+----------+--------------------+
|9.286464320388404|       8.6|[1996.0,4.63,91.0...|
|9.317232362497577|       7.8|[1997.0,0.42,91.0...|
|7.749412368532717|       8.2|[1997.0,0.5,66.0,...|
|8.804989628478978|       8.2|[1997.0,0.89,83.0...|
| 8.95105850934172|       9.2|[1997.0,9.72,92.0...|
+-----------------+----------+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.425106


---