## Setup and data read

In [None]:
# Imports and parameter setting
from pyspark.sql import SparkSession,  Row
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.linear_model import ElasticNet, ElasticNetCV
import pandas as pd
import numpy as np
import cPickle as pickle

# Let pandas print up to 100 columns before truncating the output
pd.set_option("display.max_columns", 100)

# Path of the CSV file holding the car data
inputPath = '/home/jovyan/work/data/autot4.7.csv'

# Create (or reuse) a Spark session named "Car data" with 5G of memory
# configured for both the driver and the executors
session = (
    SparkSession.builder
    .appName("Car data")
    .config('spark.driver.memory', '5G')
    .config('spark.executor.memory', '5G')
    .getOrCreate()
)
    


In [None]:
# Read the semicolon-delimited input file (with a header row) into a Spark
# data frame. Rows that cannot be parsed are dropped (DROPMALFORMED).
# The .csv() call already selects the CSV data source, so no separate
# .format(...) call is needed.
inDf = (
    session.read
    .option("header", "true")
    .option("delimiter", ";")
    .option("mode", "DROPMALFORMED")
    .csv(inputPath)
)

# Replace every '.' in the column names with '_' (no cap on the number of
# replacements, so names with many dots are handled as well)
newColnames = [col.replace('.', '_') for col in inDf.columns]
inDf = inDf.toDF(*newColnames)

In [None]:
# Print the column names and types of the raw input data frame
inDf.printSchema()

In [None]:
# Show the first row of the raw data as a dict (column name -> value)
inDf.first().asDict()

In [None]:
# Lists of the variables used, grouped by type.
# String-typed variables are called 'factors'; they are selected as they are.
factorVars = [
    'ajoneuvoluokka',
    'ajoneuvonkaytto',
    'korityyppi',
    'ohjaamotyyppi',
    'kayttovoima',
    'istumapaikkojenLkm',
    'sylintereidenLkm',
    'vaihteisto',
    'alue',
    'kunta',
    'merkki',
    'malli',
    'kayttoonotto_pvm_imputoitu',
]
# Variables that are cast to integers
numericVars = [
    'omamassa',
    'iskutilavuus',
    'suurinNettoteho',
    'matkamittarilukema',
    'kayttoonottoVuosi',
    'ensirekVuosi',
]
# Variables that are cast to timestamps
dateVars = [
    'ensirekisterointipvm',
    'kayttoonottopvm',
    'max_date',
    'kayttoonotto',
]

# Select the subset of columns and set their types: factors first,
# then the integer columns, then the timestamp columns
intColumns = [inDf[name].cast("int") for name in numericVars]
timestampColumns = [inDf[name].cast("timestamp") for name in dateVars]
carsDf = inDf.select(*(factorVars + intColumns + timestampColumns))


In [None]:
# Print the schema of the selected columns to check the casted types
carsDf.printSchema()

In [None]:
# Show the first row of the selected data as a dict (column name -> value)
carsDf.first().asDict()

In [None]:
# The prediction target is mileage (matkamittarilukema): keep only rows whose
# mileage lies within 1 .. 1e6 (both ends included) and cache the result,
# since it is reused by the following cells
validMileage = carsDf['matkamittarilukema'].between(1, 1e6)
carsDf = carsDf.where(validMileage).cache()

In [None]:
# Calculate a new column from the originals: the number of days between
# 'kayttoonottopvm' and 'max_date'. This is probably relevant for mileage.
# Both timestamps are cast to long and the difference is divided by the
# number of seconds in a day.
secondsPerDay = 24.0 * 3600.0
carsDf = carsDf.withColumn(
    'usageDays',
    (carsDf['max_date'].cast('long') - carsDf['kayttoonottopvm'].cast('long')) / secondsPerDay
)

# Register the new column as a numeric variable. The check keeps the list
# free of duplicates when this cell is run more than once.
if 'usageDays' not in numericVars:
    numericVars.append('usageDays')

## Exploration

### Pro tip
Write `?` after a function or method name and run the cell to see its documentation:
```python 
carsDf.sample?
```
    

In [None]:
# Draw a 0.5 % random sample (without replacement) and bring it into a
# pandas data frame for plotting and statistics
sample = carsDf.sample(withReplacement=False, fraction=0.005).toPandas()

### Print samples & stats

In [None]:
# Show the first rows of the pandas sample
sample.head()

In [None]:
# Show the first values of the 'kayttoonotto' column
sample.kayttoonotto.head()

In [None]:
# Summary statistics of the numeric columns
sample.describe(include=[np.number])

In [None]:
# Summary statistics of the object-typed columns
sample.describe(include=[object])

In [None]:
# Summary statistics of the datetime columns
sample.describe(include=[np.datetime64])

In [None]:
# Median, mean and standard deviation of mileage (matkamittarilukema)
# for each value of 'ajoneuvonkaytto'
sample.groupby('ajoneuvonkaytto')['matkamittarilukema'].agg([np.median, np.mean, np.std])

### Some plots

In [None]:
# One bar chart per factor variable, showing how often each value occurs
for c in factorVars:
    fig, axes = plt.subplots()
    valueCounts = sample[c].value_counts()
    valueCounts.plot(kind='bar', alpha=0.7, ax=axes)
    # title the figure with the variable name
    fig.suptitle(c)

In [None]:
# One 20-bin histogram per numeric variable
for c in numericVars:
    fig, axes = plt.subplots()
    numericColumn = sample[c]
    numericColumn.hist(bins=20, alpha=0.7, ax=axes)
    # title the figure with the variable name
    fig.suptitle(c)

In [None]:
# One 100-bin histogram per date variable.
# The histogram is drawn explicitly on the axes created for it, in the same
# way as in the plots of the factor and numeric variables.
for c in dateVars:
    fig, axes = plt.subplots()
    sample[c].hist(ax=axes, bins=100)
    plt.suptitle(c)

In [None]:
# Scatter plot of mileage (matkamittarilukema) against the usage time in days
sample.plot.scatter(x='usageDays', y='matkamittarilukema')

## Preprocess for modeling

In [None]:
# Split the data randomly into training (20%), validation (10%) and
# rest (70%) sets, using a fixed seed.
# The shares are chosen for convenience: 20% of the data is enough to fit
# the model.
splits = carsDf.randomSplit(weights=[0.2, 0.1, 0.7], seed=220274)

In [None]:
# Get the modelling data into a pandas data frame: the first split
# (the one created with weight 0.2) is converted with toPandas()
modelDf = splits[0].toPandas()

In [None]:
# Numeric and factor type variables are treated differently.

# First pick the numerical variables into the training data. An explicit
# copy is taken so that removing the target column below modifies an
# independent frame rather than a selection taken from modelDf.
trainDf = modelDf[numericVars].copy()

# Move the target variable (mileage) out of the features into its own vector
target = trainDf.pop('matkamittarilukema')

# The numerical data has missing values: replace them with the average of
# the variable in question.
# Good idea: make an additional variable denoting that the value was missing.
# NOTE(review): newer scikit-learn releases replace Imputer with
# sklearn.impute.SimpleImputer - confirm which version this notebook targets.
imputer = Imputer()
XImputed = imputer.fit_transform(trainDf)

# Scale the numerical variables to zero mean and unit variance
scaler = StandardScaler()
XScaled = scaler.fit_transform(XImputed)

# Put the model fitting data into a pandas data frame. The index of the
# training data is reused so that columns added later from modelDf are
# aligned row by row.
X = pd.DataFrame(XScaled, columns=trainDf.columns, index=trainDf.index)
X.head()

In [None]:
# Factor variables enter the model through dummy variable encoding.
# Only one such variable is added here: 'ajoneuvonkaytto'.
usageColumn = modelDf['ajoneuvonkaytto']
tmp = pd.get_dummies(usageColumn, prefix='ajoneuvonkaytto')

# Add the dummy variables to the fitting data, one column at a time
for dummyName in tmp.columns:
    X[dummyName] = tmp[dummyName]

X.head()

### Model fitting
Fit a linear model to the data. The fitting is done with the elastic-net algorithm:
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet.fit

In [None]:
# Fit an elastic-net model with the default parameters
# (for now; the defaults are usually not enough)
enet = ElasticNet().fit(X, target)

# Show the fitted coefficients labelled with the feature names
pd.Series(data=enet.coef_, index=X.columns)

In [None]:
# Plot the true values against the predictions made for the training data
predictions = enet.predict(X)
plotDf = pd.DataFrame({'prediction': predictions, 'true_value': target})

# End points of the reference line y = x, from 0 to the largest prediction
zz = np.array([0, plotDf['prediction'].max()])

plotDf.plot.scatter(x='prediction', y='true_value')
# red line marking where the prediction equals the true value
plt.plot(zz, zz, 'r-')
plt.show()

### Save the fitted model and other relevant data

In [None]:
# Collect the fitted model together with the preprocessing objects and the
# variable lists needed to prepare new data in the same way
savedObjects = {
    'model': enet,
    'scaler': scaler,
    'imputer': imputer,
    'trainColumns': X.columns,
    'factorVars': factorVars,
    'numericVars': numericVars,
    'dateVars': dateVars
}

# Write everything into one pickle file. The with-block makes sure the file
# is flushed and closed even if pickling fails.
with open('enet.pyobj', 'wb') as outFile:
    pickle.dump(savedObjects, outFile)