## Setup and data read

In [None]:
# Imports and parameter setting
from pyspark.sql import SparkSession,  Row
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn import ensemble

import pandas as pd
import numpy as np
import cPickle as pickle

# Let pandas display up to 100 columns of a data frame
pd.set_option("display.max_columns", 100)

# Path of the input CSV file
inputPath = '/home/jovyan/work/data/autot4.7.csv'

# Get or create the Spark session, with 5G of memory configured for both
# the driver and the executors
session = (
    SparkSession.builder
    .appName("Car data")
    .config('spark.driver.memory', '5G')
    .config('spark.executor.memory', '5G')
    .getOrCreate()
)
    


In [None]:
# Read the input data into a Spark data frame.
# The file is semicolon-delimited, its first line is the header, and
# malformed rows are dropped instead of failing the read.
# No explicit .format() is set: .csv() selects the CSV data source itself,
# so the former "org.apache.spark.csv" format name was never used.
inDf = (
    session.read
    .option("header", "true")
    .option("delimiter", ";")
    .option("mode", "DROPMALFORMED")
    .csv(inputPath)
)

# Replace every '.' in the column names with '_'. No replacement limit is
# given, so names containing more than five dots are fully cleaned as well.
newColnames = [colName.replace('.', '_') for colName in inDf.columns]
inDf = inDf.toDF(*newColnames)

In [None]:
# Print the column names and types of the raw input data frame
inDf.printSchema()

In [None]:
# Show the first row of the raw input as a dict (column name -> value)
inDf.first().asDict()

In [None]:
# Variable names grouped by type; string-valued variables are called 'factors'
factorVars = [
    'ajoneuvoluokka',
    'ajoneuvonkaytto',
    'ajoneuvoryhma',
    'korityyppi',
    'ohjaamotyyppi',
    'kayttovoima',
    'istumapaikkojenLkm',
    'sylintereidenLkm',
    'vaihteisto',
    'alue',
    'kunta',
    'merkki',
    'malli',
    'merkki_l_malli',
    'kayttoonotto_pvm_imputoitu',
]
numericVars = [
    'omamassa',
    'iskutilavuus',
    'suurinNettoteho',
    'matkamittarilukema',
    'kayttoonottoVuosi',
    'ensirekVuosi',
]
dateVars = [
    'ensirekisterointipvm',
    'kayttoonottopvm',
    'max_date',
    'kayttoonotto',
]

# Select a subset of columns and set their types: factors are selected
# without a cast, numeric variables are cast to int and date variables to
# timestamp. The column order is factors, then numerics, then dates.
carsDf = inDf.select(
    *(
        factorVars
        + [inDf[name].cast("int") for name in numericVars]
        + [inDf[name].cast("timestamp") for name in dateVars]
    )
)


In [None]:
# Print the column names and types of the selected and cast columns
carsDf.printSchema()

In [None]:
# Show the first row of the selected columns as a dict (column name -> value)
carsDf.first().asDict()

In [None]:
# The prediction target is mileage (matkamittarilukema). Keep only the rows
# whose value is possible, i.e. between 1 and 1e6, and persist the filtered
# set in memory.
hasPossibleMileage = carsDf['matkamittarilukema'].between(1, 1e6)
carsDf = carsDf.filter(hasPossibleMileage).cache()

In [None]:
# Calculate new columns from the original ones.
# Each new name is added to numericVars only if it is not already there, so
# re-running this cell does not duplicate entries (and with them the
# per-variable plots that loop over numericVars).

# usageDays: difference between max_date and kayttoonottopvm, both cast to
# long and divided by the number of seconds in a day (24 * 3600)
carsDf = carsDf.withColumn(
    'usageDays',
    (carsDf['max_date'].cast('long') - carsDf['kayttoonottopvm'].cast('long')) / (24.0 * 3600.0)
)
if 'usageDays' not in numericVars:
    numericVars.append('usageDays')

# mileagePerDay: mileage (matkamittarilukema) divided by usageDays
# NOTE(review): rows where usageDays is zero or negative are not filtered
# out here - confirm how they should be handled.
carsDf = carsDf.withColumn(
    'mileagePerDay',
    carsDf['matkamittarilukema'].cast('float') / carsDf['usageDays']
)
if 'mileagePerDay' not in numericVars:
    numericVars.append('mileagePerDay')

## Exploration

### Pro tip
Write `?` after a function or method name and run the cell to see its documentation:
```python 
carsDf.sample?
```
    

In [None]:
# Draw a 0.5 % sample without replacement and convert it to a pandas
# data frame for plotting and statistics
sparkSample = carsDf.sample(withReplacement=False, fraction=0.005)
sample = sparkSample.toPandas()

### Print samples & stats

In [None]:
# Show the first rows of the sample
sample.head()

In [None]:
# Show the first values of the kayttoonotto column
sample.kayttoonotto.head()

In [None]:
# Summary statistics of the numeric columns
sample.describe(include=[np.number])

In [None]:
# Summary statistics of the object-typed columns
sample.describe(include=[object])

In [None]:
# Summary statistics of the datetime columns
sample.describe(include=[np.datetime64])

In [None]:
# Median, mean and standard deviation of mileagePerDay for each value of
# ajoneuvonkaytto
mileageByUsage = sample.groupby('ajoneuvonkaytto')['mileagePerDay']
mileageByUsage.agg([np.median, np.mean, np.std])

### Some plots

In [None]:
# One horizontal count plot per factor variable, stacked in a single figure
# that is 5 units tall per variable
fig, axes = plt.subplots(len(factorVars), figsize=(15, len(factorVars) * 5))
for ax, factor in zip(axes, factorVars):
    sns.countplot(y=factor, data=sample, ax=ax)

In [None]:
# Horizontal box plots of mileagePerDay for every level of each factor
# variable, one subplot per factor
fig, axes = plt.subplots(len(factorVars), figsize=(15, len(factorVars) * 5))
for ax, factor in zip(axes, factorVars):
    sns.boxplot(y=factor, x='mileagePerDay', orient='h', data=sample, ax=ax)


In [None]:
# Box plots of mileagePerDay by factor level, one subplot per factor.
# NOTE(review): this cell draws the same plots as the previous cell - confirm
# whether a different x variable was intended here.
nFactors = len(factorVars)
fig, axes = plt.subplots(nFactors, figsize=(15, nFactors * 5))
for position, factor in enumerate(factorVars):
    sns.boxplot(
        y=factor,
        x='mileagePerDay',
        orient='h',
        data=sample,
        ax=axes[position],
    )


In [None]:
# Distribution plot of each numeric variable, each in its own figure;
# missing values are dropped before plotting
for numericVar in numericVars:
    plt.subplots()
    observed = sample[numericVar].dropna()
    sns.distplot(observed)
    plt.show()

In [None]:
# Scatter plot with a fitted linear regression of mileagePerDay against each
# numeric variable. numericVars also contains mileagePerDay itself; that one
# is skipped because regressing the target on itself only draws the
# identity line.
target = 'mileagePerDay'
for c in numericVars:
    if c == target:
        continue
    sns.lmplot(x=c, y=target, data=sample, robust=False)
    plt.show()


In [None]:
# Histogram with 100 bins of each date variable, each in its own figure
# titled with the variable name
for dateVar in dateVars:
    plt.subplots()
    sample[dateVar].hist(bins=100, alpha=0.7)
    plt.suptitle(dateVar)

In [None]:
# Joint plot of usageDays against mileage (matkamittarilukema) with a
# regression fit (kind='reg')
sns.jointplot(x='usageDays', y='matkamittarilukema', data=sample, kind='reg')