In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Let pandas render up to 1000 rows and 1000 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

In [None]:
df = pd.read_csv(r"/kaggle/input/car-price-prediction/CarPrice_Assignment.csv")

In [None]:
df.head(2)

In [None]:
# Remove the row id and the free-text car name.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# raises a TypeError on pandas >= 2.0; rebinding instead of `inplace=True`
# keeps the step explicit.
df = df.drop(columns=['car_ID', 'CarName'])

In [None]:
# (rows, columns) of the frame after car_ID and CarName were dropped
df.shape

In [None]:
# first five rows
df.head()

In [None]:
# per-column dtype and non-null count
df.info()

### a quick look tells that doornumber and cylindernumber can be changed to numerical data

In [None]:
df.doornumber.value_counts()

In [None]:
df.loc[:, 'doornumber'] = df.loc[:, 'doornumber'].map({'four':4, 'two':2})

In [None]:
df.head()

In [None]:
df.cylindernumber.value_counts()

In [None]:
df.loc[:, 'cylindernumber'] = df.loc[:, 'cylindernumber'].map({'four':4, 'six':6, 'five':5, 'twelve':12, 'eight':8, 'three':3, 'two':2})

In [None]:
df.head(2)

In [None]:
df.info()

### exploring each object column

In [None]:
df.fueltype.value_counts()

In [None]:
df.aspiration.value_counts()

In [None]:
df.enginetype.value_counts()

In [None]:
df.drivewheel.value_counts()

In [None]:
df.enginelocation.value_counts()

In [None]:
df.enginetype.value_counts()

In [None]:
df.fuelsystem.value_counts()

## Label encoding of the categorical columns (integer codes, not one-hot)

In [None]:
catg_cols = df.select_dtypes(include='object')

In [None]:
catg_cols

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every string column with integer codes.
# NOTE: LabelEncoder numbers the labels in sorted order, so the codes carry an
# arbitrary ordering that the categories themselves do not have. This is not
# equivalent to one-hot encoding (pd.get_dummies).
le = LabelEncoder()

# Iterate over the column names explicitly; `catg_cols` is a DataFrame.
for col in catg_cols.columns:
    # Plain column assignment replaces the column so the encoded values get an
    # integer dtype. `df.loc[:, col] = ...` writes into the existing
    # object-dtype column on pandas >= 2.0 and would keep the dtype as object.
    df[col] = le.fit_transform(df[col])

In [None]:
df.head()

In [None]:
df.fuelsystem.value_counts()   
# previously these were strings/ categorical columns                                
                                # mpfi    1bbl
                                # idi     bbl    
                                # spdi    bbl    
                                # spfi    
                                # mfi 

In [None]:
df.enginetype.value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr = df.corr()

fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr, annot=True, cmap='RdYlGn', ax=ax)

### dropping columns having no correlation with price

In [None]:
# Remove the columns judged (from the heatmap) to be uncorrelated with price.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# raises a TypeError on pandas >= 2.0.
uncorrelated_columns = [
    'symboling', 'doornumber', 'carbody', 'enginetype',
    'stroke', 'compressionratio', 'peakrpm',
]
df = df.drop(columns=uncorrelated_columns)

### IMPORTANT - looking for missing values

In [None]:
# Number of missing values in each column.
df.isna().sum()

### splitting data

In [None]:
# Separate the features from the regression target.
# (`columns=` because a positional axis raises a TypeError on pandas >= 2.0.)
data = df.drop(columns='price')
targets = df['price']

# Shuffle once with a fixed seed before slicing. A sequential split of the
# file's stored order can leave a rare category out of the training rows
# altogether, which makes that feature constant there (std == 0).
SPLIT_SEED = 42
shuffled_index = data.sample(frac=1, random_state=SPLIT_SEED).index
data = data.loc[shuffled_index].reset_index(drop=True)
targets = targets.loc[shuffled_index].reset_index(drop=True)

# Split boundaries derived from the row count: 50% train, 20% validation and
# the remainder (~30%) test. For 205 rows: 102 / 41 / 62.
# iloc end positions are exclusive, so train is rows [0, train_end).
n_rows = len(data)
train_end = n_rows // 2
validation_end = train_end + n_rows // 5

# .copy() so later arithmetic on a split never writes into a slice of `data`.
train_data = data.iloc[:train_end].copy()
train_targets = targets.iloc[:train_end].copy()

validation_data = data.iloc[train_end:validation_end].copy()
validation_targets = targets.iloc[train_end:validation_end].copy()

test_data = data.iloc[validation_end:].copy()
test_targets = targets.iloc[validation_end:].copy()

# The three splits must cover every row exactly once.
assert len(train_data) + len(validation_data) + len(test_data) == n_rows

# cross checking
print('Data, Targets', end='\n\n')
split_report = [
    ('TRAIN', train_data, train_targets),
    ('VALIDATION', validation_data, validation_targets),
    ('TEST', test_data, test_targets),
]
for position, (split_name, split_features, split_targets) in enumerate(split_report):
    if position:
        print('#'*15)  # separator between consecutive splits
    print(split_name)
    print(split_features.shape)
    print(split_targets.shape)

In [None]:
# Missing values per column in each feature split, with a rule after each report.
for feature_split in (train_data, validation_data, test_data):
    print(feature_split.isnull().sum())
    print('-'*25)

### Normalization

In [None]:
# (rows, columns) of the full frame, shown for reference before normalising
df.shape

In [None]:
# Standardise every split with statistics taken from the training rows only,
# so no information from validation/test leaks into the scaling.
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)

# A feature that is constant in the training rows has std == 0. Dividing by it
# yields NaN (0/0) for rows equal to the mean and +/-inf (x/0) for any other
# row, and either one turns the training loss into NaN. Scale such features
# by 1 instead: they are centred but left unscaled.
std = std.where(std > 0, 1.0)

# Build new frames rather than using `-=` / `/=`, which would write into
# slices of the original feature frame.
train_data = (train_data - mean) / std
validation_data = (validation_data - mean) / std
test_data = (test_data - mean) / std

sanity check

In [None]:
# Shapes of the three feature splits: train, validation, test.
for feature_split in (train_data, validation_data, test_data):
    print(feature_split.shape)

In [None]:
# Missing values per column in each feature split after scaling.
for feature_split in (train_data, validation_data, test_data):
    print(feature_split.isnull().sum())
    print('-'*25)

#### ^ found the culprit

In [None]:
# 0 occurs 202 times and 1 occurs 3 times, so the NaNs are imputed with 0.
df['enginelocation'].value_counts()

In [None]:
# Replace every non-finite value with 0.
# Division by a zero standard deviation produces +/-inf as well as NaN, and
# fillna() only touches NaN, so the infinities are converted to NaN first.
# An inf left in the features makes the training loss NaN.
non_finite = [np.inf, -np.inf]

train_data = train_data.replace(non_finite, np.nan).fillna(0)

validation_data = validation_data.replace(non_finite, np.nan).fillna(0)

test_data = test_data.replace(non_finite, np.nan).fillna(0)

In [None]:
# Missing values per column in each feature split after imputation.
for feature_split in (train_data, validation_data, test_data):
    print(feature_split.isnull().sum())
    print('-'*25)

In [None]:
# Missing-value count in each target vector, with a rule after each count.
for target_split in (train_targets, validation_targets, test_targets):
    print(target_split.isnull().sum())
    print('-'*25)

## still nan values in validation

In [None]:
# Remove enginelocation from ALL three splits. The network's input width is
# taken from train_data, so train, validation and test must keep identical
# columns; dropping it from only two of them makes the widths disagree.
# `columns=` replaces the positional axis (TypeError on pandas >= 2.0), and
# errors='ignore' keeps the cell safe to re-run once the column is gone.
train_data = train_data.drop(columns='enginelocation', errors='ignore')
validation_data = validation_data.drop(columns='enginelocation', errors='ignore')
test_data = test_data.drop(columns='enginelocation', errors='ignore')

assert list(train_data.columns) == list(validation_data.columns) == list(test_data.columns)

In [None]:
# Shapes of the three feature splits: train, validation, test.
for feature_split in (train_data, validation_data, test_data):
    print(feature_split.shape)

In [None]:
# BUILDING BASE LINE MODEL
from keras import layers
from keras import models

# Baseline regressor: three ReLU hidden layers (10, 8, 6 units) followed by a
# single linear output unit. The input width is the number of feature columns.
model = models.Sequential([
    layers.Dense(10, activation='relu', input_shape=(train_data.shape[1],)),
    layers.Dense(8, activation='relu'),
    layers.Dense(6, activation='relu'),
    layers.Dense(1),
])

In [None]:
# COMPILING: RMSprop optimiser, mean-squared-error loss, MAE reported as a metric
model.compile(
    optimizer='rmsprop',
    loss='mse',
    metrics=['mae'],
)

In [None]:
# FITTING: 500 epochs in mini-batches of 32, evaluated on the validation split each epoch
history = model.fit(
    train_data,
    train_targets,
    epochs=500,
    batch_size=32,
    validation_data=(validation_data, validation_targets),
)

##### new day new problems - never saw nan before

https://stackoverflow.com/questions/37232782/nan-loss-when-training-regression-network

### Changing Architecture

In [None]:

from keras import layers
from keras import models
from keras import regularizers


# Wider network (64, 32, 16 ReLU units) with an L2 weight penalty of 0.002 on
# every hidden layer, followed by a single linear output unit.
model = models.Sequential([
    layers.Dense(
        64,
        activation='relu',
        input_shape=(train_data.shape[1],),
        kernel_regularizer=regularizers.l2(0.002),
    ),
    layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.002)),
    layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.002)),
    layers.Dense(1),
])

# COMPILING: Adam optimiser, mean-squared-error loss, MAE reported as a metric
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae'],
)

# FITTING: 250 epochs in mini-batches of 32, evaluated on the validation split each epoch
history = model.fit(
    train_data,
    train_targets,
    epochs=250,
    batch_size=32,
    validation_data=(validation_data, validation_targets),
)

In [None]:
import matplotlib.pyplot as plt
# Per-epoch MAE recorded by fit(): training as dots, validation as a line.
# (`loss` / `val_loss` hold the MAE metric, not the MSE loss.)
loss = history.history['mae']
val_loss = history.history['val_mae']
epochs = range(1, len(loss) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'bo', label='Training mae')
ax.plot(epochs, val_loss, 'b', label='Validation mae')
ax.set_title('Training and validation mae')
ax.set_xlabel('Epochs')
ax.set_ylabel('MAE')
ax.legend()
plt.show()