# Part 2: Car Factors 

You are to construct a predictive model that estimates how long a given car will stay listed before it is sold (the `duration_listed` column).  

1. Populate `carsfactors.py`, following the hints in its comments
1. Integrate it with `carfactors_service.py`
1. Test locally
1. Build `requirements.txt` and the `Dockerfile`
1. Build a Docker image
1. Test the Docker image locally
1. Push the image to Docker Hub
1. Populate the README for both GitHub and Docker Hub (with example docker commands)
1. Populate this notebook with working output and a summary that contains your impression of the model and how to improve it.

* ***Review the [codeSamplesforCategoricalData.ipynb](./codeSamplesforCategoricalData.ipynb) for code review of the categorical data manipulations***.

In [None]:
!pip install pandas 
!pip install numpy 
!pip install scikit-learn 
!pip install basemap
!pip install flask

In [None]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
from IPython.display import display, HTML

# Display properties: show every row/column, use a wide canvas, centre the
# column headers and print floats with three decimals.
_display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 1000,
    'display.colheader_justify': 'center',
    'display.precision': 2,
    'display.float_format': lambda value: '%.3f' % value,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Seaborn theme used by any plots drawn later in the notebook
sns.set(style="ticks", color_codes=True)

In [None]:
# cars.csv is checked into this git repository, so a path relative to the
# notebook's working directory is enough to load it.
df = pd.read_csv("cars.csv")

# df.shape is (rows, columns): rows are the volume of the data, columns its
# dimensionality.
data_shape = df.shape
print(f'# The dataset has {data_shape[0]} rows and {data_shape[1]} columns')

In [None]:
# Peek at the first five rows to get a feel for the columns
df.head(n=5)

In [None]:
# Rows for which no engine capacity was recorded
engine_null = df.loc[df.engine_capacity.isnull()]
# Only the last expression of a cell is rendered automatically, so this
# intermediate frame has to be displayed explicitly to be seen at all.
display(engine_null)

# NOTE(review): the variable name suggests this was meant to look at electric
# cars (e.g. the Nissan Leaf), yet the filter keeps every row whose
# engine_type is NOT 'electric'. Behaviour is left unchanged; confirm whether
# `==` was intended.
nissan_leaf_ec = df.loc[(df.engine_type != 'electric')]
nissan_leaf_ec.head()

In [None]:
# Distinct values of each categorical column, used below to decide how each
# column should be encoded.

# Engine-related columns
unique_fuel = df['engine_fuel'].unique()
unique_engine = df['engine_type'].unique()
unique_has_gas = df['engine_has_gas'].unique()

# Listing / condition columns
unique_has_warranty = df['has_warranty'].unique()
unique_state = df['state'].unique()
unique_drive_train = df['drivetrain'].unique()
unique_location = df['location_region'].unique()

# Vehicle identity columns
unique_manufacturer = df['manufacturer_name'].unique()
unique_model = df['model_name'].unique()
unique_body_type = df['body_type'].unique()
unique_transmission = df['transmission'].unique()

In [None]:
# Render the distinct transmission values collected in the previous cell
unique_transmission

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Fix the order for body types
ordered_body_types = ['universal', 'hatchback', 'cabriolet', 'coupe', 'sedan', 'liftback', 'suv',
                      'minivan', 'van', 'pickup', 'minibus', 'limousine']

# Compare the actual values rather than just the counts: two lists of equal
# length can still differ (a misspelt or swapped category), which would only
# surface later as an error when the encoder is fitted.
body_types_not_ordered = sorted(set(unique_body_type) - set(ordered_body_types), key=str)
body_types_not_in_data = sorted(set(ordered_body_types) - set(unique_body_type), key=str)
if not body_types_not_ordered and not body_types_not_in_data:
    print("We are good with body types")
else:
    print("Check if you missed any body types")
    print(f"  in the data but missing from the ordering: {body_types_not_ordered}")
    print(f"  in the ordering but absent from the data: {body_types_not_in_data}")

# Create the Ordinal Encoder; each body type is encoded as its position in
# ordered_body_types.
oe = OrdinalEncoder(categories=[ordered_body_types])

In [None]:
# Exploratory aggregates of duration_listed. Each result is a Series indexed
# by the grouping keys.

# Number of listings per full vehicle configuration
model_config_keys = ['manufacturer_name', 'model_name', 'body_type',
                     'engine_type', 'transmission', 'drivetrain']
model_name_analysis_df = df.groupby(model_config_keys)['duration_listed'].count()

# Spread of the listing duration per manufacturer/model pair
manufacturer_model_keys = ['manufacturer_name', 'model_name']
manufacturer_model_analysis_df = df.groupby(manufacturer_model_keys)['duration_listed'].std()

# Spread of the listing duration per manufacturer, body type, transmission and color
manufacturer_keys = ['manufacturer_name', 'body_type', 'transmission', 'color']
manufacturer_analysis_df = df.groupby(manufacturer_keys)['duration_listed'].std()

In [None]:
# Keep an untouched deep copy of the raw data as a backup
df_bak = df.copy(deep=True)

# Columns excluded from modelling (see readme for rationale)
anonymous_feature_cols = [f'feature_{i}' for i in range(10)]
cols_to_drop = [
    'engine_fuel', 'engine_has_gas', 'engine_capacity',
    *anonymous_feature_cols,
    'is_exchangeable', 'location_region', 'model_name',
    'number_of_photos', 'up_counter',
]

# errors='ignore' skips any listed column that is not present in df
trimmed_df = df.drop(columns=cols_to_drop, errors='ignore')
trimmed_df.tail()

In [None]:
# Count the missing values in every remaining column and render the counts as
# an HTML table to confirm nothing is missing.
missing_per_column = trimmed_df.isna().sum().to_frame()
display(HTML(missing_per_column.to_html()))

In [None]:
# Work on a copy so trimmed_df keeps its original body_type strings
body_type_df = trimmed_df[['body_type']].copy()

# Fit the ordinal encoder and replace each body type with its ordinal code
encoded_body_type = oe.fit_transform(body_type_df[['body_type']])
body_type_df[['body_type']] = encoded_body_type
body_type_df.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder


def _one_hot_encode(column):
    """One-hot encode a single categorical column.

    Args:
        column: pandas Series holding the categorical values.

    Returns:
        A tuple ``(encoder, encoded, frame)``: the fitted OneHotEncoder, the
        dense encoded array, and a DataFrame of that array whose columns are
        the encoder's categories.
    """
    encoder = OneHotEncoder()
    # The encoder expects a 2-D input, hence the reshape to a single column
    encoded = encoder.fit_transform(column.values.reshape(-1, 1)).toarray()
    # Reuse the source index so a later pd.concat(axis=1) lines the rows up
    # with the other feature frames even if the source index is not 0..n-1.
    frame = pd.DataFrame(encoded, columns=encoder.categories_[0], index=column.index)
    return encoder, encoded, frame


# Apply one-hot encoding to each nominal categorical column. The fitted
# encoders (ohc_*) are kept so the same encoding can be applied to new inputs.
ohc_transmission, ohe_transmission, df_transmission = _one_hot_encode(trimmed_df['transmission'])
ohc_manufacturer, ohe_manufacturer, df_manufacturer = _one_hot_encode(trimmed_df['manufacturer_name'])
ohc_color, ohe_color, df_color = _one_hot_encode(trimmed_df['color'])
ohc_engine, ohe_engine, df_engine = _one_hot_encode(trimmed_df['engine_type'])
ohc_drivetrain, ohe_drivetrain, df_drivetrain = _one_hot_encode(trimmed_df['drivetrain'])
ohc_state, ohe_state, df_state = _one_hot_encode(trimmed_df['state'])

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# has_warranty is label-encoded on the presumption that having a warranty
# ranks higher than not having one.
le_warranty = LabelEncoder()
has_warranty_df = trimmed_df[['has_warranty']].copy()
has_warranty_df['has_warranty'] = le_warranty.fit_transform(trimmed_df['has_warranty'])

# The remaining numeric columns are pulled out and min-max scaled
normalizable_cols = ['odometer_value', 'year_produced', 'price_usd']
numeric_df = trimmed_df[normalizable_cols].copy()

min_max_scaler = MinMaxScaler()
scaled_numeric = min_max_scaler.fit_transform(trimmed_df[normalizable_cols])
numeric_df[normalizable_cols] = scaled_numeric

# Ad-hoc check (disabled): min_max_scaler.transform(np.array([[27585858, 2023, 60897]]))
# shows how the fitted scaler maps a single [odometer, year, price] row.


In [None]:
# Stitch every encoded/scaled frame together column-wise into one feature table
feature_frames = [
    df_manufacturer,
    df_transmission,
    df_color,
    df_engine,
    df_drivetrain,
    df_state,
    body_type_df,
    has_warranty_df,
    numeric_df,
]
car_factors_features = pd.concat(feature_frames, axis=1)
car_factors_features.head()

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Get X & y values: the assembled feature table and the listing duration target
X = car_factors_features.values
y = trimmed_df['duration_listed'].values

# Obtain training and test dataset (10% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# Alternative that was considered: SVC(kernel='rbf')
model = LinearRegression()
pipe_lr = make_pipeline(model)
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)

# R^2 on the training data alone says nothing about generalisation, so the
# held-out score is reported next to it.
print(f'R^2 on training data: {model.score(X_train, y_train)}')
print(f'R^2 on held-out test data: {model.score(X_test, y_test)}')

# Mean squared error of the predictions on the held-out set (cell output)
mean_squared_error(y_test, y_pred)

In [None]:
import numpy as np

# Show the raw transmission values the encoder was fitted on
transmission_values = trimmed_df['transmission'].values
print(transmission_values)

# Encode a single hand-written value with the already-fitted encoder to see
# what the one-hot row for 'automatic' looks like.
single_transmission = np.array(['automatic'])
print(single_transmission)
single_transmission_2d = single_transmission.reshape(-1, 1)
encoded_transmission = ohc_transmission.transform(single_transmission_2d).toarray()
print(encoded_transmission)

In [1]:
# carsfactors is the project module populated for this assignment
from carsfactors import carsfactors

# Instantiate it once; the cells below call model_stats() and model_infer() on this object
cf = carsfactors()

### Test Model first - Get stats

In [2]:
# Call model_stats() on the carsfactors instance; the value recorded below is what it returned
cf.model_stats()

'0.01878970628321386'

### Get Determination

In [3]:
# Ask the model for a prediction for a single car.
# Presumably the arguments are, in order: manufacturer, transmission, color,
# engine type, drivetrain, state, body type, has_warranty, odometer value,
# year produced and price in USD — verify against carsfactors.model_infer.
cf.model_infer('Subaru','automatic', 'silver', 'gasoline', 'all', 'new', 'suv', True, 19000, 2018, 49000)

[[1. 0. 0.]]
[[0. 0. 1.]]
[[0]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1.
  1. 0. 0. 0. 1. 0. 6. 0.]]


  return f(*args, **kwargs)


'[4183675.0234375]'

### Start up the service

In [None]:
# Launch the service script from the notebook via a shell command
!python carfactors_service.py

Try out the links 
* [stats](http://fillin)
* [determination](http://fillin)

### You must kill (restart) the kernel before trying again, because the port stays locked to the current kernel

# Summary
* Assignment and Model Results
* Techniques to improve the results

PUT YOUR ANSWERS HERE which could include additional cells with working code examples