In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score
from sklearn import grid_search
import matplotlib.pyplot as plt

In [2]:
# Load data files
vehicle = pd.read_csv('vehicles.csv')
reservation = pd.read_csv('reservations.csv')

In [3]:
# Roll up the reservationdata so that we have one row for each vehicle
reserv_agg = reservation.groupby('vehicle_id')

In [4]:
# Change the format of the rolled up data from "Group by" object to panda data frame
reserv_agg = pd.DataFrame(reserv_agg.size().reset_index(name="count"))

In [5]:
# Merge vehcile data with reservation data
merge = vehicle.merge(reserv_agg,on='vehicle_id', how='left')

In [6]:
# Assuming all vehicle with missing reservation data as they were not booked.
# So imputing zero
merge['count'].fillna(0, inplace=True)

In [15]:
merge.head()

Unnamed: 0,vehicle_id,technology,actual_price,recommended_price,num_images,street_parked,description,count
0,1,1,67.85,59,5,0,7,1.0
1,2,0,100.7,53,5,0,224,7.0
2,3,0,74.0,74,4,1,21,17.0
3,4,0,135.0,75,1,0,184,2.0
4,5,0,59.36,53,2,1,31,2.0


In [6]:
# This is just to write into excel
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('pandas_.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
merge.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

# Variable Importance
# Approach 1 - Random Forest Regressor

### FInd the best parameter with cross validation

In [7]:
model = RandomForestRegressor()

In [8]:
# Take a list of parameters to choose from
param_grid = {
                 'n_estimators': [5, 10, 15, 20],
                 'max_depth': [2, 5, 7, 9]
             }

In [9]:
# Using Grid search with cross validation for optimization
grid_reg = grid_search.GridSearchCV(model,param_grid, cv=10)

In [10]:
# Fit the model
grid_reg.fit(merge[merge.columns[1:7]], merge[[7]])

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 20], 'max_depth': [2, 5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [11]:
# Get the best estimator
grid_reg.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

## Now fit the best model

In [32]:
model_new = RandomForestRegressor(max_depth=5,min_samples_leaf=1,
           min_samples_split=2,n_estimators=10)

In [33]:
model_new.fit(merge[merge.columns[1:7]], merge[[7]])

  if __name__ == '__main__':


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [34]:
# Get feature importance ranking from the model
importances = model_new.feature_importances_
indices = np.argsort(importances)

In [35]:
# Make a data frame of feature and their importance
feat_imp = pd.DataFrame({'feature': merge.columns[1:7], 'importance': np.round(importances,3)})

In [36]:
Importance = feat_imp.sort_values('importance', ascending=False)
print Importance

             feature  importance
1       actual_price       0.380
2  recommended_price       0.321
3         num_images       0.167
5        description       0.098
4      street_parked       0.018
0         technology       0.017


In [37]:
# Plot them into a bar graph
plt.figure()
plt.title('Feature Importances')
plt.barh(range(len(indices)),importances[indices], align='center', color='g')
plt.yticks(range(len(indices)),merge[indices+1])
plt.tight_layout()
plt.show()

# Approach 2 - Univariate Selection

In [38]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [39]:
fit = SelectKBest(score_func=chi2, k=4).fit(merge[merge.columns[1:7]], merge[[7]])

  y = column_or_1d(y, warn=True)


In [40]:
# summarize scores
np.set_printoptions(precision=3)
importances = fit.scores_
indices = np.argsort(importances)

In [41]:
# Make a data frame of feature and their importance
feat_imp = pd.DataFrame({'feature': merge.columns[1:7], 'importance': np.round(importances,3)})
Importance = feat_imp.sort_values('importance', ascending=False)
print Importance

             feature  importance
5        description    1455.093
1       actual_price     935.136
2  recommended_price      72.371
3         num_images      67.643
0         technology      46.984
4      street_parked      13.096


In [44]:
plt.figure()
plt.title('Feature Importances')
plt.barh(range(len(indices)),importances[indices], align='center', color='g')
plt.yticks(range(len(indices)),merge[indices+1])
plt.tight_layout()
plt.show()

# Approach 3 - Recursive Feature Elimination

In [83]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


In [86]:
# Train logistic regression model
model = LogisticRegression()
rfe = RFE(model, 5)
fit = rfe.fit(merge[merge.columns[1:7]], merge[[7]])


  y = column_or_1d(y, warn=True)


In [87]:
print("Num Features: %d") % fit.n_features_
print("Selected Features: %s") % fit.support_
print("Feature Ranking: %s") % fit.ranking_

Num Features: 5
Selected Features: [ True  True False  True  True  True]
Feature Ranking: [1 1 2 1 1 1]
