In [2]:
import numpy as np
import pandas as pd

In [3]:
df_sellers = pd.read_csv("/content/olist_sellers_dataset.csv")
df_costumers = pd.read_csv("/content/olist_customers_dataset.csv")
df_orders = pd.read_csv("/content/olist_orders_dataset.csv")
df_order_items = pd.read_csv("/content/olist_order_items_dataset.csv")
df_order_payments = pd.read_csv("/content/olist_order_payments_dataset.csv")
df_order_reviews = pd.read_csv("/content/olist_order_reviews_dataset.csv")
df_products = pd.read_csv("/content/olist_products_dataset.csv")
df_geolocation = pd.read_csv("/content/olist_geolocation_dataset.csv")
df_translator = pd.read_csv("/content/product_category_name_translation.csv")

Function to get distance between 2 geolocations.

https://towardsdatascience.com/heres-how-to-calculate-distance-between-2-geolocations-in-python-93ecab5bbba4

*Calculates great-circle distance between two points on a sphere given their longitudes and latitudes using the haversine formula.*

In [4]:
def haversine_distance(lat1, lon1, lat2, lon2):
  r = 6371 # Planet earth radius in kilometres
  phi1 = np.radians(lat1)
  phi2 = np.radians(lat2)
  delta_phi = np.radians(lat2 - lat1)
  delta_lambda = np.radians(lon2 - lon1)
  a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) *   np.sin(delta_lambda / 2)**2
  res = r * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))

  return np.round(res, 2)

*Extracts the geolocation coordinates from the sellers/costumers zip code digits using the olist_geolocation_dataset.csv file*



In [5]:
def zip_to_geo(data, zip_code):
  zip_rows = data.loc[data['geolocation_zip_code_prefix'] == zip_code].reset_index() # Might have multiple geolocation for each zip code
  lat = zip_rows['geolocation_lat'].mean() # Check this approach 
  lon = zip_rows['geolocation_lng'].mean()

  return lat, lon

*Calculates the distance between the customer and the seller*

In [6]:
def distance_between_customer_and_seller(customer_zip, seller_zip):
  return haversine_distance(*zip_to_geo(df_geolocation, customer_zip),*zip_to_geo(df_geolocation, seller_zip))

*Calculating estimated delivery time error*

In [7]:
# Selecting only delivered orders
df_delivery_time = df_orders.loc[df_orders['order_status'] == 'delivered'].copy()

# Dropping unnecessary columns to calculate delivery time
df_delivery_time = df_delivery_time.drop(['order_purchase_timestamp','order_delivered_carrier_date'], axis=1)

df_delivery_time.head()

Unnamed: 0,order_id,customer_id,order_status,order_approved_at,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 11:07:15,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-26 03:24:27,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:55:23,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:45:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 22:20:29,2018-02-16 18:17:02,2018-02-26 00:00:00


*Transforming String to Datetime and removing time from dates*

In [8]:
df_delivery_time['order_approved_at'] = pd.to_datetime(df_delivery_time['order_approved_at'], format = '%Y-%m-%d').dt.normalize() # dt.normalize set time as 00:00:00
df_delivery_time['order_delivered_customer_date'] = pd.to_datetime(df_delivery_time['order_delivered_customer_date'], format = '%Y-%m-%d').dt.normalize()
df_delivery_time['order_estimated_delivery_date'] = pd.to_datetime(df_delivery_time['order_estimated_delivery_date'], format = '%Y-%m-%d').dt.normalize()

*Calculating delivery_time and estimated_delivery_time in days*

In [9]:
df_delivery_time['delivery_time'] = df_delivery_time['order_delivered_customer_date'] - df_delivery_time['order_approved_at']
df_delivery_time['estimated_delivery_time'] = df_delivery_time['order_estimated_delivery_date'] - df_delivery_time['order_approved_at']

*Calculating time error between estimated and real delivery times*

In [10]:
df_delivery_time['estimated_time_error'] = (
    pd.to_numeric(df_delivery_time['delivery_time'].dt.days, downcast='float') 
  - pd.to_numeric(df_delivery_time['estimated_delivery_time'].dt.days, downcast='float')
)/pd.to_numeric(df_delivery_time['estimated_delivery_time'].dt.days, downcast='float')


df_delivery_time.head()

Unnamed: 0,order_id,customer_id,order_status,order_approved_at,order_delivered_customer_date,order_estimated_delivery_date,delivery_time,estimated_delivery_time,estimated_time_error
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02,2017-10-10,2017-10-18,8 days,16 days,-0.5
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-26,2018-08-07,2018-08-13,12 days,18 days,-0.333333
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08,2018-08-17,2018-09-04,9 days,27 days,-0.666667
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18,2017-12-02,2017-12-15,14 days,27 days,-0.481481
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13,2018-02-16,2018-02-26,3 days,13 days,-0.769231


*Merging df_customers with df_delivery_time*

In [11]:
df_delivery_time_1 = df_delivery_time.merge(df_costumers, left_on='customer_id', right_on='customer_id')
df_delivery_time_1 = df_delivery_time_1[['order_id', 'customer_id', 'estimated_delivery_time', 'delivery_time', 'estimated_time_error', 'customer_zip_code_prefix', 'customer_city', 'customer_state']]
df_delivery_time_1.head()

Unnamed: 0,order_id,customer_id,estimated_delivery_time,delivery_time,estimated_time_error,customer_zip_code_prefix,customer_city,customer_state
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,16 days,8 days,-0.5,3149,sao paulo,SP
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,18 days,12 days,-0.333333,47813,barreiras,BA
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,27 days,9 days,-0.666667,75265,vianopolis,GO
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,27 days,14 days,-0.481481,59296,sao goncalo do amarante,RN
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,13 days,3 days,-0.769231,9195,santo andre,SP


*Merging df_order_items with df_delivery_time_1*

In [12]:
df_delivery_time_2 = df_delivery_time_1.merge(df_order_items[['order_id', 'product_id', 'seller_id']], left_on='order_id', right_on='order_id')
df_delivery_time_2.head()

Unnamed: 0,order_id,customer_id,estimated_delivery_time,delivery_time,estimated_time_error,customer_zip_code_prefix,customer_city,customer_state,product_id,seller_id
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,16 days,8 days,-0.5,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,18 days,12 days,-0.333333,47813,barreiras,BA,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,27 days,9 days,-0.666667,75265,vianopolis,GO,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,27 days,14 days,-0.481481,59296,sao goncalo do amarante,RN,d0b61bfb1de832b15ba9d266ca96e5b0,66922902710d126a0e7d26b0e3805106
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,13 days,3 days,-0.769231,9195,santo andre,SP,65266b2da20d04dbe00c5c2d3bb7859e,2c9e548be18521d1c43cde1c582c6de8


*Merging df_sellers with df_delivery_time_2* "**CHECK PRODUCT DIMENSIONS, SELLER SCORE, Location {ways to reach}**"

In [13]:
df_delivery_time_3 = df_delivery_time_2.merge(df_sellers[['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state']], left_on='seller_id', right_on='seller_id')
df_delivery_time_3.head()

Unnamed: 0,order_id,customer_id,estimated_delivery_time,delivery_time,estimated_time_error,customer_zip_code_prefix,customer_city,customer_state,product_id,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,16 days,8 days,-0.5,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,9350,maua,SP
1,8736140c61ea584cb4250074756d8f3b,ab8844663ae049fda8baf15fc928f47f,13 days,6 days,-0.538462,8577,itaquaquecetuba,SP,b00a32a0b42fd65efb58a5822009f629,3504c0cb71d7fa48d967e0e4c94d59d9,9350,maua,SP
2,a0151737f2f0c6c0a5fd69d45f66ceea,fc2697314ab7fbeda62bb6f1afa4efcd,13 days,6 days,-0.538462,13820,jaguariuna,SP,725cbfcaff95a4d43742fdf13cf43c75,3504c0cb71d7fa48d967e0e4c94d59d9,9350,maua,SP
3,a3bf941183211246f0d42ad757cba127,3718e1873d5dc3e8d96c0ab783278b02,10 days,3 days,-0.7,6298,osasco,SP,725cbfcaff95a4d43742fdf13cf43c75,3504c0cb71d7fa48d967e0e4c94d59d9,9350,maua,SP
4,1462290799412b71be32dd880eaf4e1b,220e4b027f0294fd79d2869ef67e7db6,24 days,7 days,-0.708333,94130,gravatai,RS,d7faab3fa0091d1220a8ada9cae1bab3,3504c0cb71d7fa48d967e0e4c94d59d9,9350,maua,SP


*Merging df_products with df_delivery_time_3*

In [14]:
df_delivery_time_4 = df_delivery_time_3.merge(df_products[['product_id', 'product_category_name']], left_on='product_id', right_on='product_id')
df_delivery_time_4.tail()

Unnamed: 0,order_id,customer_id,estimated_delivery_time,delivery_time,estimated_time_error,customer_zip_code_prefix,customer_city,customer_state,product_id,seller_id,seller_zip_code_prefix,seller_city,seller_state,product_category_name
110192,1ab38815794efa43d269d62b98dae815,a0b67404d84a70ef420a7f99ad6b190a,15 days,4 days,-0.733333,8528,ferraz de vasconcelos,SP,31ec3a565e06de4bdf9d2a511b822b4d,babcc0ab201e4c60188427cae51a5b8b,13660,porto ferreira,SP,construcao_ferramentas_iluminacao
110193,735dce2d574afe8eb87e80a3d6229c48,d531d01affc2c55769f6b9ed410d8d3c,23 days,9 days,-0.608696,60455,fortaleza,CE,1d187e8e7a30417fda31e85679d96f0f,d263fa444c1504a75cbca5cc465f592a,13478,americana,SP,moveis_decoracao
110194,ba4ff7bab012df64eed239c402ba6a0d,48ebd9467fef647e9a0cd9c1b9bdbe24,29 days,19 days,-0.344828,84130,palmeira,PR,dc2f80dfc6b4e57be59ec9dfe9140849,d6cd01c59123df02fc226eadbadb5f89,1207,sao paulo,SP,eletronicos
110195,25d2bfa43663a23586afd12f15b542e7,9d8c06734fde9823ace11a4b5929b5a7,17 days,21 days,0.235294,39803,teofilo otoni,MG,6e1c2008dea1929b9b6c27fa01381e90,edf3fabebcc20f7463cc9c53da932ea8,8320,sao paulo,SP,moveis_decoracao
110196,1565f22aa9452ff278638e87cc895678,56772dfbcbe7df908a284ff0d53adf7d,13 days,5 days,-0.615385,9687,sao bernardo do campo,SP,9c1e194db1d35a79d962ea610bfe0868,f3862c2188522d89860c38a3ea8b550d,14092,ribeirao preto,SP,perfumaria


*Calculating distance between customer and seller*

In [15]:
from tqdm.notebook import tqdm
tqdm.pandas()

df_delivery_time_4['distance_customer_seller'] = df_delivery_time_4.progress_apply(lambda row : distance_between_customer_and_seller(row['customer_zip_code_prefix'], row['seller_zip_code_prefix']), axis = 1)

  0%|          | 0/110197 [00:00<?, ?it/s]

*Creation of Training dataset*


In [16]:
df_training = df_delivery_time_4[[
 'customer_city',
 'customer_state',
 'seller_city',
 'seller_state',
 'product_category_name',  
 'distance_customer_seller',
 'estimated_delivery_time',
 'estimated_time_error',
]]
df_training.head()

Unnamed: 0,customer_city,customer_state,seller_city,seller_state,product_category_name,distance_customer_seller,estimated_delivery_time,estimated_time_error
0,sao paulo,SP,maua,SP,utilidades_domesticas,18.58,16 days,-0.5
1,sao paulo,SP,maua,SP,utilidades_domesticas,15.84,13 days,-0.769231
2,sao paulo,SP,maua,SP,utilidades_domesticas,23.28,13 days,-0.615385
3,florianopolis,SC,maua,SP,utilidades_domesticas,,19 days,-0.315789
4,itaquaquecetuba,SP,maua,SP,bebes,23.04,13 days,-0.538462


*Converting estimated_delivery_time from days to float*

In [17]:
df_training['estimated_delivery_time'] = pd.to_numeric(df_training['estimated_delivery_time'].dt.days, downcast='float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


*Removing delayed delivery as they can be outliers*

In [18]:
df_training.drop(df_training[df_training.estimated_time_error > 0].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


*Converting categorical values into numeric*

In [19]:
from sklearn import preprocessing

def encoding_categoricals(df=None):
  encoders = []
  col_names = df.columns
  for col in col_names:
    if type(df[col].iloc[0]) == str:
      le = preprocessing.LabelEncoder()
      le.fit(df[col].astype(str))
      encoders.append(le)
      df.loc[:,col] = le.transform(df[col].astype(str))
  return df, encoders

In [20]:
df_encoded, encoders = encoding_categoricals(df_training)
df_encoded.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,customer_city,customer_state,seller_city,seller_state,product_category_name,distance_customer_seller,estimated_delivery_time,estimated_time_error
0,3467,25,321,21,73,18.58,16.0,-0.5
1,3467,25,321,21,73,15.84,13.0,-0.769231
2,3467,25,321,21,73,23.28,13.0,-0.615385
3,1318,23,321,21,73,,19.0,-0.315789
4,1769,25,321,21,9,23.04,13.0,-0.538462


*Removes NaN, infinite and negative infinite values from the dataset*

In [21]:
def clean_dataset(df):
  assert isinstance(df, pd.DataFrame), "df is not pd.DataFrame"

  df.dropna(inplace=True)
  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
  return df[indices_to_keep].astype(np.float32)

In [22]:
df_encoded_clean = clean_dataset(df_encoded)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


*Normalizing the dataset*

In [23]:
def normalize_dataset(df=None):
  column_names = df.columns

  scaler = preprocessing.StandardScaler()
  X = scaler.fit_transform(df[column_names[:-1]])

  new_df = pd.DataFrame(data=X, columns=column_names[:-1])
  df = pd.concat([new_df, df['estimated_time_error']], axis=1)

  return df, scaler

In [24]:
df_training, scaler = normalize_dataset(df_encoded_clean)
df_training.head()

Unnamed: 0,customer_city,customer_state,seller_city,seller_state,product_category_name,distance_customer_seller,estimated_delivery_time,estimated_time_error
0,0.759132,0.427225,-0.424729,0.207644,1.56774,-1.015359,-0.400401,-0.5
1,0.759132,0.427225,-0.424729,0.207644,1.56774,-1.031297,-0.770455,-0.769231
2,0.759132,0.427225,-0.424729,0.207644,1.56774,-0.98802,-0.770455,-0.615385
3,-0.891037,0.427225,-0.424729,0.207644,-1.197317,-0.989416,-0.770455,
4,-1.946446,0.427225,-0.424729,0.207644,-1.197317,-0.497498,-0.770455,-0.538462


*Imputing for missing data*

In [25]:
from sklearn.impute import SimpleImputer

def fill_missing_data(df=None):
  imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
  df.iloc[:,:] = imputer.fit_transform(df)

  return df

In [26]:
df_training_full = fill_missing_data(df_training)

*Splitting data into training and testing*

In [52]:
df_training_full.head()

Unnamed: 0,customer_city,customer_state,seller_city,seller_state,product_category_name,distance_customer_seller,estimated_delivery_time,estimated_time_error
0,0.759132,0.427225,-0.424729,0.207644,1.56774,-1.015359,-0.400401,-0.5
1,0.759132,0.427225,-0.424729,0.207644,1.56774,-1.031297,-0.770455,-0.769231
2,0.759132,0.427225,-0.424729,0.207644,1.56774,-0.98802,-0.770455,-0.615385
3,-0.891037,0.427225,-0.424729,0.207644,-1.197317,-0.989416,-0.770455,-0.603667
4,-1.946446,0.427225,-0.424729,0.207644,-1.197317,-0.497498,-0.770455,-0.538462


In [54]:
X = df_training_full.drop(columns = ['estimated_time_error'])
X.columns

Index(['customer_city', 'customer_state', 'seller_city', 'seller_state',
       'product_category_name', 'distance_customer_seller',
       'estimated_delivery_time'],
      dtype='object')

In [57]:
y = df_training_full['estimated_time_error']
y

0        -0.500000
1        -0.769231
2        -0.615385
3        -0.603667
4        -0.538462
            ...   
110182   -0.921569
110183   -0.200000
110189   -0.736842
110192   -0.733333
110196   -0.615385
Name: estimated_time_error, Length: 65274, dtype: float32

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Correlation between features and target**

In [60]:
focus_cols = ['estimated_time_error']
df_encoded_clean.corr().filter(focus_cols).drop(focus_cols).style.format('{:.2}')

Unnamed: 0,estimated_time_error
customer_city,-0.02
customer_state,-0.11
seller_city,-0.063
seller_state,0.0043
product_category_name,-0.0036
distance_customer_seller,0.18
estimated_delivery_time,-0.083


**What to classify**? 
- Customer
- Seller reliability
- Churn - Low , medium and high

***Modeling***

**Linear Regression**

In [61]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)
lr_result = lr.predict(X_test)

In [62]:
print(lr_result)

[-0.6043161  -0.60374236 -0.60374236 ... -0.60374236 -0.60374236
 -0.60871667]


In [65]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score

print("mean_sqrd_error is=",mean_squared_error(y_test,lr_result))
print("root_mean_squared error of is=",np.sqrt(mean_squared_error(y_test,lr_result)))

mean_sqrd_error is= 0.030040225
root_mean_squared error of is= 0.17332116


**DecisionTreeRegressor**

In [66]:
from sklearn.tree import DecisionTreeRegressor

regr = DecisionTreeRegressor(max_depth=2)
regr.fit(X_train, y_train)
regr_result = regr.predict(X_test)

In [67]:
print(regr_result)

[-0.59864847 -0.60948513 -0.60948513 ... -0.60948513 -0.60948513
 -0.59864847]


In [68]:
regr.score(X_test, y_test)

0.0009976817517325376

In [69]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score
score=r2_score(y_test,regr_result)
print("r2 score is",score)
print("mean_sqrd_error is=",mean_squared_error(y_test,regr_result))
print("root_mean_squared error of is=",np.sqrt(mean_squared_error(y_test,regr_result)))

r2 score is 0.0009976817517325376
mean_sqrd_error is= 0.03000933112982203
root_mean_squared error of is= 0.1732320153142081


**MLPRegressor**


In [70]:
from sklearn.neural_network import MLPRegressor

regr_MLP = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)
regr_result = regr_MLP.predict(X_test)

print(regr_result)

[-0.59121186 -0.6054272  -0.6054272  ... -0.60542727 -0.60542727
 -0.5729977 ]


In [71]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score

print("mean_sqrd_error is=",mean_squared_error(y_test,regr_result))
print("root_mean_squared error of is=",np.sqrt(mean_squared_error(y_test,regr_result)))

mean_sqrd_error is= 0.03030608
root_mean_squared error of is= 0.17408642


**RandomForestRegressor**

In [75]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(X_train, y_train)
rf_result = rf.predict(X_test)

In [76]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score

print("mean_sqrd_error is=",mean_squared_error(y_test,rf_result))
print("root_mean_squared error of is=",np.sqrt(mean_squared_error(y_test,rf_result)))

mean_sqrd_error is= 0.031435925183007474
root_mean_squared error of is= 0.177301791257188


**KNNRegressor**

In [77]:
from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=2)
neigh.fit(X_train, y_train)

neigh_result = neigh.predict(X_test)

In [78]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score

print("mean_sqrd_error is=",mean_squared_error(y_test,neigh_result))
print("root_mean_squared error of is=",np.sqrt(mean_squared_error(y_test,neigh_result)))

mean_sqrd_error is= 0.044126656
root_mean_squared error of is= 0.21006346


**XBoost Regressor**

In [79]:
import xgboost
from sklearn import ensemble

params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}

xgb = ensemble.GradientBoostingRegressor(**params)
xgb.fit(X_train, y_train)

xgb_result = xgb.predict(X_test)

In [80]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score

print("mean_sqrd_error is=",mean_squared_error(y_test,xgb_result))
print("root_mean_squared error of is=",np.sqrt(mean_squared_error(y_test,xgb_result)))

mean_sqrd_error is= 0.029885904477133737
root_mean_squared error of is= 0.17287540159644962
