In [40]:
# Importing Libraries 
import os, sys
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline 

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')


from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [87]:
# Load the raw train and test tables from the current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [88]:
# Hold out 20% of the labelled training rows; the fixed seed keeps the split repeatable.
# The import lives in this cell because the import cell at the top of the notebook
# does not bring in train_test_split, so a fresh kernel would fail here otherwise.
from sklearn.model_selection import train_test_split

train, test = train_test_split(train_df, test_size=0.2, random_state=42)

In [91]:
# Preview the training partition produced by the split in the previous cell.
train

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
549,FDW44,9.500,Regular,0.035206,Fruits and Vegetables,171.3448,OUT049,1999,Medium,Tier 1,Supermarket Type1,2386.2272
7757,NCF54,18.000,Low Fat,0.047473,Household,170.5422,OUT045,2002,,Tier 2,Supermarket Type1,3103.9596
764,FDY03,17.600,Regular,0.076122,Meat,111.7202,OUT046,1997,Small,Tier 1,Supermarket Type1,1125.2020
6867,FDQ20,8.325,Low Fat,0.029845,Fruits and Vegetables,41.6138,OUT045,2002,,Tier 2,Supermarket Type1,284.2966
2716,FDP34,12.850,Low Fat,0.137228,Snack Foods,155.5630,OUT046,1997,Small,Tier 1,Supermarket Type1,4224.5010
...,...,...,...,...,...,...,...,...,...,...,...,...
5734,FDY08,9.395,Regular,0.286345,Fruits and Vegetables,139.1838,OUT010,1998,,Tier 3,Grocery Store,280.9676
5191,FDC41,15.600,Low Fat,0.117575,Frozen Foods,75.6670,OUT017,2007,,Tier 2,Supermarket Type1,1301.6390
5390,NCQ53,17.600,Low Fat,0.018944,Health and Hygiene,237.3590,OUT045,2002,,Tier 2,Supermarket Type1,6145.3340
860,FDL46,20.350,low fat,0.054363,Snack Foods,117.9466,OUT017,2007,,Tier 2,Supermarket Type1,1649.8524


In [42]:
# Column the models are trained to predict.
target_column_name = 'Item_Outlet_Sales'

# Training data: the label series, then every remaining column as predictors.
target_feature_train_df = train_df[target_column_name]
input_feature_train_df = train_df.drop(columns=[target_column_name])

# Test data, separated the same way.
target_feature_test_df = test_df[target_column_name]
input_feature_test_df = test_df.drop(columns=[target_column_name])

In [43]:
# function to get numerical and categorical columns
def Numerical_categorical_column(dataframe):
    '''
    Split the columns of ``dataframe`` into numerical and categorical names.

    A column is numerical when pandas reports a numeric dtype for it
    (integers, floats, booleans); every other column (object, string,
    category, datetime, ...) is treated as categorical.  Checking for a
    numeric dtype, rather than "anything that is not object", keeps
    category and pandas string columns out of the numerical list.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    numerical_columns, categorical_columns : list, list
        Column names in their original order.
    '''
    numerical_columns = []
    categorical_columns = []
    # Iterating over ``dtypes`` avoids indexing the frame once per column.
    for column_name, column_dtype in dataframe.dtypes.items():
        if pd.api.types.is_numeric_dtype(column_dtype):
            numerical_columns.append(column_name)
        else:
            categorical_columns.append(column_name)
    return numerical_columns, categorical_columns

# Classify the training feature columns and print both groups.
feature_groups = Numerical_categorical_column(input_feature_train_df)
numerical_features, categorical_features = feature_groups
print(f'numerical_features : {numerical_features}',
      f'categorical_features : {categorical_features}',
      sep='\n')

numerical_features : ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']
categorical_features : ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [44]:
# Collapse the alternate spellings of the fat-content labels into the canonical ones.
fat_content_aliases = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
for features in (input_feature_train_df, input_feature_test_df):
    features['Item_Fat_Content'] = features['Item_Fat_Content'].replace(fat_content_aliases)

In [45]:
# Keep only the two-letter prefix of each item identifier (e.g. 'FD', 'NC').
# The vectorised .str accessor leaves a missing identifier as NaN, whereas slicing
# inside apply() would raise a TypeError on the first missing value.
for features in (input_feature_train_df, input_feature_test_df):
    features['Item_Identifier'] = features['Item_Identifier'].str[:2]


In [46]:
# Feature engineering: replace the establishment year with the outlet's age.
# The reference year is named once instead of being repeated as a bare literal.
OUTLET_AGE_REFERENCE_YEAR = 2013

for features in (input_feature_train_df, input_feature_test_df):
    # Read the year from the same frame that is being modified, so train and test
    # are handled identically (the test branch used to read from test_df instead).
    features['Outlet_age'] = OUTLET_AGE_REFERENCE_YEAR - features['Outlet_Establishment_Year']
    features.drop(columns=['Outlet_Establishment_Year'], inplace=True)

In [47]:
# Products whose identifier prefix is 'NC' (non-consumable) still carry a fat-content
# label; overwrite it with 'Non Edible'.
for features in (input_feature_train_df, input_feature_test_df):
    is_non_consumable = features['Item_Identifier'] == 'NC'
    features.loc[is_non_consumable, 'Item_Fat_Content'] = 'Non Edible'

In [48]:
# Columns left out of the model for now (to be revisited later).
unused_columns = ['Item_Type', 'Outlet_Identifier']
for features in (input_feature_train_df, input_feature_test_df):
    features.drop(columns=unused_columns, inplace=True)

In [55]:
# Recompute the column groups now that columns have been added and dropped,
# then show the numerical ones.
numerical_features,categorical_features = Numerical_categorical_column(input_feature_train_df)
numerical_features

['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_age']

In [56]:
# Show the categorical column names found by the previous cell.
categorical_features

['Item_Identifier',
 'Item_Fat_Content',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [52]:
# Inspect the test features after the clean-up cells above.
input_feature_test_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_age
0,FD,20.750,Low Fat,0.007565,107.8622,Medium,Tier 1,Supermarket Type1,14
1,FD,8.300,Regular,0.038428,87.3198,,Tier 2,Supermarket Type1,6
2,NC,14.600,Non Edible,0.099575,241.7538,,Tier 3,Grocery Store,15
3,FD,7.315,Low Fat,0.015388,155.0340,,Tier 2,Supermarket Type1,6
4,FD,,Regular,0.118599,234.2300,Medium,Tier 3,Supermarket Type3,28
...,...,...,...,...,...,...,...,...,...
5676,FD,10.500,Regular,0.013496,141.3154,Small,Tier 1,Supermarket Type1,16
5677,FD,7.600,Regular,0.142991,169.1448,Medium,Tier 3,Supermarket Type2,4
5678,NC,10.000,Non Edible,0.073529,118.7440,,Tier 2,Supermarket Type1,11
5679,FD,15.300,Regular,0.000000,214.6218,,Tier 2,Supermarket Type1,6


In [26]:
# Split both feature frames into their numerical and categorical parts.
train_num_df = input_feature_train_df[numerical_features]
test_num_df = input_feature_test_df[numerical_features]

train_cat_raw = input_feature_train_df[categorical_features]
test_cat_raw = input_feature_test_df[categorical_features]

# Integer-encode every categorical column (the encoder is refitted for each column).
# NOTE(review): the test frame is encoded with encoders fitted on the test data itself,
# so its codes only match the training codes when both frames contain the same categories.
train_cat_df = train_cat_raw.apply(LabelEncoder().fit_transform)
test_cat_df = test_cat_raw.apply(LabelEncoder().fit_transform)

# The encoder turns a missing Outlet_Size into an ordinary integer code. Put NaN back
# wherever the raw value was missing so the KNN imputer can fill it later.
# This is assigned back to the frame on purpose: calling replace(..., inplace=True) on
# the selected column is chained assignment, which newer pandas deprecates and which
# has no effect on the frame under Copy-on-Write. Masking on the raw NaN positions
# also avoids depending on the specific code (3) the encoder happened to pick.
train_cat_df['Outlet_Size'] = train_cat_df['Outlet_Size'].where(train_cat_raw['Outlet_Size'].notna())
test_cat_df['Outlet_Size'] = test_cat_df['Outlet_Size'].where(test_cat_raw['Outlet_Size'].notna())

In [27]:
# Inspect the numerical training features before imputation and scaling.
train_num_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age
0,9.300,0.016047,249.8092,14
1,5.920,0.019278,48.2692,4
2,17.500,0.016760,141.6180,14
3,19.200,0.000000,182.0950,15
4,8.930,0.000000,53.8614,26
...,...,...,...,...
8518,6.865,0.056783,214.5218,26
8519,8.380,0.046982,108.1570,11
8520,10.600,0.035186,85.1224,9
8521,7.210,0.145221,103.1332,4


# Handling Missing Values 

# Numerical Features 

In [29]:
# Fill missing numerical values from the 3 nearest rows, all weighted equally.
imputer_num = KNNImputer(missing_values=np.nan, weights='uniform', n_neighbors=3)
numeric_column_names = train_num_df.columns
new_array = imputer_num.fit_transform(train_num_df)
train_num_df = pd.DataFrame(new_array, columns=numeric_column_names)

# Standardise the imputed columns and wrap the result back into a labelled frame.
scaler = StandardScaler()
num_array = scaler.fit_transform(train_num_df)
train_num_df = pd.DataFrame(data=num_array, columns=numeric_column_names)

train_num_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age
0,-0.818533,-0.970732,1.747454,-0.139541
1,-1.591194,-0.908111,-1.489023,-1.334103
2,1.055970,-0.956917,0.010040,-0.139541
3,1.444587,-1.281758,0.660050,-0.020085
4,-0.903114,-1.281758,-1.399220,1.293934
...,...,...,...,...
8518,-1.375169,-0.181193,1.180783,1.293934
8519,-1.028843,-0.371154,-0.527301,-0.497909
8520,-0.521356,-0.599784,-0.897208,-0.736822
8521,-1.296303,1.532880,-0.607977,-1.334103


In [30]:
# Fill the missing encoded categories from the 3 nearest rows, all weighted equally.
imputer_cat = KNNImputer(missing_values=np.nan, weights='uniform', n_neighbors=3)
categorical_column_names = train_cat_df.columns
new_array = imputer_cat.fit_transform(train_cat_df)
train_cat_df = pd.DataFrame(new_array, columns=categorical_column_names)

# Standardise the encoded columns and wrap the result back into a labelled frame.
scaler = StandardScaler()
cat_array = scaler.fit_transform(train_cat_df)
train_cat_df = pd.DataFrame(data=cat_array, columns=categorical_column_names)
train_cat_df

Unnamed: 0,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658
1,-2.095286,1.236942,-0.429154,1.091569,1.002972
2,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658
3,-0.179795,1.236942,-1.756324,1.091569,-1.508289
4,1.735696,0.119565,-1.756324,1.091569,-0.252658
...,...,...,...,...,...
8518,-0.179795,-0.997813,-1.756324,1.091569,-0.252658
8519,-0.179795,1.236942,0.898016,-0.138882,-0.252658
8520,1.735696,0.119565,0.898016,-0.138882,-0.252658
8521,-0.179795,1.236942,-0.429154,1.091569,1.002972


In [33]:
# Put the processed numerical and categorical columns side by side.
# Note: this rebinds `train_df`, so the raw CSV contents are no longer available under that name.
processed_parts = [train_num_df, train_cat_df]
train_df = pd.concat(processed_parts, axis=1)

In [None]:
import joblib

# Folder that receives the pickled preprocessing objects.
save_path = r'D:\Projects_new\deleteFolder'
os.makedirs(save_path, exist_ok=True)

# Write to an explicit path instead of os.chdir()-ing into the folder: changing the
# working directory is a notebook-wide side effect that silently redirects every
# later relative path (for example the 'train.csv' / 'test.csv' reads).
# NOTE(review): `num_pipeline` is only created in the next cell and is not fitted
# anywhere in the visible cells before this dump - run/fit it first, or move this cell down.
joblib.dump(num_pipeline, os.path.join(save_path, 'num_pipeline.pkl'))

In [None]:
# Pipeline for the numerical columns: KNN imputation followed by standard scaling.
from sklearn.pipeline import Pipeline

numeric_steps = [
    ('imputer', KNNImputer(missing_values=np.nan, weights='uniform', n_neighbors=3)),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [None]:
# Reload the pickled numerical pipeline from its explicit location (no os.chdir, so the
# working directory used by other cells is left alone) and apply it.
num_pipeline = joblib.load(os.path.join(save_path, 'num_pipeline.pkl'))

# NOTE(review): `num_df` must already hold the raw numerical feature frame; no visible
# earlier cell defines it - confirm where it is meant to come from.
num_array = num_pipeline.transform(num_df)

# Label the output with the columns of the frame that was transformed. The previous
# code read them from `new_data`, a name that is never defined in the visible notebook.
num_df = pd.DataFrame(num_array, columns=num_df.columns)
num_df

# Categorical Features 

In [None]:
# Take just the categorical columns for the encoding cells that follow.
cat_df = train_df.loc[:, categorical_features]
cat_df

In [None]:
# Integer-encode each categorical column; the single encoder is refitted per column.
label_encoder = LabelEncoder()
cat_df = cat_df.apply(label_encoder.fit_transform)

In [None]:
# Label encoding turned the missing Outlet_Size values into the integer code 3, so
# turn that code back into NaN for the imputer.
# The result is assigned back explicitly: replace(..., inplace=True) on a selected
# column is chained assignment, which newer pandas deprecates and which leaves the
# frame unchanged under Copy-on-Write.
cat_df['Outlet_Size'] = cat_df['Outlet_Size'].replace(3, np.nan)

# Fill the missing encoded values from the 3 nearest rows, all weighted equally.
imputer_cat = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
new_array = imputer_cat.fit_transform(cat_df)
cat_df = pd.DataFrame(data=new_array, columns=cat_df.columns)

# Standardise the encoded columns.
scaler = StandardScaler()
cat_array = scaler.fit_transform(cat_df)
cat_df = pd.DataFrame(cat_array, columns=cat_df.columns)
cat_df

In [None]:

# Pipeline for the (already integer-encoded) categorical columns:
# KNN imputation followed by standard scaling.
categorical_steps = [
    ('imputer', KNNImputer(missing_values=np.nan, weights='uniform', n_neighbors=3)),
    ('scaler', StandardScaler()),
]
cat_pipeline = Pipeline(steps=categorical_steps)

In [None]:
# Fit the categorical pipeline and wrap its array output back into a labelled frame.
categorical_column_names = cat_df.columns
cat_array = cat_pipeline.fit_transform(cat_df)
cat_df = pd.DataFrame(data=cat_array, columns=categorical_column_names)
cat_df

# Creating Pipeline

In [None]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own pipeline; the output lists the numerical
# columns first, then the categorical ones.
column_pipelines = [
    ('num_pipeline', num_pipeline, numerical_features),
    ('cat_pipeline', cat_pipeline, categorical_features),
]
preprocessing = ColumnTransformer(transformers=column_pipelines)

In [None]:
# Numerical part is used as-is; the categorical part is integer-encoded column by column.
num_df = input_feature_train_df[numerical_features]
cat_raw = input_feature_train_df[categorical_features]
cat_df = cat_raw.apply(LabelEncoder().fit_transform)

# Restore NaN where Outlet_Size was missing so the imputer can fill it. The value is
# assigned back (chained replace(..., inplace=True) is deprecated in newer pandas and
# has no effect under Copy-on-Write) and the mask comes from the raw column, so it
# does not depend on the integer code (3) the encoder assigned to the missing value.
cat_df['Outlet_Size'] = cat_df['Outlet_Size'].where(cat_raw['Outlet_Size'].notna())

In [None]:
# Reassemble the training features: numerical columns first, encoded categoricals after.
encoded_parts = [num_df, cat_df]
input_feature_train_df = pd.concat(encoded_parts, axis=1)
input_feature_train_df

In [None]:
# Fit the column transformer on the training features and label its array output.
feature_names = input_feature_train_df.columns
array = preprocessing.fit_transform(input_feature_train_df)
df = pd.DataFrame(data=array, columns=feature_names)
df

In [58]:
# Start again from the raw CSV files.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Column the models are trained to predict.
target_column_name = 'Item_Outlet_Sales'

# Separate the label from the predictors, for train and test alike.
target_feature_train_df = train_df[target_column_name]
input_feature_train_df = train_df.drop(columns=[target_column_name])

target_feature_test_df = test_df[target_column_name]
input_feature_test_df = test_df.drop(columns=[target_column_name])


# function to get numerical and categorical columns
def Numerical_categorical_column(dataframe):
    '''
    Split the columns of ``dataframe`` into numerical and categorical names.

    A column is numerical when pandas reports a numeric dtype for it
    (integers, floats, booleans); every other column (object, string,
    category, datetime, ...) is treated as categorical.  Checking for a
    numeric dtype, rather than "anything that is not object", keeps
    category and pandas string columns out of the numerical list.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    numerical_columns, categorical_columns : list, list
        Column names in their original order.
    '''
    numerical_columns = []
    categorical_columns = []
    # Iterating over ``dtypes`` avoids indexing the frame once per column.
    for column_name, column_dtype in dataframe.dtypes.items():
        if pd.api.types.is_numeric_dtype(column_dtype):
            numerical_columns.append(column_name)
        else:
            categorical_columns.append(column_name)
    return numerical_columns, categorical_columns



# Clean-up and feature engineering, applied identically to the train and test features
# by looping over both frames instead of repeating every statement twice.
OUTLET_AGE_REFERENCE_YEAR = 2013
fat_content_aliases = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}

for features in (input_feature_train_df, input_feature_test_df):
    # Collapse the alternate spellings of the fat-content labels.
    features['Item_Fat_Content'] = features['Item_Fat_Content'].replace(fat_content_aliases)

    # Keep only the two-letter identifier prefix. The .str accessor leaves a missing
    # identifier as NaN, whereas slicing inside apply() would raise on it.
    features['Item_Identifier'] = features['Item_Identifier'].str[:2]

    # Replace the establishment year with the outlet's age, reading the year from the
    # frame being modified (the test branch used to read it from test_df instead).
    features['Outlet_age'] = OUTLET_AGE_REFERENCE_YEAR - features['Outlet_Establishment_Year']

    # Non-consumable products ('NC') still carry a fat-content label; overwrite it.
    features.loc[features['Item_Identifier'] == 'NC', 'Item_Fat_Content'] = 'Non Edible'

    # Drop the year now that the age exists, plus the columns left out of the model for now.
    features.drop(columns=['Outlet_Establishment_Year', 'Item_Type', 'Outlet_Identifier'], inplace=True)

numerical_features, categorical_features = Numerical_categorical_column(input_feature_train_df)


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Pipeline for the numerical columns: KNN imputation followed by standard scaling.
numeric_steps = [
    ('imputer', KNNImputer(missing_values=np.nan, weights='uniform', n_neighbors=3)),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# The integer-encoded categorical columns go through the same two steps.
categorical_steps = [
    ('imputer', KNNImputer(missing_values=np.nan, weights='uniform', n_neighbors=3)),
    ('scaler', StandardScaler()),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline (numerical output columns come first).
column_pipelines = [
    ('num_pipeline', num_pipeline, numerical_features),
    ('cat_pipeline', cat_pipeline, categorical_features),
]
preprocessing = ColumnTransformer(transformers=column_pipelines)

def encode_with_reference_categories(frame, reference, columns):
    """Integer-encode ``columns`` of ``frame`` using the categories found in ``reference``.

    Each column's code is the position of the value in the sorted list of distinct
    non-missing values of ``reference[column]``. Missing values, and values that do
    not occur in ``reference``, become NaN so a later imputer can fill them.

    Returns a new float DataFrame with the index of ``frame`` and the given columns.
    """
    encoded = {}
    for column in columns:
        categories = sorted(reference[column].dropna().unique())
        codes = pd.Categorical(frame[column], categories=categories).codes
        # pandas marks "not one of the categories" (including NaN) with -1.
        encoded[column] = np.where(codes >= 0, codes, np.nan)
    return pd.DataFrame(encoded, index=frame.index, columns=list(columns))


train_num_df = input_feature_train_df[numerical_features]
test_num_df = input_feature_test_df[numerical_features]

# Both frames are encoded against the TRAINING categories. Fitting a separate encoder
# on the test frame (as before) gives the same label different integers whenever the
# two frames do not contain exactly the same set of categories.
# Missing values come out as NaN directly, so there is no need to patch a magic
# "missing" code afterwards with a chained replace(..., inplace=True), which newer
# pandas deprecates and which has no effect under Copy-on-Write.
train_cat_df = encode_with_reference_categories(
    input_feature_train_df, input_feature_train_df, categorical_features)
test_cat_df = encode_with_reference_categories(
    input_feature_test_df, input_feature_train_df, categorical_features)

# Reassemble: numerical columns first, encoded categoricals after.
input_feature_train_df = pd.concat([train_num_df, train_cat_df], axis=1)
input_feature_test_df = pd.concat([test_num_df, test_cat_df], axis=1)


# Fit the preprocessing on the training features only ...
train_feature_names = input_feature_train_df.columns
train_array = preprocessing.fit_transform(input_feature_train_df)
input_feature_train_df = pd.DataFrame(data=train_array, columns=train_feature_names)

# ... then reuse the fitted transformer, unchanged, on the test features.
test_feature_names = input_feature_test_df.columns
test_array = preprocessing.transform(input_feature_test_df)
input_feature_test_df = pd.DataFrame(data=test_array, columns=test_feature_names)
input_feature_test_df


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,1.798914,-1.135138,-0.532035,-0.139541,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658
1,-1.047131,-0.536960,-0.861920,-1.095190,-0.179795,1.236942,0.898016,-0.138882,-0.252658
2,0.393036,0.648183,1.618094,-0.020085,1.735696,0.119565,-1.756324,1.091569,-1.508289
3,-1.272300,-0.983503,0.225484,-1.095190,-0.179795,-0.997813,0.898016,-0.138882,-0.252658
4,0.186917,1.016910,1.497272,1.532846,-0.179795,1.236942,-0.429154,1.091569,2.258603
...,...,...,...,...,...,...,...,...,...
5676,-0.544215,-1.020172,0.005181,0.099372,-0.179795,1.236942,0.898016,-1.369334,-0.252658
5677,-1.207149,1.489663,0.452086,-1.334103,-0.179795,1.236942,-0.429154,1.091569,1.002972
5678,-0.658514,0.143358,-0.357287,-0.497909,1.735696,0.119565,0.898016,-0.138882,-0.252658
5679,0.553055,-1.281758,1.182389,-1.095190,-0.179795,1.236942,0.898016,-0.138882,-0.252658


In [59]:
input_feature_train_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,-0.818533,-0.970732,1.747454,-0.139541,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658
1,-1.591194,-0.908111,-1.489023,-1.334103,-2.095286,1.236942,-0.429154,1.091569,1.002972
2,1.055970,-0.956917,0.010040,-0.139541,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658
3,1.444587,-1.281758,0.660050,-0.020085,-0.179795,1.236942,-1.756324,1.091569,-1.508289
4,-0.903114,-1.281758,-1.399220,1.293934,1.735696,0.119565,-1.756324,1.091569,-0.252658
...,...,...,...,...,...,...,...,...,...
8518,-1.375169,-0.181193,1.180783,1.293934,-0.179795,-0.997813,-1.756324,1.091569,-0.252658
8519,-1.028843,-0.371154,-0.527301,-0.497909,-0.179795,1.236942,0.898016,-0.138882,-0.252658
8520,-0.521356,-0.599784,-0.897208,-0.736822,1.735696,0.119565,0.898016,-0.138882,-0.252658
8521,-1.296303,1.532880,-0.607977,-1.334103,-0.179795,1.236942,-0.429154,1.091569,1.002972


In [62]:
# Directory holding the transformed train/test CSV files (presumably written by an
# earlier data-transformation run - confirm). It is spelled once so that pointing the
# notebook at a different run means editing a single line.
TRANSFORMED_DIR = r"D:\Projects_new\Stores_Sales_Prediction\sales\artifact\data_transformation\2022-07-25-10-18-46\preprocessed_files"

# Note: from here on `test_array` / `train_array` are DataFrames read from disk.
test_array = pd.read_csv(os.path.join(TRANSFORMED_DIR, "test_transformed", "test_arry_df.csv"))
train_array = pd.read_csv(os.path.join(TRANSFORMED_DIR, "train_transformed", "train_array_df.csv"))

In [63]:
# Inspect the transformed training table loaded from disk (features plus Item_Outlet_Sales).
train_array


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,-0.818533,-0.970732,1.747454,-0.139541,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658,3735.1380
1,-1.591194,-0.908111,-1.489023,-1.334103,-2.095286,1.236942,-0.429154,1.091569,1.002972,443.4228
2,1.055970,-0.956917,0.010040,-0.139541,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658,2097.2700
3,1.444587,-1.281758,0.660050,-0.020085,-0.179795,1.236942,-1.756324,1.091569,-1.508289,732.3800
4,-0.903114,-1.281758,-1.399220,1.293934,1.735696,0.119565,-1.756324,1.091569,-0.252658,994.7052
...,...,...,...,...,...,...,...,...,...,...
8518,-1.375169,-0.181193,1.180783,1.293934,-0.179795,-0.997813,-1.756324,1.091569,-0.252658,2778.3834
8519,-1.028843,-0.371154,-0.527301,-0.497909,-0.179795,1.236942,0.898016,-0.138882,-0.252658,549.2850
8520,-0.521356,-0.599784,-0.897208,-0.736822,1.735696,0.119565,0.898016,-0.138882,-0.252658,1193.1136
8521,-1.296303,1.532880,-0.607977,-1.334103,-0.179795,1.236942,-0.429154,1.091569,1.002972,1845.5976


In [64]:
# Inspect the transformed test table loaded from disk; every Item_Outlet_Sales value shown is 0.0.
test_array

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,1.798914,-1.135138,-0.532035,-0.139541,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658,0.0
1,-1.047131,-0.536960,-0.861920,-1.095190,-0.179795,1.236942,0.898016,-0.138882,-0.252658,0.0
2,0.393036,0.648183,1.618094,-0.020085,1.735696,0.119565,-1.756324,1.091569,-1.508289,0.0
3,-1.272300,-0.983503,0.225484,-1.095190,-0.179795,-0.997813,0.898016,-0.138882,-0.252658,0.0
4,0.186917,1.016910,1.497272,1.532846,-0.179795,1.236942,-0.429154,1.091569,2.258603,0.0
...,...,...,...,...,...,...,...,...,...,...
5676,-0.544215,-1.020172,0.005181,0.099372,-0.179795,1.236942,0.898016,-1.369334,-0.252658,0.0
5677,-1.207149,1.489663,0.452086,-1.334103,-0.179795,1.236942,-0.429154,1.091569,1.002972,0.0
5678,-0.658514,0.143358,-0.357287,-0.497909,1.735696,0.119565,0.898016,-0.138882,-0.252658,0.0
5679,0.553055,-1.281758,1.182389,-1.095190,-0.179795,1.236942,0.898016,-0.138882,-0.252658,0.0


In [82]:
from sklearn.model_selection import train_test_split

# Target is the sales column; the features are everything else.
y = train_array['Item_Outlet_Sales']
X = train_array.drop(columns=['Item_Outlet_Sales'])

# 80/20 split of the labelled data with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [71]:
# Features of the external test table, kept under their own names.
# Its Item_Outlet_Sales column is 0.0 in every row shown above, so it must not replace
# the labelled hold-out split: assigning it to X_test / y_test (as this cell used to)
# makes the scoring cell below compare predictions against zeros when the notebook is
# run from top to bottom.
X_unlabeled_test = test_array.drop(columns=['Item_Outlet_Sales'])
y_unlabeled_test = test_array['Item_Outlet_Sales']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
5676    0.0
5677    0.0
5678    0.0
5679    0.0
5680    0.0
Name: Item_Outlet_Sales, Length: 5681, dtype: float64

In [83]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Random forest with the hyper-parameters chosen for this experiment.
# random_state is fixed so that re-running the cell reproduces the same forest and
# therefore the same scores; without it every run reports slightly different numbers.
model = RandomForestRegressor(
    criterion='absolute_error',
    max_depth=7,
    max_features=0.79,
    max_samples=0.7,
    n_estimators=75,
    random_state=42,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# R^2 on the held-out split.
r2 = r2_score(y_test, y_pred)
print(f'r2 score : {r2}')

# Root of the mean squared error, in the units of the target.
mse = mean_squared_error(y_test, y_pred)
print("RMSE: %.2f" % (mse**(1/2.0)))

r2 score : 0.6130266889518863
RMSE: 1025.56


In [86]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space for the random forest.
pramrf = {'n_estimators': list(range(10, 100, 5)),
          'criterion': ["squared_error", "absolute_error"],
          'max_depth': list(range(3, 10, 1)),
          'max_features': [i / 100.0 for i in range(70, 100, 3)],
          'max_samples': [i / 100.0 for i in range(70, 100, 5)]
          }

# The full grid is 18 * 2 * 7 * 10 * 6 = 15,120 combinations, i.e. 75,600 fits with
# 5-fold CV; the exhaustive GridSearchCV run had to be interrupted. Sample a fixed
# number of combinations instead: same search space, bounded cost, and a seed so the
# sampled candidates (and the forests themselves) are reproducible.
N_SEARCH_ITERATIONS = 50  # raise for a more thorough search
gridrf = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    pramrf,
    n_iter=N_SEARCH_ITERATIONS,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
gridrf.fit(X_train, y_train)
gridrf.best_params_

KeyboardInterrupt: 

In [100]:
def select_best_model(candidates, base_accuracy):
    """Pick the best-scoring candidate that beats the baseline.

    Parameters
    ----------
    candidates : iterable of (model_name, model, score)
        Models to compare, in order of preference for breaking ties.
    base_accuracy : number
        Score a model must strictly exceed to be eligible.

    Returns
    -------
    (model, model_name) of the eligible candidate with the highest score, or
    ``None`` when no candidate exceeds ``base_accuracy``.  When several eligible
    candidates share the highest score the one listed first wins; the earlier
    if/elif chain used strict ``>`` both ways and so reported "no model" for a tie.
    """
    best = None
    for model_name, model, score in candidates:
        if score > base_accuracy and (best is None or score > best[2]):
            best = (model_name, model, score)
    if best is None:
        return None
    return best[1], best[0]


# Placeholder scores used to exercise the selection logic.
xgb_r2, base_accuracy, rf_r2 = 30, 50, 10

selection = select_best_model(
    [('XGB', 'xgb_model', xgb_r2), ('RandomForest', 'rf_model', rf_r2)],
    base_accuracy,
)

if selection is None:
    print( f"None of model has base accuracy more than {base_accuracy}")
else:
    best_model, model_name = selection
    print(f'best Model is {model_name} with parameters {best_model} ')

None of model has base accuracy more than 50
