Random forest trees.
Random Forest is a classifier that trains a number of decision trees on various subsets of the given dataset and combines their predictions (by majority vote for classification, or by averaging for regression) to improve predictive accuracy.
It is based on the concept of ensemble learning, which is a process of combining multiple classifiers to solve a complex problem and to improve the performance of the model.
A greater number of trees in the forest generally leads to higher accuracy and reduces the risk of overfitting.


In [29]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [30]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import ExcelWriter
from pandas import ExcelFile

# Load the 'whole' sheet of the farmer workbook into a DataFrame.
farmer = pd.read_excel('resources/datasets/data_farmer.xlsx', sheet_name='whole')

# Show the column names, then display the frame itself as the cell output.
print("Column headings:")
print(farmer.columns)
farmer

Column headings:
Index(['Name', 'Gender', 'Age', 'Most common search food', 'Date Bought',
       'Total Price', 'System will recommed'],
      dtype='object')


Unnamed: 0,Name,Gender,Age,Most common search food,Date Bought,Total Price,System will recommed
0,William,M,16,Fruits,2021-03-01,200,Apple
1,John,M,17,Fruits,2021-03-01,150,Potatos
2,Jack,M,19,Vitamin C Fruit,2021-03-01,220,Cantaloupe
3,Mary,F,21,Vitamin C Fruit,2021-03-01,300,Grapefruit
4,Sam,F,22,Fruits,2021-03-01,250,Avocado
5,Alex,F,25,Fruits,2021-03-01,110,Watermelon
6,Nick,M,25,Fruits,2021-04-01,450,Watermelon
7,Sharon,F,29,Vegetables,2021-04-01,300,Ginger
8,Micheal,M,30,High carbohydrates food,2021-04-01,450,Sweet Potatoes
9,Randy,M,38,High carbohydrates food,2021-04-01,302,Oats


In [31]:
def getAnalysis(x):
    """Map a food-category label to its id, returned as a string.

    The four known categories are encoded as '1' to '4'. Any other value
    (unknown labels, different casing, None, ...) maps to '5'.
    """
    category_ids = (
        ('Fruits', '1'),
        ('Vitamin C Fruit', '2'),
        ('Vegetables', '3'),
        ('High carbohydrates food', '4'),
    )
    # Compare with == in declaration order so the first matching label wins.
    for label, food_id in category_ids:
        if x == label:
            return food_id
    return '5'

farmer['foodid'] = farmer['Most common search food'].apply(getAnalysis)
# 'foodid' now holds the string id returned by getAnalysis for each row's food category.
farmer


Unnamed: 0,Name,Gender,Age,Most common search food,Date Bought,Total Price,System will recommed,foodid
0,William,M,16,Fruits,2021-03-01,200,Apple,1
1,John,M,17,Fruits,2021-03-01,150,Potatos,1
2,Jack,M,19,Vitamin C Fruit,2021-03-01,220,Cantaloupe,2
3,Mary,F,21,Vitamin C Fruit,2021-03-01,300,Grapefruit,2
4,Sam,F,22,Fruits,2021-03-01,250,Avocado,1
5,Alex,F,25,Fruits,2021-03-01,110,Watermelon,1
6,Nick,M,25,Fruits,2021-04-01,450,Watermelon,1
7,Sharon,F,29,Vegetables,2021-04-01,300,Ginger,3
8,Micheal,M,30,High carbohydrates food,2021-04-01,450,Sweet Potatoes,4
9,Randy,M,38,High carbohydrates food,2021-04-01,302,Oats,4


# Binning is a simple technique that groups different values into bins. Here we replace the numerical Age feature with a categorical age group.



In [32]:
# Bin Age into coarse groups; the first matching condition wins:
#   Age < 30 -> 3,  30 <= Age < 50 -> 2,  50 <= Age < 61 -> 1.
# Ages of 61 and above match no condition and are left as NaN.
farmer['Agegroup'] = np.select(
    [farmer.Age < 30, farmer.Age < 50, farmer.Age < 61],
    [3, 2, 1],
    default=np.nan,
)



In [33]:
# Display the frame to inspect the newly added Agegroup column.
farmer

Unnamed: 0,Name,Gender,Age,Most common search food,Date Bought,Total Price,System will recommed,foodid,Agegroup
0,William,M,16,Fruits,2021-03-01,200,Apple,1,3.0
1,John,M,17,Fruits,2021-03-01,150,Potatos,1,3.0
2,Jack,M,19,Vitamin C Fruit,2021-03-01,220,Cantaloupe,2,3.0
3,Mary,F,21,Vitamin C Fruit,2021-03-01,300,Grapefruit,2,3.0
4,Sam,F,22,Fruits,2021-03-01,250,Avocado,1,3.0
5,Alex,F,25,Fruits,2021-03-01,110,Watermelon,1,3.0
6,Nick,M,25,Fruits,2021-04-01,450,Watermelon,1,3.0
7,Sharon,F,29,Vegetables,2021-04-01,300,Ginger,3,3.0
8,Micheal,M,30,High carbohydrates food,2021-04-01,450,Sweet Potatoes,4,2.0
9,Randy,M,38,High carbohydrates food,2021-04-01,302,Oats,4,2.0


In [34]:
# Count the missing (null) values in each column; the result is shown as the cell output.
farmer.isnull().sum()

Name                       0
Gender                     0
Age                        0
Most common search food    0
Date Bought                0
Total Price                0
System will recommed       0
foodid                     0
Agegroup                   0
dtype: int64

In [35]:
# Total number of rows in the dataset (presumably one row per user -- confirm against the data source).
len(farmer) 

20

In [36]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
farmer.describe()

Unnamed: 0,Age,Total Price,Agegroup
count,20.0,20.0,20.0
mean,36.65,224.25,2.15
std,14.375692,112.930288,0.812728
min,16.0,70.0,1.0
25%,24.25,140.0,1.75
50%,38.5,210.0,2.0
75%,48.5,300.0,3.0
max,60.0,450.0,3.0


In [37]:
# Model inputs: drop the target and every column not used as a feature.
x = farmer.drop(
    [
        'System will recommed',
        'Total Price',
        'Date Bought',
        'Gender',
        'Name',
        'Most common search food',
        'Age',
    ],
    axis='columns',
)
# Model target: the value stored in the 'System will recommed' column.
output = farmer['System will recommed']

x.head()

Unnamed: 0,foodid,Agegroup
0,1,3.0
1,1,3.0
2,2,3.0
3,2,3.0
4,1,3.0


In [38]:
# Fit a decision tree on the full dataset, then predict for two hand-written rows.
model = DecisionTreeClassifier()
model.fit(x,output)
# Each input row follows x's column order: [foodid, Agegroup].
predictions = model.predict([[3,2],[2,3]])
predictions

array(['Broccoli', 'Cantaloupe'], dtype=object)

In [48]:
model = DecisionTreeClassifier()
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed so the split, and therefore the reported accuracy,
# is reproducible from run to run.
x_train, x_test, output_train, output_test = train_test_split(
    x, output, test_size=0.2, random_state=0
)
model.fit(x_train, output_train)
predictions = model.predict(x_test)
# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(output_test, predictions)
accuracy

0.0

To fit it, we will import the RandomForestClassifier class from the sklearn.ensemble library.

In [40]:
# Hold out 25% of the rows as a test set; the split is seeded so it is repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, output_train, output_test = train_test_split(
    x, output, test_size=0.25, random_state=0
)

# Feature scaling: the scaler is fitted on the training rows only, and the
# same fitted scaler is then applied to both the training and the test rows.
from sklearn.preprocessing import StandardScaler
st_x = StandardScaler()
st_x.fit(x_train)
x_train = st_x.transform(x_train)
x_test = st_x.transform(x_test)

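n_estimators= The required number of trees in the Random Forest. The default value was 10 in scikit-learn versions before 0.22 and is 100 from version 0.22 onward. We can choose any number, but more trees increase training time, so the value is a trade-off between accuracy and cost.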

criterion= The function used to measure the quality of a split. Here we have taken "entropy", which selects splits by information gain.

In [41]:
# Fit a random forest classifier (10 trees, entropy split criterion) to the training set
from sklearn.ensemble import RandomForestClassifier  
classifier= RandomForestClassifier(n_estimators= 10, criterion="entropy")  
classifier.fit(x_train, output_train)  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
# Predict a label for every row of the scaled test set with the fitted random forest.
output_pred= classifier.predict(x_test) 

In [43]:
# Display the predicted labels for the test rows.
output_pred

array(['Eggs', 'Watermelon', 'Sukuma wiki', 'Sweet Potatoes',
       'Sweet Potatoes'], dtype=object)

In [46]:
# Build the confusion matrix of true (rows) vs. predicted (columns) labels for the test set.
# No `labels` argument is passed, so rows/columns cover the sorted labels that
# appear in either output_test or output_pred.
from sklearn.metrics import confusion_matrix  
cm= confusion_matrix(output_test, output_pred) 

In [47]:
# Display the confusion matrix computed in the previous cell.
cm

array([[0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]])