# RANDOM FOREST

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
df = pd.read_csv('high_diamond_ranked_10min.csv')
df.head()


Unnamed: 0,gameId,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,...,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalMinionsKilled,redTotalJungleMinionsKilled,redGoldDiff,redExperienceDiff,redCSPerMin,redGoldPerMin
0,4519157822,0,28,2,1,9,6,11,0,0,...,0,16567,6.8,17047,197,55,-643,8,19.7,1656.7
1,4523371949,0,12,1,0,5,5,5,0,0,...,1,17620,6.8,17438,240,52,2908,1173,24.0,1762.0
2,4521474530,0,15,0,0,7,11,4,1,1,...,0,17285,6.8,17254,203,28,1172,1033,20.3,1728.5
3,4524384067,0,43,1,0,4,5,5,1,0,...,0,16478,7.0,17961,235,47,1321,7,23.5,1647.8
4,4436033771,0,75,4,0,6,6,6,0,0,...,0,17404,7.0,18313,225,67,1004,-230,22.5,1740.4


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# prepare the data
df_clean = df.copy()
X = df_clean.drop('blueWins', axis=1)
y = df['blueWins']
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# build the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# predict on test set
y_pred = rf.predict(X_test)

# evaluate the model
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)


Accuracy: 0.72165991902834


# Increasing Accuracy of the model 

### 1. Adjusting the number of trees and depth:
 Random Forest models perform better with more trees, up to a certain point. By default, the trees in a Random Forest can grow indefinitely. However, deeper trees can lead to overfitting.

In [41]:
max_depths = [5, 10, 15, 20]

for max_depth in max_depths:
    # build the model with the current max_depth
    rf = RandomForestClassifier(n_estimators=100, max_depth=max_depth, random_state=42)
    rf.fit(X_train, y_train)

    # predict on test set
    y_pred = rf.predict(X_test)

    # evaluate the model
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy with max_depth={max_depth}: {acc}")


Accuracy with max_depth=5: 0.7206477732793523
Accuracy with max_depth=10: 0.72165991902834
Accuracy with max_depth=15: 0.7206477732793523
Accuracy with max_depth=20: 0.7140688259109311


In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

n_estimators = [50, 100, 150, 200]

for n in n_estimators:
    # build the model with the current n_estimators
    rf = RandomForestClassifier(n_estimators=n, max_depth=5, random_state=42)
    rf.fit(X_train, y_train)

    # predict on test set
    y_pred = rf.predict(X_test)

    # evaluate the model
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy with n_estimators={n}: {acc}")

Accuracy with n_estimators=50: 0.72165991902834
Accuracy with n_estimators=100: 0.7206477732793523
Accuracy with n_estimators=150: 0.7191295546558705
Accuracy with n_estimators=200: 0.7206477732793523


### 2. Feature Selection -

a) Feature Importance: Random Forest models come with a built-in feature importance measure, which ranks the importance of each feature based on how much it contributes to the overall performance of the model. You can use this measure to select the top K most important features and exclude the rest.


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
# build the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# select features based on importance
selector = SelectFromModel(rf, threshold='mean', prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# build a new model with selected features
rf_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selected.fit(X_train_selected, y_train)

# predict on test set
y_pred = rf_selected.predict(X_test_selected)

# evaluate the model
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

Accuracy: 0.7110323886639676


b) SelectKBest Feature Selection -

we use the SelectKBest class from scikit-learn to select the top K features based on the F-score, a statistical test that measures the linear relationship between each feature and the target variable.

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif

# select features based on univariate test
selector = SelectKBest(score_func=f_classif, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# build a new model with selected features
rf_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selected.fit(X_train_selected, y_train)

# predict on test set
y_pred = rf_selected.predict(X_test_selected)

# evaluate the model
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

Accuracy: 0.6928137651821862


### 3. Recursive Feature Elimination (RFE)

In [31]:
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score



sel_rf=SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=1))

sel_rf.fit(X_train, y_train)
sel_rfe=RFE(RandomForestClassifier(n_estimators=100, random_state=1),n_features_to_select=20)
sel_rfe.fit(X_train, y_train)

In [35]:
def classifier_model(X_train, X_test, y_train, y_test, method, data):
    rf_clf=RandomForestClassifier(n_estimators=1000, random_state=1)
    rf_clf.fit(X_train, y_train)
    y_pred_rf=rf_clf.predict(X_test)
    score_rlf=accuracy_score(y_test, y_pred_rf)
    print("---Feature Selection method: {}---". format(method))
    print("---Checking Accuracy with {}---".format(data))
    print("The accuracy score of Random Forest:", score_rlf)
    

In [36]:
# Total features selected:
sel_rfe.get_support().sum()
#### Let's transform the data now;
X_train_rfe=sel_rfe.transform(X_train)
X_test_rfe=sel_rfe.transform(X_test)
classifier_model(X_train_rfe, X_test_rfe, y_train, y_test, "Recursive feature extraction with RF", "Reduced Features")

---Feature Selection method: Recursive feature extraction with RF---
---Checking Accuracy with Reduced Features---
The accuracy score of Random Forest: 0.715080971659919


### 4. Ensemble Methods using Voting Classifier:

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# define individual models
rf = RandomForestClassifier(random_state=42)
svc = SVC(probability=True, random_state=42)
lr = LogisticRegression(random_state=42)

# define the ensemble model using voting classifier
ensemble_model = VotingClassifier(
    estimators=[('rf', rf), ('svc', svc), ('lr', lr)], voting='soft')

# train the ensemble model
ensemble_model.fit(X_train, y_train)

# evaluate the ensemble model
ensemble_model.score(X_test, y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7272267206477733

### 5. Feature Engineering using PCA:

In [39]:
from sklearn.decomposition import PCA

# perform PCA to reduce the dimensionality of the data
pca = PCA(n_components=10, random_state=42)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# build the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# predict on test set
y_pred = rf.predict(X_test)

# evaluate the model
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)


Accuracy: 0.72165991902834
