# Read in Data and Preprocess

In [None]:
# Initial imports.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Load the raw input dataset into a DataFrame.
# NOTE(review): the path has no file name before ".csv" — it looks like an
# unfilled placeholder; confirm the real file name.
df = pd.read_csv(filepath_or_buffer='Resources/.csv')

# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# Preprocess

In [None]:
# Encode data
# NOTE(review): `cat` is expected to hold the list of categorical column
# names; it is not defined in the visible cells — define it before running.

# Create a OneHotEncoder instance that returns a dense array.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in a later release.
enc = OneHotEncoder(sparse_output=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
# Reuse df's index so the encoded rows stay aligned with their source rows
# when the two frames are joined on index.
encode_df = pd.DataFrame(enc.fit_transform(df[cat]), index=df.index)

# Add the encoded variable names to the DataFrame.
# `get_feature_names_out` supersedes the removed `get_feature_names`.
encode_df.columns = enc.get_feature_names_out(cat)
encode_df.head()

# or

# Fit the encoder on a single column and produce the encoded DataFrame.
# The column values are reshaped into one column of rows before encoding;
# df's index is reused so the encoded rows stay aligned with their source
# rows when the two frames are joined on index.
encode_df = pd.DataFrame(
    enc.fit_transform(df.Country.values.reshape(-1, 1)),
    index=df.index,
)

# Rename encoded columns.
# `get_feature_names_out` supersedes the removed `get_feature_names`.
encode_df.columns = enc.get_feature_names_out([''])
encode_df.head()

# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df, left_index=True, right_index=True)

# Name the axis explicitly: pandas 2.0 no longer accepts the positional
# `axis` argument that `df.drop(cat, 1)` relied on.
df = df.drop(columns=cat)
df.head()

# Split the Data into Training and Testing

In [None]:
# Separate the predictors from the label column.
# NOTE(review): the empty-string column name looks like an unfilled
# placeholder for the target column — confirm before running.

# Features: a copy of the data with the target column removed.
X = df.copy().drop("", axis=1)
X.head()

# Target: the label column on its own.
y = df[""]
y[:5]

In [None]:
# Split the data and fit the feature scaler

# Hold out a test set, stratifying on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Fit a StandardScaler on the training features only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Model

In [None]:
# Scale the features, then train and score a random forest

# Apply the already-fitted scaler to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# Build the random forest classifier and fit it on the scaled training data
# (fit returns the estimator itself, so the call can be chained).
rf_model = RandomForestClassifier(n_estimators=128, random_state=78).fit(
    X_train_scaled, y_train
)

# Score the fitted model on the held-out test split.
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

# or

# Alternative model: a balanced random forest fitted on the same scaled
# training data.
# NOTE(review): BalancedRandomForestClassifier is not imported in the visible
# cells; it is presumably imbalanced-learn's class (imblearn.ensemble) —
# confirm and import it before running this cell.
brf_model = BalancedRandomForestClassifier(n_estimators=500, random_state=1)
brf_model = brf_model.fit(X_train_scaled, y_train)
brf_model

In [None]:
# Calculate the balanced accuracy score

# Predict the test-set labels with the balanced random forest, then score
# those predictions against the true labels.
predictions = brf_model.predict(X_test_scaled)
acc_score = balanced_accuracy_score(y_test, predictions)
acc_score

In [None]:
# Build the confusion matrix of true test labels versus model predictions
# and show it as the cell output.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm

In [None]:
# Print the imbalanced classification report for the test predictions.
# NOTE(review): classification_report_imbalanced is not imported in the
# visible cells; it is presumably imbalanced-learn's function
# (imblearn.metrics) — confirm and import it before running this cell.
print(classification_report_imbalanced(y_true=y_test, y_pred=predictions))

In [None]:
# Displaying results

# Wrap the raw confusion-matrix array in a DataFrame for display. `cm_df` was
# otherwise never defined, so this cell raised a NameError. Rows are the true
# labels and columns the predicted labels, so the axes are named to match.
cm_df = pd.DataFrame(cm).rename_axis(index="Actual", columns="Predicted")

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# Read the per-feature importance scores from the fitted random forest
# (rf_model) and show them as the cell output.
importances = rf_model.feature_importances_
importances

In [None]:
# Pair each importance score with its feature name and list the pairs from
# most to least important.
sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)