# Prediciting Post Reaction using Random Forest

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import randint as sp_randint
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

### Scikit Learn

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Load Dataset

In [3]:
df_train = pd.read_csv('postSenti_train.tsv', index_col='Unnamed: 0', sep='\t')
df_test = pd.read_csv('postSenti_test.tsv', index_col='Unnamed: 0', sep='\t') 

In [4]:
# Separate features and target for training dataset
X_train = df_train.drop(['sentiment_class'], axis=1)
y_train = df_train['sentiment_class']

In [5]:
# Separate features and target for testing dataset
X_test = df_test.drop(['sentiment_class'], axis=1)
y_test = df_test['sentiment_class']

## Random Forest Classification

### Feature selection using RandomForestClassifier--feature_importances

In [6]:
# Instantiate RandomForestClassifier
rf = RandomForestClassifier(random_state=35).fit(X_train, y_train)
rf_feat_imp = rf.feature_importances_
# Feature importance
feature_importance = pd.DataFrame(rf_feat_imp, index=X_train.columns, columns=["Feat_Imp"])

In [7]:
#Sort in descending order to get Most Important to Least Important
feature_importance.sort_values("Feat_Imp",ascending=False, inplace=True)
feature_importance.reset_index(inplace=True)

In [8]:
# Top 5 Feature Importance
top5_feature = feature_importance.iloc[:5]
predictor = list(top5_feature['index'])
predictor

['Topic 0', 'hour_day', 'Topic 1', 'Topic 2', 'Topic 7']

### Hyperparameter Optimization using GridSearch

In [9]:
param_grid = {'min_samples_split':[2,5,10],
              'criterion':['gini','entropy'],
              'min_samples_leaf':[1,2,4],
              'n_estimators':[200,400,600,800,1000],
              'max_depth':[20,40,60,80,100,None]}

In [10]:
# Instantiate Random Forest Classifier
clf = RandomForestClassifier()
# Instantiate RandomizedSearchCV
grid_search = RandomizedSearchCV(clf, param_distributions=param_grid, cv=5, random_state=0)

In [11]:
# Fit X_train
grid_search.fit(X_train[predictor], y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [12]:
print("Best estimator:")
print(grid_search.best_estimator_)
print("Best parameters:")
print(grid_search.best_params_)
print("Best score:")
print(grid_search.best_score_)

Best estimator:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=80, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Best parameters:
{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 80, 'criterion': 'entropy'}
Best score:
0.5441741357234315


### Build Random Forest Model

In [13]:
# Instantiate SVC
rf_clf = RandomForestClassifier(n_estimators=400,criterion='entropy',min_samples_leaf=4,min_samples_split=10, max_depth=80)
# Fit the training dataset on the SVC
rf_clf.fit(X_train[predictor], y_train)
# Predict the labels on validation dataset
y_pred = rf_clf.predict(X_test[predictor])

### Evaluate Random Forest Model

In [14]:
# Get Metrics
accuracy = accuracy_score(y_pred, y_test)*100
# Store metrics in dictionary
score_dict = {'Random Forest': pd.Series([accuracy], index=['AccuracyScore'])}
# Transform dictionary to df
df_metrics = pd.DataFrame(score_dict)
df_metrics

Unnamed: 0,Random Forest
AccuracyScore,62.755102


## Cross Validation

In [15]:
print('Mean Cross validated score -> ', np.mean(cross_val_score(rf_clf, X_train[predictor], y_train, cv=5)*100))

Mean Cross validated score ->  54.41050769021369
