In [None]:
import numpy as np
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
import dill

# NOTE(security): dill, like pickle, can execute arbitrary code while
# deserialising. Only load 'df_sentiments.pkd' when it comes from a trusted
# source.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open until the object is garbage collected.
with open('df_sentiments.pkd', 'rb') as sentiments_file:
    df = dill.load(sentiments_file)

### <font color='green'> Machine Learning Model </font> 


In this notebook, a [Scikit Learn Random Forest Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) is tuned to predict the lease status of each channel, using the following features: 

  - User role (applicant or resident),
  - Response time,
  - Conversation length,
  - Message length,
  - Average sentiment score for a text,
  - Frequency of sentiment score of all sentences within each message in 5 classes: very negative, negative, neutral, positive, very positive. 
    
The data set includes a total of 180K observations and 10 features. The data set is divided into train and test (90%:10%) sets manually in order to ensure that messages corresponding to the same channel are kept together. The categorical features (the user status) are preprocessed and transformed via the [One Hot Encoder algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html). 

The hyperparameters are first narrowed down using [Randomized Search Cross Validation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) and then further optimized via [Grid Search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html), both provided by the Python scikit-learn library. 

In [2]:
# Columns of `df` that are fed to the classifier as model inputs.
features_to_use = [
    'is_applicant',
    'response_time',
    'convo_length',
    'convo_num',
    'num_sentence',
    'num_words',
    'expected_sentiment_text',
    'sigma_sentiment_text',
    'freq-very_negative',
    'freq-negative',
    'freq-neutral',
    'freq-positive',
    'freq-very_positive',
]
X = df[features_to_use]

# Binary target: both "leased" outcomes become 1 and 'No Lease' becomes 0.
# `replace` leaves any value that is not a key of the mapping untouched.
lease_label_to_int = {'Leased': 1, 'Leased Other Room': 1, 'No Lease': 0}
y = df['is_lease'].replace(lease_label_to_int)

In [None]:
# Divide the data into train and test sets by row position: the first
# TRAIN_ROWS rows are used for training, the remaining rows for testing.
# NOTE(review): this split is repeated verbatim in a later cell of the
# notebook; one of the two cells is redundant.
TRAIN_ROWS = 164645  # hard-coded row position of the train/test boundary

# Fail early if the boundary does not fall inside the data; otherwise the
# test set would silently be empty and the evaluation would fail much later.
assert len(X) == len(y), "features and target must have the same number of rows"
assert 0 < TRAIN_ROWS < len(X), "split boundary must fall strictly inside the data set"

# `.iloc` makes it explicit that the slices are positional, not label-based.
X_train = X.iloc[:TRAIN_ROWS]
y_train = y.iloc[:TRAIN_ROWS]

X_test = X.iloc[TRAIN_ROWS:]
y_test = y.iloc[TRAIN_ROWS:]

In [None]:
## Transforming the categorical features

transformer_name = 'ohe_on_all_categorical_features'

# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# (introduced in 1.2) and removed the old name in 1.4. Try the new spelling
# first and fall back to the old one, so the cell runs on either side of the
# rename. An unknown keyword raises TypeError at construction time.
try:
    transformer = OneHotEncoder(sparse_output=False)
except TypeError:
    transformer = OneHotEncoder(sparse=False)

columns_to_encode = ['is_applicant']

# One-hot encode only `columns_to_encode`; all remaining columns are passed
# through unchanged.
ohe_final = ColumnTransformer([
            (transformer_name, transformer, columns_to_encode)],
            remainder='passthrough')

# Smoke check that the transformer runs on the feature frame. The trailing
# `;` suppresses the notebook display of the returned array.
ohe_final.fit_transform(X);

In [None]:
# Dividing the data set into train and test (90%:10%),
# keeping messages of the same channel in the same set.

# Row position of the train/test boundary (hard-coded for this data set).
TRAIN_ROWS = 164645

# Fail early if the boundary does not fall inside the data; otherwise the
# test set would silently be empty and the evaluation would fail much later.
assert len(X) == len(y), "features and target must have the same number of rows"
assert 0 < TRAIN_ROWS < len(X), "split boundary must fall strictly inside the data set"

# `.iloc` makes it explicit that the slices are positional, not label-based.
X_train = X.iloc[:TRAIN_ROWS]
y_train = y.iloc[:TRAIN_ROWS]

X_test = X.iloc[TRAIN_ROWS:]
y_test = y.iloc[TRAIN_ROWS:]

In [None]:
## set up evaluation functions for predictions 

def model_evaluation(model, X, y_true):
    """Score a fitted classifier on ``X`` against the true labels ``y_true``.

    Accuracy, precision and recall are computed from ``model.predict(X)``.
    The area under the precision-recall curve is computed from row 1 of the
    transposed ``model.predict_proba(X)`` output (i.e. the second class
    column; presumably the positive class).

    Parameters
    ----------
    model : object exposing ``predict`` and ``predict_proba``.
    X : features passed unchanged to both model methods.
    y_true : ground-truth labels passed to the ``metrics`` functions.

    Returns
    -------
    dict
        Metric name -> score rounded to 4 decimals, inserted in the order
        'accuracy', 'precision', 'recall',
        'area under precision-recall curve'.
    """
    def rounded(value):
        # Every reported score uses the same 4-decimal rounding.
        return round(value, 4)

    predicted_labels = model.predict(X)
    scores = {
        'accuracy': rounded(metrics.accuracy_score(y_true, predicted_labels)),
        'precision': rounded(metrics.precision_score(y_true, predicted_labels)),
        'recall': rounded(metrics.recall_score(y_true, predicted_labels)),
    }

    second_class_probs = model.predict_proba(X).T[1]
    precision_pts, recall_pts, _ = metrics.precision_recall_curve(
        y_true, second_class_probs)
    # auc takes x (recall) first and y (precision) second.
    scores['area under precision-recall curve'] = rounded(
        metrics.auc(recall_pts, precision_pts))
    return scores

def print_model_evaluation(model_name, scores):
    """Print a header for ``model_name`` followed by one line per score.

    Parameters
    ----------
    model_name : label used in the header line.
    scores : mapping of metric name -> value, printed in iteration order
        as ``Test <metric>: <value>``.
    """
    # The header literal ends in a newline, so a blank line separates it
    # from the metric lines once the pieces are joined with '\n'.
    report_lines = ['{} evaluation \n'.format(model_name)]
    report_lines.extend('Test {}: {}'.format(metric, score)
                        for metric, score in scores.items())
    print('\n'.join(report_lines))

In [None]:
## Use randomized search CV to narrow down promising hyperparameter ranges
## before the finer grid search.

# One-hot encoding followed by the forest. Seeding the forest makes repeated
# runs of the search comparable (the grid-search cell seeds it the same way).
rf_pipe = Pipeline([('ohe', ohe_final),
                    ('rf', RandomForestClassifier(random_state=42))])

# The search now tunes the pipeline, so every name carries the `rf__` prefix
# that routes the parameter to the 'rf' step.
random_grid = {'rf__n_estimators': [int(x) for x in np.linspace(200, 2000, 10)],
               # 'auto' was dropped: for classifiers it was an alias of
               # 'sqrt', and scikit-learn >= 1.3 rejects it. Add 'log2' or
               # None here to actually vary this parameter.
               'rf__max_features': ['sqrt'],
               'rf__max_depth': range(1, 10),
               'rf__min_samples_split': [2, 5, 10],
               'rf__min_samples_leaf': [1, 2, 4],
               'rf__bootstrap': [True, False]}


# Search over the full pipeline (previously a bare RandomForestClassifier was
# passed and `rf_pipe` was left unused), so the encoding step is fitted
# inside each CV fold together with the forest.
rf_random = RandomizedSearchCV(estimator = rf_pipe,
                               param_distributions = random_grid,
                               n_iter = 100,
                               cv = 3,
                               verbose=2,
                               random_state=42,
                               n_jobs = 2)

In [None]:
# Pipeline: one-hot encoding followed by a seeded random forest.
# `n_estimators=1000` is only the constructor value; the grid below sets
# `rf__n_estimators` for every candidate that is fitted.
rf_pipe = Pipeline(steps=[
    ('ohe', ohe_final),
    ('rf', RandomForestClassifier(n_estimators=1000, random_state=42)),
])

# Exhaustive grid over tree depth, forest size and bootstrapping. The `rf__`
# prefix addresses the 'rf' step of the pipeline.
rf_param_grid = {
    'rf__max_depth': range(1, 10),
    'rf__n_estimators': np.linspace(200, 2000, 10).astype(int).tolist(),
    'rf__bootstrap': [True, False],
}
rf_gs = GridSearchCV(rf_pipe, param_grid=rf_param_grid, cv=5)

# Fit on the training rows and report the winning parameter combination.
rf_gs.fit(X_train, y_train)
print("The best hyperparameter value is: ", rf_gs.best_params_)

# Evaluate the fitted search object on the held-out test rows.
rf_gs_scores = model_evaluation(rf_gs, X_test, y_test)
print_model_evaluation('Random forest', rf_gs_scores)