In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, SGDRegressor
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pratikpattanaik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
class DataPrep():
    def __init__(self):
        self.trait_cat_dict = {
            'O': 'cOPN',
            'C': 'cCON',
            'E': 'cEXT',
            'A': 'cAGR',
            'N': 'cNEU',
            'OPN': 'cOPN',
            'CON': 'cCON',
            'EXT': 'cEXT',
            'AGR': 'cAGR',
            'NEU': 'cNEU',
            'Openness': 'cOPN',
            'Conscientiousness': 'cCON',
            'Extraversion': 'cEXT',
            'Agreeableness': 'cAGR',
            'Neuroticism': 'cNEU'
            }
        self.trait_score_dict = {
            'O': 'sOPN',
            'C': 'sCON',
            'E': 'sEXT',
            'A': 'sAGR',
            'N': 'sNEU',
            'OPN': 'sOPN',
            'CON': 'sCON',
            'EXT': 'sEXT',
            'AGR': 'sAGR',
            'NEU': 'sNEU',
            'Openness': 'sOPN',
            'Conscientiousness': 'sCON',
            'Extraversion': 'sEXT',
            'Agreeableness': 'sAGR',
            'Neuroticism': 'sNEU'
            }

    def prep_data(self, type, trait, regression=False, model_comparison=False):
        dict_traits = {'OPN':2000, 'CON':2000, 'EXT':2000, 'AGR':1800, 'NEU':2000}
        df_tweet = self.prep_status_data().head(dict_traits[trait])
        # df_essay = self.prep_essay_data()

        tfidf = TfidfVectorizer(stop_words='english', strip_accents='ascii')
            
            
#             print(X_temp)
#             X_temp = pd.DataFrame(X_temp)
#             print(X_temp.dtypes)
#             print(df_tweet['TWEET'].dtypes)
        X_temp = df_tweet['TWEET']
        X = tfidf.fit_transform(X_temp).todense()

#             X = df_tweet['TWEET']

        if regression:
            y_column = self.trait_score_dict[trait]
        else:
            y_column = self.trait_cat_dict[trait]
        y = df_tweet[y_column]

#         print(X)
        return X, y


    def prep_status_data(self):
        df = pd.read_csv('mypersonality_final.csv', encoding="ISO-8859-1")
#         df = df1.head(3000)
        df.rename(columns = {'STATUS':'TWEET'}, inplace = True)
        for i in range(0, len(df['TWEET'])):
            df_tweets = re.sub('[^a-zA-Z]', ' ', df['TWEET'][i])
            df_tweets = df_tweets.lower()
            df_tweets = df_tweets.split()
            ps = PorterStemmer()
            all_stopwords = stopwords.words('english')
            all_stopwords.remove('not')
            df_tweets = [ps.stem(word) for word in df_tweets if not word in set(all_stopwords)]
            df['TWEET'][i] = ' '.join(df_tweets)
            corpus.append(df_tweets)
        
#         df['TWEET'] = df.groupby(['#AUTHID'])['TWEET'].transform(lambda x : ' '.join(x))
        other_features_columns = [
                'NETWORKSIZE',
                'BETWEENNESS',
                'NBETWEENNESS',
                'DENSITY',
                'BROKERAGE',
                'NBROKERAGE',
                'TRANSITIVITY',
                'DATE',
                '#AUTHID'
            ]
        df.drop(other_features_columns, axis=1, inplace=True)
        df.drop_duplicates(inplace=True)
        df = self.convert_traits_to_boolean(df)
#         print(df)
        return df


    def convert_traits_to_boolean(self, df):
        trait_columns = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']
        d = {'y': True, 'n': False}

        for trait in trait_columns:
            df[trait] = df[trait].map(d)

        return df


    def load_data(self, filepath):
        return pd.read_csv(filepath, encoding="ISO-8859-1")

In [4]:
# rfr = RandomForestRegressor(bootstrap=True,
#          max_features='sqrt',
#          min_samples_leaf=1,
#          min_samples_split=2,
#          n_estimators= 200)

# tfidf = TfidfVectorizer(stop_words='english', strip_accents='ascii')

In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, SGDRegressor
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
# from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

class ModelEvaluator():
    def __init__(self, X, y, trait):
        self.X = X
        self.y = y
        self.trait = trait
        self.models_dict = {
            'RandomForestClassifier': RandomForestClassifier(criterion='gini', max_depth=4, max_features='auto', n_estimators=250),
        }
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.hyperparameters = {
        'RandomForestClassifier': {'max_features': 'sqrt', 'n_estimators': 200},
        }
        self.accuracy_scores = {'RandomForestClassifier':0, 'MultinomialNB':0, 'SVC':0, 'XGBClassifier':0}

    def tune_hyperparameters(self, model):
        traits = ['O', 'C', 'E', 'A', 'N']
        trait_best_params_dict = {}
        for trait in traits:
            if model == 'RandomForestRegressor':

                # Number of trees in random forest
                n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]
                # Number of features to consider at every split
                max_features = ['auto', 'sqrt']
                # Maximum number of levels in tree
                max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
                max_depth.append(None)
                # Minimum number of samples required to split a node
                min_samples_split = [2, 5, 10]
                # Minimum number of samples required at each leaf node
                min_samples_leaf = [1, 2, 4]
                # Method of selecting samples for training each tree
                bootstrap = [True, False]
                # Create the random grid
                random_grid = {'n_estimators': n_estimators,
                               'max_features': max_features,
                               # 'max_depth': max_depth,
                               # 'min_samples_split': min_samples_split,
                               # 'min_samples_leaf': min_samples_leaf,
                               # 'bootstrap': bootstrap
                               }


                # Use the random grid to search for best hyperparameters
                # First create the base model to tune
                rf = RandomForestRegressor()
                # Random search of parameters, using 3 fold cross validation,

                # search across 100 different combinations, and use all available cores
                # rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

                rf_GSCV = GridSearchCV(estimator=rf, param_grid=random_grid, cv=5)

                # Fit the random search model
                rf_GSCV.fit(self.X, self.y)
                print('Personality ' + trait + ' best params: ' )
                for k, v in rf_GSCV.best_params_:
                    print (k + ': ' + v)
                trait_best_params_dict[trait] = rf_GSCV.best_params_

        return trait_best_params_dict

    def compare_scores(self, models, regression=False):
        print('Model performance for trait->' + self.trait + ' prediction:' + '\n')

        
        
        mse_regression = []

        for model_name in models:
            model = self.models_dict[model_name]
            model.fit(self.X_train, self.y_train)

            print("\t" + model_name + ": ")

            if regression:
                y_pred = model.predict(self.X_test)
                y_true = self.y_test
                mse = -np.mean(cross_validate(model, self.X_test, self.y_test, scoring='neg_mean_squared_error', cv=10)['test_score'])
                mse_regression.append({model_name : mse})
                print('\t\tMSE: ' + str(mse))
            else:
                accuracy_score = np.mean(cross_validate(model, self.X_test, self.y_test, cv=10)['test_score'])
                accuracy_score = round(accuracy_score, 3)
#                 self.accuracy_scores[model_name] += accuracy_score/5
                print('\t\tAccuracy score: ' + str(accuracy_score))


#         if regression:
#             return mse_regression
#             pass
#         else:
#             best_accuracy_score = max(accuracy_scores)
#             best_accuracy_model = models[accuracy_scores.index(best_accuracy_score)]
#             print(
#                 '\tBest Accuracy score: ' + str(best_accuracy_score) + '\n' +
#                 '\tModel: ' + best_accuracy_model  + '\n'
#             )
        return self.accuracy_scores


if __name__ == '__main__':
    
    
    traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
    
    data_mse = []
    data_accuracy_score = []
    data_f1_score = []
    
    for trait in traits:        
        dp = DataPrep()
        X_classification, y_classification = dp.prep_data('tweet', trait, regression=False, model_comparison=False)
        X_regression, y_regression = dp.prep_data('tweet', trait, regression=False, model_comparison=False)

        
        M1 = ModelEvaluator(X_classification, y_classification, trait)
        M2 = ModelEvaluator(X_regression, y_regression, trait)

        models_classifier = ['RandomForestClassifier']     
#         models_regressor = ['LogisticRegression', 'LinearRegression', 'RandomForestRegressor', 'Ridge', 'SGDRegressor', 'MLPRegressor', 'HistGradientBoostingRegressor', 'XGBRegressor']

        accuracy_score = M1.compare_scores(models_classifier)
#         data_accuracy_score.append({trait: accuracy_score})
#         data_f1_score.append({trait: f1_score})
        
        
#         mse = M2.compare_scores(models_regressor, regression=True)
#         data_mse.append({trait: mse})

Model performance for trait->OPN prediction:

	RandomForestClassifier: 
		Accuracy score: 0.91
Model performance for trait->CON prediction:

	RandomForestClassifier: 
		Accuracy score: 0.732
Model performance for trait->EXT prediction:

	RandomForestClassifier: 
		Accuracy score: 0.747
Model performance for trait->AGR prediction:

	RandomForestClassifier: 
		Accuracy score: 0.711
Model performance for trait->NEU prediction:

	RandomForestClassifier: 
		Accuracy score: 0.635
