In [74]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, SGDRegressor
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV

In [86]:
from sklearn.preprocessing import MinMaxScaler

class DataPrep():
    """Load the labelled status-update CSV and select model inputs/targets.

    Attributes:
        trait_cat_dict: maps every accepted trait alias (letter, three-letter
            code or full name) to its ``c*`` category column, which
            ``convert_traits_to_boolean`` turns into True/False.
        trait_score_dict: maps the same aliases to the ``s*`` column that is
            used as the regression target.
        LIWC_features: list of LIWC feature column names (not used by any
            method of this class).
    """

    def __init__(self):
        self.trait_cat_dict = {
            'O': 'cOPN',
            'C': 'cCON',
            'E': 'cEXT',
            'A': 'cAGR',
            'N': 'cNEU',
            'OPN': 'cOPN',
            'CON': 'cCON',
            'EXT': 'cEXT',
            'AGR': 'cAGR',
            'NEU': 'cNEU',
            'Openness': 'cOPN',
            'Conscientiousness': 'cCON',
            'Extraversion': 'cEXT',
            'Agreeableness': 'cAGR',
            'Neuroticism': 'cNEU'
            }
        self.trait_score_dict = {
            'O': 'sOPN',
            'C': 'sCON',
            'E': 'sEXT',
            'A': 'sAGR',
            'N': 'sNEU',
            'OPN': 'sOPN',
            'CON': 'sCON',
            'EXT': 'sEXT',
            'AGR': 'sAGR',
            'NEU': 'sNEU',
            'Openness': 'sOPN',
            'Conscientiousness': 'sCON',
            'Extraversion': 'sEXT',
            'Agreeableness': 'sAGR',
            'Neuroticism': 'sNEU'
            }
        self.LIWC_features = [
            'WPS', 'Unique', 'Dic', 'Sixltr', 'Negate', 'Assent', 'Article', 'Preps', 'Number',
            'Pronoun', 'I', 'We', 'Self', 'You', 'Other',
            'Affect', 'Posemo', 'Posfeel', 'Optim', 'Negemo', 'Anx', 'Anger', 'Sad',
            'Cogmech', 'Cause', 'Insight', 'Discrep', 'Inhib', 'Tentat', 'Certain',
            'Senses', 'See', 'Hear', 'Feel',
            'Social', 'Comm', 'Othref', 'Friends', 'Family', 'Humans',
            'Time', 'Past', 'Present', 'Future',
            'Space', 'Up', 'Down', 'Incl', 'Excl', 'Motion',
            'Occup', 'School', 'Job', 'Achieve',
            'Leisure', 'Home', 'Sports', 'TV', 'Music',
            'Money',
            'Metaph', 'Relig', 'Death', 'Physcal', 'Body', 'Sexual', 'Eating', 'Sleep', 'Groom',
            'Allpct', 'Period', 'Comma', 'Colon', 'Semic', 'Qmark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'Otherp',
            'Swear', 'Nonfl', 'Fillers',
        ]

    def prep_data(self, type, trait, regression=False, model_comparison=False):
        """Return the raw texts and the target column for one trait.

        Args:
            type: data source to prepare; only ``'tweet'`` is implemented.
            trait: any key of ``trait_cat_dict`` / ``trait_score_dict``
                (e.g. ``'O'``, ``'OPN'`` or ``'Openness'``).
            regression: when true the ``s*`` score column is the target,
                otherwise the boolean ``c*`` category column.
            model_comparison: accepted for interface compatibility; unused.

        Returns:
            ``(X, y)`` where ``X`` is the ``TWEET`` text column and ``y`` the
            selected target column of the prepared status data.

        Raises:
            ValueError: if ``type`` is not ``'tweet'``.
            KeyError: if ``trait`` is not a known alias.
        """
        # Validate the request before touching the disk so that bad arguments
        # fail fast (previously an unsupported type ended in an unbound local).
        if type != 'tweet':
            raise ValueError(
                "Unsupported data type %r; only 'tweet' is implemented." % (type,))
        column_lookup = self.trait_score_dict if regression else self.trait_cat_dict
        y_column = column_lookup[trait]

        df_tweet = self.prep_status_data()
        return df_tweet['TWEET'], df_tweet[y_column]

    def prep_status_data(self, filepath='mypersonality_final.csv'):
        """Read the status CSV, rename ``STATUS`` to ``TWEET`` and make the
        ``c*`` trait columns boolean.

        Args:
            filepath: CSV to read; defaults to ``mypersonality_final.csv`` in
                the current working directory.
        """
        df = self.load_data(filepath).rename(columns={'STATUS': 'TWEET'})
        return self.convert_traits_to_boolean(df)

    def convert_traits_to_boolean(self, df):
        """Map the ``'y'``/``'n'`` values of the five ``c*`` columns to
        True/False.

        The frame is modified in place and also returned. Values other than
        ``'y'`` or ``'n'`` become NaN, because ``Series.map`` with a dict
        leaves unmapped keys missing.
        """
        trait_columns = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']
        yes_no = {'y': True, 'n': False}

        for column in trait_columns:
            df[column] = df[column].map(yes_no)

        return df

    def load_data(self, filepath):
        """Read ``filepath`` as an ISO-8859-1 encoded CSV into a DataFrame."""
        return pd.read_csv(filepath, encoding="ISO-8859-1")
    


In [87]:
# Smoke test: load the status CSV and display the (texts, target) pair that
# prep_data returns for the 'OPN' trait with the regression (score) target.
dp = DataPrep()
dp.prep_data('tweet', 'OPN', regression = True)
# print('')

(0                                                                                                                                                               likes the sound of thunder.
 1                                                                                                                           is so sleepy it's not even funny that's she can't get to sleep.
 2                                                                 is sore and wants the knot of muscles at the base of her neck to stop hurting. On the other hand, YAY I'M IN ILLINOIS! <3
 3                                                                                                                                                likes how the day sounds in this new song.
 4                                                                                                                                                                               is home. <3
                                                       

In [109]:
class Model():
    """TF-IDF features feeding a random-forest regressor and classifier.

    ``rfr`` predicts a numeric target and ``rfc`` a class label; which one is
    used is selected by the ``regression`` flag of each method.

    Each head keeps the TF-IDF vectorizer it was fitted with. Previously a
    single vectorizer was refitted on every ``fit`` call, so fitting one head
    on different texts silently invalidated the features of the other head.
    ``tfidf`` is kept as an attribute and always references the most recently
    fitted vectorizer.
    """

    def __init__(self):
        self.rfr = RandomForestRegressor(
            bootstrap=True,
            max_features='sqrt',
            min_samples_leaf=1,
            min_samples_split=2,
            n_estimators=200)
        self.rfc = RandomForestClassifier(max_features='sqrt', n_estimators=110)
        self.tfidf = self._new_vectorizer()
        # Fitted vectorizers keyed by the boolean ``regression`` flag.
        self._vectorizers = {}

    @staticmethod
    def _new_vectorizer():
        """Build an unfitted vectorizer with the notebook's TF-IDF settings."""
        return TfidfVectorizer(stop_words='english', strip_accents='ascii')

    def _vectorizer_for(self, regression):
        """Return the vectorizer belonging to the requested head.

        Falls back to ``self.tfidf`` when no per-head vectorizer exists, which
        is the case for instances pickled before this attribute was added.
        """
        fitted = getattr(self, '_vectorizers', None) or {}
        return fitted.get(bool(regression), self.tfidf)

    def fit(self, X, y, regression=True):
        """Fit the regressor (``regression=True``) or classifier on raw texts.

        Args:
            X: iterable of raw text documents.
            y: targets aligned with ``X``.
            regression: selects which head is fitted.

        Returns:
            ``self`` so calls can be chained.
        """
        vectorizer = self._new_vectorizer()
        features = vectorizer.fit_transform(X)
        if regression:
            self.rfr = self.rfr.fit(features, y)
        else:
            self.rfc = self.rfc.fit(features, y)
        self.__dict__.setdefault('_vectorizers', {})[bool(regression)] = vectorizer
        self.tfidf = vectorizer
        return self

    def predict(self, X, regression=True):
        """Predict with the regressor or the classifier for raw texts ``X``."""
        features = self._vectorizer_for(regression).transform(X)
        if regression:
            return self.rfr.predict(features)
        return self.rfc.predict(features)

    def predict_proba(self, X, regression=False):
        """Return the classifier's class probabilities scaled to 0-100.

        Raises:
            ValueError: if ``regression`` is true; a regressor has no class
                probabilities. The check runs before any feature extraction.
        """
        if regression:
            raise ValueError('Cannot predict probabilites of a regression!')
        features = self._vectorizer_for(False).transform(X)
        return self.rfc.predict_proba(features)*100

if __name__ == '__main__':
    # Train one regressor + classifier pair per trait and pickle each pair to
    # static/<trait>_model.pkl, where Predictor.load_models reads it from.
    import os

    traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs('static', exist_ok=True)
    dp = DataPrep()

    for trait in traits:
        # Fresh estimators per trait: nothing fitted for a previous trait can
        # leak into the pickle written for this one.
        model = Model()
        X_regression, y_regression = dp.prep_data('tweet', trait, regression=True, model_comparison=False)
        X_categorical, y_categorical = dp.prep_data('tweet', trait, regression=False, model_comparison=False)
        print('Fitting trait ' + trait + ' regression model...')
        model.fit(X_regression, y_regression, regression=True)
        print('Done!')
        print('Fitting trait ' + trait + ' categorical model...')
        model.fit(X_categorical, y_categorical, regression=False)
        print('Done!')
        with open('static/' + trait + '_model.pkl', 'wb') as f:
            # Write the model to a file.
            pickle.dump(model, f)

Fitting trait OPN regression model...
Done!
Fitting trait OPN categorical model...
Done!
Fitting trait CON regression model...
Done!
Fitting trait CON categorical model...
Done!
Fitting trait EXT regression model...
Done!
Fitting trait EXT categorical model...
Done!
Fitting trait AGR regression model...
Done!
Fitting trait AGR categorical model...
Done!
Fitting trait NEU regression model...
Done!
Fitting trait NEU categorical model...
Done!


In [110]:
# import pandas as pd
# import numpy as np
# import scipy.stats as stats

# class Big5():
# 	def __init__(self):
# 		self.df = pd.read_csv('data/BIG5/data.csv', sep='\t')
# 		self.prep_df()
# 		self.questions_key = {
# 			'I am the life of the party.': 'E1',
# 			"I don't talk a lot.": 'E2',
# 			'I feel comfortable around people.': 'E3',
# 			'I keep in the background.': 'E4',
# 			'I start conversations.': 'E5',
# 			'I have little to say.': 'E6',
# 			'I talk to a lot of different people at parties.': 'E7',
# 			"I don't like to draw attention to myself.": 'E8',
# 			"I don't mind being the center of attention.": 'E9',
# 			'I am quiet around strangers.': 'E10',
# 			'I get stressed out easily.': 'N1',
# 			'I am relaxed most of the time.': 'N2',
# 			'I worry about things.': 'N3',
# 			'I seldom feel blue.': 'N4',
# 			'I am easily disturbed.': 'N5',
# 			'I get upset easily.': 'N6',
# 			'I change my mood a lot.': 'N7',
# 			'I have frequent mood swings.': 'N8',
# 			'I get irritated easily.': 'N9',
# 			'I often feel blue.': 'N10',
# 			'I feel little concern for others.': 'A1',
# 			'I am interested in people.': 'A2',
# 			'I insult people.': 'A3',
# 			"I sympathize with others' feelings.": 'A4',
# 			"I am not interested in other people's problems.": 'A5',
# 			'I have a soft heart.': 'A6',
# 			'I am not really interested in others.': 'A7',
# 			'I take time out for others.': 'A8',
# 			"I feel others' emotions.": 'A9',
# 			'I make people feel at ease.': 'A10',
# 			'I am always prepared.': 'C1',
# 			'I leave my belongings around.': 'C2',
# 			'I pay attention to details.': 'C3',
# 			'I make a mess of things.': 'C4',
# 			'I get chores done right away.': 'C5',
# 			'I often forget to put things back in their proper place.': 'C6',
# 			'I like order.': 'C7',
# 			'I shirk my duties.': 'C8',
# 			'I follow a schedule.': 'C9',
# 			'I am exacting in my work.': 'C10',
# 			'I have a rich vocabulary.': 'O1',
# 			'I have difficulty understanding abstract ideas.': 'O2',
# 			'I have a vivid imagination.': 'O3',
# 			'I am not interested in abstract ideas.': 'O4',
# 			'I have excellent ideas.': 'O5',
# 			'I do not have a good imagination.': 'O6',
# 			'I am quick to understand things.': 'O7',
# 			'I use difficult words.': 'O8',
# 			'I spend time reflecting on things.': 'O9',
# 			'I am full of ideas.': 'O10',
# 		}

# 	def handle_personality_test(self, answers):
# 		answer_dict = {}
# 		for question, answer in answers.items():
# 			key = self.questions_key[question]
# 			answer_dict[key] = answer

# 		score_dict = {'O_score': 0, 'C_score': 0, 'E_score': 0, 'A_score': 0, 'N_score': 0}
# 		for trait_key, answer in answer_dict.items():
# 			if 'O' in trait_key:
# 				score_dict['O_score'] += answer
# 			if 'C' in trait_key:
# 				score_dict['C_score'] += answer
# 			if 'E' in trait_key:
# 				score_dict['E_score'] += answer
# 			if 'A' in trait_key:
# 				score_dict['A_score'] += answer
# 			if 'N' in trait_key:
# 				score_dict['N_score'] += answer	

# 		for key, score in score_dict.items():
# 			score_dict[key] = score/10

# 		perc_dict = {}
# 		for key, score in score_dict.items():
# 			if key == 'O_score':
# 				perc = stats.percentileofscore(self.df[key], score)
# 				perc_dict['O_perc'] = perc
# 			if key == 'C_score':
# 				perc = stats.percentileofscore(self.df[key], score)
# 				perc_dict['C_perc'] = perc
# 			if key == 'E_score':
# 				perc = stats.percentileofscore(self.df[key], score)
# 				perc_dict['E_perc'] = perc
# 			if key == 'A_score':
# 				perc = stats.percentileofscore(self.df[key], score)
# 				perc_dict['A_perc'] = perc
# 			if key == 'N_score':
# 				perc = stats.percentileofscore(self.df[key], score)
# 				perc_dict['N_perc'] = perc

# 		result_dict = {}
# 		result_dict['percentiles'] = perc_dict
# 		result_dict['scores'] = score_dict
		
# 		return(result_dict)

# 	def calc_score(self, df):
# 	    score = []
# 	    for row in df.values:
# 	        score.append(row.mean())
# 	    return score

# 	def prep_df(self):
# 		O_columns = ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O9', 'O10']
# 		C_columns = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']
# 		E_columns = ['E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'E10']
# 		A_columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10']
# 		N_columns = ['N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'N10']

# 		self.df['O_score'] = self.calc_score(self.df[O_columns])
# 		self.df['C_score'] = self.calc_score(self.df[C_columns])
# 		self.df['E_score'] = self.calc_score(self.df[E_columns])
# 		self.df['A_score'] = self.calc_score(self.df[A_columns])
# 		self.df['N_score'] = self.calc_score(self.df[N_columns])

		

In [111]:
import tweepy
import configparser
import pandas as pd
import re

def remove_usernames_links(tweet):
    """Remove @mentions and http(s) links from ``tweet``.

    Everything from an ``@`` or from ``http`` up to the next whitespace is
    dropped (so the tail of an e-mail address is removed too). The surrounding
    whitespace is left untouched; callers collapse it if needed.
    """
    # Raw strings: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    tweet = re.sub(r'@\S+', '', tweet)
    tweet = re.sub(r'http\S+', '', tweet)
    return tweet

def fetch_tweets(username):
    """Fetch a user's recent English, non-retweet tweets as cleaned text.

    The bearer token is read from the ``TWITTER_BEARER_TOKEN`` environment
    variable, or from ``bearer_token`` in the ``[twitter]`` section of
    ``config.ini``. SECURITY: a token used to be hard-coded in this cell; it
    must be treated as leaked and rotated.

    Args:
        username: account handle, with or without a leading ``@``.

    Returns:
        DataFrame with a single ``TWEETS`` column (at most 1000 rows) holding
        the tweet texts with mentions and links removed and whitespace
        collapsed. The frame is empty when the search returns nothing.

    Raises:
        ValueError: if the username is empty or contains whitespace (it is
            interpolated into the search query).
        RuntimeError: if no bearer token is configured.
    """
    import os

    username = str(username).strip().lstrip('@')
    if not username or re.search(r'\s', username):
        raise ValueError('Invalid Twitter username: %r' % (username,))

    bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')
    if not bearer_token:
        config = configparser.ConfigParser()
        config.read('config.ini')
        bearer_token = config.get('twitter', 'bearer_token', fallback=None)
    if not bearer_token:
        raise RuntimeError(
            'No Twitter bearer token configured: set TWITTER_BEARER_TOKEN or '
            'add bearer_token to the [twitter] section of config.ini')

    client = tweepy.Client(bearer_token)

    query = 'from:' + username + ' -is:retweet lang:en'
    tweets = tweepy.Paginator(client.search_recent_tweets, query=query,
                              tweet_fields=['context_annotations', 'created_at'],
                              max_results=100).flatten(limit=1000)

    # Strip mentions/links first and normalise whitespace afterwards, so the
    # removals do not leave doubled or leading/trailing spaces behind.
    texts = [
        re.sub(r'\s+', ' ', remove_usernames_links(tweet.text)).strip()
        for tweet in tweets
    ]
    return pd.DataFrame(texts, columns=['TWEETS'])

# pd.set_option('display.max_colwidth', None)
# print(df_pred)

In [112]:
class Predictor():
    """Load the pickled per-trait models and predict all traits for texts.

    Attributes:
        traits: trait codes for which a model is loaded.
        models: mapping of trait code to the object unpickled from
            ``static/<trait>_model.pkl``.
    """

    def __init__(self):
        self.traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
        self.models = {}
        self.load_models()

    def load_models(self):
        """Unpickle one model per trait from the ``static/`` directory."""
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file; only load pickles that this notebook produced itself.
        for trait in self.traits:
            with open('static/' + trait + '_model.pkl', 'rb') as f:
                self.models[trait] = pickle.load(f)

    def predict(self, X, traits='All', predictions='All'):
        """Predict score, category and category probability per trait.

        The per-text predictions are aggregated over *all* texts in ``X``
        (previously only the first text was used and the rest were ignored).
        For a single text the result is the same as before.

        Args:
            X: a single text or an iterable of texts.
            traits: ``'All'`` for every loaded trait, a single trait code, or
                an iterable of trait codes.
            predictions: accepted for interface compatibility; unused.

        Returns:
            dict with, for each trait ``T``: ``pred_sT`` (mean predicted
            score), ``pred_cT`` (most frequent predicted category as a string;
            on a tie the label seen first wins) and ``pred_prob_cT`` (mean of
            the second probability column returned by ``predict_proba``).

        Raises:
            ValueError: if ``X`` contains no texts.
            KeyError: if a requested trait has no loaded model.
        """
        from collections import Counter

        X = [X] if isinstance(X, str) else list(X)
        if len(X) == 0:
            raise ValueError('X must contain at least one text to predict on.')

        if isinstance(traits, str):
            selected = self.traits if traits == 'All' else [traits]
        else:
            selected = list(traits)

        results = {}
        for trait in selected:
            pkl_model = self.models[trait]

            trait_scores = np.asarray(pkl_model.predict(X, regression=True), dtype=float)
            results['pred_s' + trait] = trait_scores.mean()

            trait_categories = np.asarray(pkl_model.predict(X, regression=False))
            majority_label = Counter(trait_categories.tolist()).most_common(1)[0][0]
            results['pred_c' + trait] = str(majority_label)

            trait_categories_probs = np.asarray(pkl_model.predict_proba(X))
            results['pred_prob_c' + trait] = trait_categories_probs[:, 1].mean()

        return results

In [136]:
import numpy as np
import matplotlib.pyplot as plt
 

def bar_plot(y_pred, title=""):
    """Show a bar chart of the predicted trait scores.

    Args:
        y_pred: mapping containing the ``pred_sOPN``, ``pred_sCON``,
            ``pred_sAGR``, ``pred_sEXT`` and ``pred_sNEU`` entries.
        title: account handle appended to the chart title.
    """
    data = {
        'Openness': y_pred['pred_sOPN'],
        'Conscientiousness': y_pred['pred_sCON'],
        'Agreeableness': y_pred['pred_sAGR'],
        'Extraversion': y_pred['pred_sEXT'],
        'Neuroticism': y_pred['pred_sNEU'],
    }
    keys = list(data.keys())
    values = list(data.values())

    x = np.arange(len(keys))  # the label locations
    width = 0.35  # the width of the bars

    fig, ax = plt.subplots()

    # The bars are the regression scores (pred_s*), not class probabilities,
    # so neither the axis label nor the annotations use a percentage.
    ax.set_ylabel('Predicted score')
    ax.set_xlabel('Traits')
    ax.set_title("Personality Prediction for @" + title)
    ax.set_xticks(x)
    ax.set_xticklabels(keys)

    bars = ax.bar(x, values, width, label='traits', color='maroon')
    for bar in bars:
        height = bar.get_height()
        ax.annotate('{0:.5g}'.format(height),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 5),  # 5 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

    plt.show()

In [173]:
import matplotlib.pyplot as plt
import numpy as np
 
# traits = ["Openness", "Conscientiousness", "Agreeableness", "Extraversion", "Neuroticism"]

# data = {'Openness': y_pred['pred_prob_cOPN'], 'Conscientiousness': y_pred['pred_prob_cCON'], 'Aggreableness': y_pred['pred_prob_cAGR'], 'Extraversion': y_pred['pred_prob_cEXT'], 'Neuroticism': y_pred['pred_prob_cNEU']}
# actual = list(data.keys())
# expected = list(data.values())
def radial_plot(y_pred, title=""):
    """Show a radar (spider) chart of the predicted trait probabilities.

    Args:
        y_pred: mapping containing the ``pred_prob_cOPN``, ``pred_prob_cCON``,
            ``pred_prob_cAGR``, ``pred_prob_cEXT`` and ``pred_prob_cNEU``
            entries; the radial axis is fixed to 0-100.
        title: account handle appended to the chart title.
    """
    data = {
        'Openness': y_pred['pred_prob_cOPN'],
        'Conscientiousness': y_pred['pred_prob_cCON'],
        'Agreeableness': y_pred['pred_prob_cAGR'],
        'Extraversion': y_pred['pred_prob_cEXT'],
        'Neuroticism': y_pred['pred_prob_cNEU'],
    }
    keys = list(data.keys())
    values = list(data.values())

    # Repeat the first value so the outline closes back on itself.
    closed_values = values + values[:1]
    theta = np.linspace(0, 2*np.pi, len(closed_values), endpoint=True)
    # One labelled spoke per trait, evenly spaced around the circle.
    spoke_degrees = [index * 360.0 / len(keys) for index in range(len(keys))]

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(polar=True)
    ax.set_thetagrids(spoke_degrees, keys)

    ax.plot(theta, closed_values)
    ax.fill(theta, closed_values, 'b', alpha=0.1)
    ax.set_ylim(0, 100)
    ax.set_title("Personality Prediction for @" + title)

    plt.show()
    
def radial_plot_comparision(y_pred1, y_pred2, label1='User 1', label2='User 2',
                            title='Personality Prediction Comparison'):
    """Overlay two sets of trait probabilities on one radar chart.

    Args:
        y_pred1: first prediction mapping with the ``pred_prob_c*`` entries
            for OPN, CON, AGR, EXT and NEU.
        y_pred2: second prediction mapping with the same entries.
        label1: legend label for ``y_pred1``.
        label2: legend label for ``y_pred2``.
        title: chart title.
    """
    trait_keys = [
        ('Openness', 'pred_prob_cOPN'),
        ('Conscientiousness', 'pred_prob_cCON'),
        ('Agreeableness', 'pred_prob_cAGR'),
        ('Extraversion', 'pred_prob_cEXT'),
        ('Neuroticism', 'pred_prob_cNEU'),
    ]
    trait_names = [name for name, _ in trait_keys]

    def _closed_outline(y_pred):
        # Repeat the first value so the outline closes back on itself.
        values = [y_pred[key] for _, key in trait_keys]
        return values + values[:1]

    first = _closed_outline(y_pred1)
    second = _closed_outline(y_pred2)

    theta = np.linspace(0, 2 * np.pi, len(first))
    # One labelled spoke per trait, evenly spaced around the circle.
    spoke_degrees = [index * 360.0 / len(trait_names) for index in range(len(trait_names))]

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(polar=True)
    ax.set_thetagrids(spoke_degrees, trait_names)

    ax.plot(theta, first, label=label1)
    ax.fill(theta, first, 'blue', alpha=0.1)

    ax.plot(theta, second, label=label2)
    ax.fill(theta, second, 'orange', alpha=0.1)

    # Same fixed 0-100 radial range as radial_plot, so charts are comparable.
    ax.set_ylim(0, 100)
    ax.legend()
    ax.set_title(title)

    plt.show()

In [174]:
def find_personality(username=None):
    """Fetch a user's tweets, predict the traits, plot them and return them.

    Args:
        username: account handle, with or without a leading ``@``. When
            omitted the handle is asked for interactively, as before.

    Returns:
        The prediction dict produced by ``Predictor.predict``.

    Raises:
        ValueError: if no tweets were returned for the account, since there
            is nothing to predict from.
    """
    P = Predictor()
    if username is None:
        username = input('enter username: ')
        print('\n\n\n\n')
    username = str(username).strip().lstrip('@')

    X_pred = fetch_tweets(username)['TWEETS']
    if len(X_pred) == 0:
        raise ValueError('No recent tweets found for @' + username)

    y_pred = P.predict(list(X_pred))

    print(y_pred)

    bar_plot(y_pred, title=username)
    radial_plot(y_pred, title=username)

    return y_pred
    
def compare():
    """Predict the personality of two accounts and plot them together.

    ``find_personality`` is run twice (it prompts for a username each time),
    then both results are overlaid with ``radial_plot_comparision``.

    Returns:
        Tuple ``(y_pred1, y_pred2)`` of the two prediction dicts.
    """
    # The first call used to be misspelt `find_presonality`, which raised
    # NameError before anything was plotted.
    y_pred1 = find_personality()
    y_pred2 = find_personality()

    radial_plot_comparision(y_pred1, y_pred2)

    return y_pred1, y_pred2
    
    # to be continued

In [None]:
# Entry point: run the two-account comparison, which prompts for each username.
if __name__ == '__main__':
    compare()

enter username: potu
