# Part 2. Modeling

In [1]:
import pandas as pd
df = pd.read_csv('./mbti_data.csv')

In [2]:
df['i_e'] = df.type.apply(lambda x: x[0])
df['s_n'] = df.type.apply(lambda x: x[1])
df['t_f'] = df.type.apply(lambda x: x[2])
df['j_p'] = df.type.apply(lambda x: x[3])

In [3]:
df['i_e'] = df['i_e'].apply(lambda x: 1 if x == 'E' else 0)
df['s_n'] = df.s_n.apply(lambda x: 1 if x == 'N' else 0)
df['t_f'] = df.t_f.apply(lambda x: 1 if x == 'F' else 0)
df['j_p'] = df.j_p.apply(lambda x: 1 if x == 'P' else 0)

In [4]:
df.shape

(8675, 6)

In [5]:
X = df.posts
y_i_e = df.i_e
y_s_n = df.s_n
y_t_f = df.t_f
y_j_p = df.j_p

In [6]:
#best CountVectorizer & model were in "testing models" notebook

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
best_cv = CountVectorizer(stop_words='english',max_features=2500,strip_accents='unicode', min_df=2,ngram_range=(1,2))
best_cv.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=2500, min_df=2,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents='unicode', token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
cv = pd.DataFrame(best_cv.transform(X).todense(),
                   columns = best_cv.get_feature_names())

## MODEL PERFORMANCE ACROSS ALL 4 Personality Aspects
### ** notice that iNtuition is over-represented, as well as Extraversion

### BASELINE E/I: .76956
#### E/I modeling: 
- GB: .8455; 
- ADA: .8333; 
- RDMFST: .7803

### BASELINE N/S: .862017
#### N/S modeling: 
- GB .894559
- ADA .8818
- RDMFST .85268

### BASELINE T/F: .54109
#### T/F modeling:
- GB .831077
- ADA .80095
- ** LogReg: .777127 **
- **DecisionTree: .7017966**
- RDMFST .68967

### BASELINE J/P: .60414
#### J/P modeling:
- GB .79419
- ADA .76391
- ** LogReg .731016 **
- ** DecisionTree .69489**
- RDMFST .6407

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, train_test_split

In [11]:
gbc = GradientBoostingClassifier()
kfold = StratifiedKFold()

In [12]:
# #run gridsearch across GBC to maximize model performance, using random search to minimize time spent
# gbc_params = {"max_depth" :[5,25,50,100,150,None],
#               'min_samples_split': [2,5,6,10,15,20],
#               "n_estimators": [1,3,7,15,30,50,75,100,150],
#                'learning_rate':[0.5,1]
#              }
# grid_gbc = GridSearchCV(gbc, param_grid=gbc_params, verbose=2, cv=kfold)


In [13]:
# grid_gbc.fit(cv,y_i_e)

In [14]:
# grid_gbc.best_score_
# grid_gbc.best_params_

In [15]:
# rand_gbc = RandomizedSearchCV(gbc, gbc_params, n_iter=20,verbose=2, cv=kfold)

In [16]:
# rand_gbc.fit(cv,y_i_e)

In [17]:
# rand_gbc.best_score_
# rand_gbc.best_params_

In [18]:
# new_params = {"max_depth" :[125,150,200],
#               'min_samples_split': [20,25,30],
#               "n_estimators": [125,150,200],
#                'learning_rate':[0.5]
#              }
# new_rand_gbc = RandomizedSearchCV(gbc, new_params, n_iter=20, verbose=2, cv=kfold)

In [19]:
# new_rand_gbc.fit(cv,y_i_e)

In [20]:
# new_rand_gbc.best_score_
# new_rand_gbc.best_params_

# GradientBoosting Maximize using GridSearch
(GridSearch #'ed out so as not to run it every time)

In [21]:
rand_best_ie = GradientBoostingClassifier(learning_rate=.5,max_depth=150,min_samples_split=20,n_estimators=150)

In [22]:
rand_best_ie.fit(cv,y_i_e)
#interrupted to go with the cross-val-score instead

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=150,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=150,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [23]:
Xe_train, Xe_test, ye_train, ye_test = train_test_split(cv,y_i_e)

In [24]:
cv_results = cross_val_score(rand_best_ie, Xe_train, ye_train, cv=kfold)

In [25]:
msg = "%s: %f (%f)" % (rand_best_ie, cv_results.mean(), cv_results.std())
print msg

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=150,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=150,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False): 0.839380 (0.004819)


In [26]:
print "Mean Performance: ", cv_results.mean()*100, "+/-",cv_results.std()

Mean Performance:  83.9379688748478 +/- 0.0048190401417660435


In [27]:
#have to test
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# Check the Naive Bayes Models 
(they don't work better than GBC)

In [28]:

models = []
models.append(('Gauss', GaussianNB()))
models.append(('Mult', MultinomialNB()))
models.append(('Bern',BernoulliNB()))

# Evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = StratifiedKFold()
    cv_results = cross_val_score(model, Xe_train, ye_train, cv=kfold, scoring=scoring)
    #cv500_results = cross_val_score(model, X500_train, y500_e_train, cv=kfold, scoring=scoring)
    #cv1000_results = cross_val_score(model, X1000_train, y1000_e_train, cv=kfold, scoring=scoring)
    
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

Gauss: 0.701967 (0.000282)
Mult: 0.736860 (0.021119)
Bern: 0.739626 (0.006083)


### Gauss, Mult, & Bern did not perform better than the GBC

Gauss: 0.698891 (0.014470)
Mult: 0.735933 (0.025666)
Bern: 0.745155 (0.017077)

In [29]:
X = df.posts
y_i_e = df.i_e
y_s_n = df.s_n
y_t_f = df.t_f
y_j_p = df.j_p

In [30]:
#testing on s-n
rand_best_ie.fit(cv,y_s_n)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=150,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=150,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [31]:
X_train, X_test, yn_train, yn_test = train_test_split(cv,y_s_n)

In [32]:
trainn = cross_val_score(rand_best_ie, X_train, yn_train, cv=kfold)
print trainn, trainn.mean(), trainn.std()

[0.87788018 0.89160517 0.88929889] 0.8862614144574626 0.006000749268465766


In [33]:
testn = cross_val_score(rand_best_ie, X_test, yn_test, cv=kfold)
print "Testn:", testn, testn.mean(), testn.std()

Testn: [0.87430939 0.88381743 0.87534626] 0.8778243600129657 0.004258827403425486


In [34]:
#testing on t-f
rand_best_ie.fit(cv,y_t_f)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=150,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=150,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [35]:
X_train, X_test, yf_train, yf_test = train_test_split(cv,y_t_f)

In [36]:
trainf = cross_val_score(rand_best_ie, X_train, yf_train, cv=kfold)
print "Trainf", trainf, trainf.mean(), trainf.std()

Trainf [0.80875576 0.80765683 0.81319188] 0.8098681562852494 0.0023926662867814175


In [37]:
testf = cross_val_score(rand_best_ie, X_test, yf_test, cv=kfold)
print "Testf",testf, testf.mean(), testf.std

Testf [0.7859116  0.79114799 0.78254848] 0.786536024377245 <built-in method std of numpy.ndarray object at 0x1a167d6e90>


In [38]:
#testing on j-p
rand_best_ie.fit(cv,y_j_p)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=150,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=150,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [39]:
X_train, X_test, yp_train, yp_test = train_test_split(cv,y_j_p)

In [40]:
trainp = cross_val_score(rand_best_ie, X_train, yp_train, cv=kfold)
print "Trainp", trainp, trainp.mean(), trainp.std()

Trainp [0.78801843 0.7947417  0.77629151] 0.7863505478372756 0.007624028164284099


In [41]:
testp = cross_val_score(rand_best_ie, X_test, yp_test, cv=kfold)
print "Testp", testp,testp.mean(),testp.std()

Testp [0.77071823 0.76210235 0.7465374 ] 0.7597859931600174 0.010006742486894133


## Sentiment Analysis

In [43]:
from nltk.tag import pos_tag
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

In [44]:
import spacy
en_nlp = spacy.load('en')

In [50]:
df.posts.values[0:1]

array(["'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.

In [60]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
for sentence in df.posts.values[0:1]:
    vs = analyzer.polarity_scores(unicode(sentence))
    print sentence
    print vs

'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...|

In [None]:
df['clean_post'] = ""

In [None]:
#clean the data:
import re

for x in range(len(df.posts)):
    df.clean_post[x] = df.posts[x].replace("|||", "")
#     df.clean_post[x] = df.posts[x].replace ("...","ellipses")
#     df.clean_post[x] = re.sub(r'^https?:\/\/.*[\r\n]*', '', df.posts[x], flags=re.MULTILINE)

In [None]:
df.posts

In [61]:
from textacy.preprocess import preprocess_text

In [63]:
[preprocess_text(x, fix_unicode = True, lowercase=True, no_urls=True) for x in  df.posts]

UnicodeError: Hey wait, this isn't Unicode.

ftfy is designed to fix problems that were introduced by handling Unicode
incorrectly. It might be able to fix the bytes you just handed it, but the
fact that you just gave a pile of bytes to a function that fixes text means
that your code is *also* handling Unicode incorrectly.

ftfy takes Unicode text as input. You should take these bytes and decode
them from the encoding you think they are in. If you're not sure what encoding
they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

If you're confused by this, please read the Python Unicode HOWTO:

    http://docs.python.org/2/howto/unicode.html


In [1]:
# At this point i decided to go a different route, and make a separate Sentiment Analysis Page