# Start The Analysis

## 1. Download dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./datasets/mbti_data.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [3]:
df.shape

(8675, 2)

In [10]:
print df.type[643]
df.posts[643]

ENFP


"'Haha to some extent. I mean, I get along with everyone. It's just that sometimes the conversations seem superficial, yeah? There are a lot of things I find interesting that I want to talk to people...|||... Definitely need to look into that. The enneagram tritypes currently looks like a jumble of numbers to me, even though I somewhat understand the basic concepts. Also trying to figure out how you...|||Thank you for responding. These forums can be like ghost towns sometimes, depending on what time you start a thread. ;; I made a new thread in the first place, because online tests started giving me...|||0. Is there anything that may affect the way you answer the questions? For example, a stressful time, mental illness, medications, special life circumstances? Other useful information includes sex,...|||Acerbusvenator I'm responding to your post on behalf of my little sister, since she doesn't seem to know how to haha. First of all, thank you very much for taking the time to respond to

### 1.1 Summary of Data
- data is taken from public posts on "PersonalityCafe" website, where people express their feelings and experiences with relation to their Myers-Briggs personality type. The posts are very personal, and most importantly, are made by people who are self-aware (aware of their personality type).
- MBTI was established by taking a test (not specific as to where)

### 1.2 Limitations
- subset of population prone to expressing much more personal things than normal posts
- we are not aware of how recently they took their MBTI (might have changed)
- only looking at the people who posted, meaning it will highly skew towards extroverts

### 1.3 Ways to look at the data (analyzing strategies)
- how many people are writing pos? neg? neutral?
- how many people are writing about highly personal things?
    - to do: look for list of emotional words to search
- how many people are using the MBTI indicators in their posts?
- what are the top X words (not including stop-words)
- use http://www.nltk.org to look for further analysis options
    - try stemmers
    - part of speech tag (verbs or nouns)
    - puncuation
    - mispellings
    - vary the ngram

# 2. Setting up Features & Targets 

### Four Targets
1. Extroversion vs. Introversion (general attitude towards others)
2. iNtuition vs. Sensing (how a person perceives information)
    - Sensing: facts and reality
    - iNtuition: possibilites and potential
3. Feeling vs. Thinking
    - Feeling: values & relationships
    - Thinking: logic & truth
4. Perceiving vs. Judging
    - Judging: well-structured lifestyle
    - Perceiving: "going with the flow" attitude

In [4]:
#set up the four binaries 
df['i_e'] = df.type.apply(lambda x: x[0])
df['s_n'] = df.type.apply(lambda x: x[1])
df['t_f'] = df.type.apply(lambda x: x[2])
df['j_p'] = df.type.apply(lambda x: x[3])

In [5]:
df['i_e'] = df['i_e'].apply(lambda x: 1 if x == 'E' else 0)
df['s_n'] = df.s_n.apply(lambda x: 1 if x == 'N' else 0)
df['t_f'] = df.t_f.apply(lambda x: 1 if x == 'F' else 0)
df['j_p'] = df.j_p.apply(lambda x: 1 if x == 'P' else 0)

In [6]:
#set up the targets and the features
X = df.posts
y_i_e = df.i_e
y_s_n = df.s_n
y_t_f = df.t_f
y_j_p = df.j_p

# 3. CountVectorization (multiple types)
1. cv = CountVectorizer w/o manipulation or changing hyperparameters
2. X500s = 500 & stopwords
3. X1000 = 1000 & stopwords **works best for LogReg**(will see later)
4. strip = stripped
5. stop = stop words removed

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
#set up cv = CountVectorizer without any manipulation of features
cv = CountVectorizer()
cv.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
cv = pd.DataFrame(cv.transform(X).todense(),
                 columns = cv.get_feature_names())
print cv.columns
cv.shape

Index([u'00', u'000', u'0000', u'000000', u'0000000000', u'000000000000000',
       u'00000000000000000000000000000000027', u'00000011', u'000000111',
       u'0000001111',
       ...
       u'ｓｏ', u'ｔｒｕｍｐu3000ｉｓu3000ａｎu3000ｅｓｔｐ', u'ｖａｐｏｒｗａｖｅ',
       u'ｗｈｙu3000ｉｓu3000ａｎｙｏｎｅu3000ｓｔｉｌｌu3000ｄｉｓｃｕｓｓｉｎｇu3000ｔｈｉｓ', u'ﾉωﾉ',
       u'ﾉｼ', u'ﾉﾞ', u'ﾉﾟ', u'ﾟдﾟщ', u'ﾟﾟ'],
      dtype='object', length=145412)


(8675, 145412)

In [10]:
cv500s = CountVectorizer(max_features=500, stop_words='english')
cv500s.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [11]:
X500s = pd.DataFrame(cv500s.transform(X).todense(),
                 columns = cv500s.get_feature_names())
X500s.columns

Index([u'10', u'100', u'able', u'absolutely', u'act', u'actually', u'add',
       u'advice', u'age', u'ago',
       ...
       u'wrong', u'www', u'xd', u'yeah', u'year', u'years', u'yes', u'young',
       u'younger', u'youtube'],
      dtype='object', length=500)

In [115]:
X500s['target'] = df['type']
X500s['i_e'] = df.type.apply(lambda x: x[0])
X500s['s_n'] = df.type.apply(lambda x: x[1])
X500s['t_f'] = df.type.apply(lambda x: x[2])
X500s['j_p'] = df.type.apply(lambda x: x[3])

In [116]:
X500s['i_e'] = X500s['i_e'].apply(lambda x: 1 if x == 'E' else 0)
X500s['s_n'] = X500s.s_n.apply(lambda x: 1 if x == 'N' else 0)
X500s['t_f'] = X500s.t_f.apply(lambda x: 1 if x == 'F' else 0)
X500s['j_p'] = X500s.j_p.apply(lambda x: 1 if x == 'P' else 0)

In [1]:
#save out 500 features w/o stopwords
X500s.to_csv('./500feat_cv.csv')

NameError: name 'X500s' is not defined

In [13]:
cv1000 = CountVectorizer(max_features=1000, stop_words='english')
cv1000.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
X1000 = pd.DataFrame(cv1000.transform(X).todense(),
                 columns = cv1000.get_feature_names())

In [15]:
#set up X1000 for model work:
X1000['target'] = df['type']
X1000['i_e'] = df.type.apply(lambda x: x[0])
X1000['s_n'] = df.type.apply(lambda x: x[1])
X1000['t_f'] = df.type.apply(lambda x: x[2])
X1000['j_p'] = df.type.apply(lambda x: x[3])

In [16]:
X1000['i_e'] = X1000['i_e'].apply(lambda x: 1 if x == 'E' else 0)
X1000['s_n'] = X1000.s_n.apply(lambda x: 1 if x == 'N' else 0)
X1000['t_f'] = X1000.t_f.apply(lambda x: 1 if x == 'F' else 0)
X1000['j_p'] = X1000.j_p.apply(lambda x: 1 if x == 'P' else 0)

In [17]:
#save 1000 features, stopwords
X1000.to_csv('./1000feat_cv.csv')

In [123]:
#set up countvectorizer with just the stop words
stop = CountVectorizer(stop_words='english',max_features=10000)
stop.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [124]:
stop = pd.DataFrame(stop.transform(X).todense(),
                   columns = stop.get_feature_names())


In [125]:
#set up stop for model work:
stop['target'] = df['type']
stop['i_e'] = df.type.apply(lambda x: x[0])
stop['s_n'] = df.type.apply(lambda x: x[1])
stop['t_f'] = df.type.apply(lambda x: x[2])
stop['j_p'] = df.type.apply(lambda x: x[3])

In [126]:
stop['i_e'] = stop['i_e'].apply(lambda x: 1 if x == 'E' else 0)
stop['s_n'] = stop.s_n.apply(lambda x: 1 if x == 'N' else 0)
stop['t_f'] = stop.t_f.apply(lambda x: 1 if x == 'F' else 0)
stop['j_p'] = stop.j_p.apply(lambda x: 1 if x == 'P' else 0)

In [127]:
stop.to_csv('./stop_cv.csv')

## 3.1 CountVecotorized features and MAX  Model Scores
(models were tested outside of the notebook on GradientBooster only to conserve space)

 - #3000 features, stopwords and strip accents = .796034
 - #2500 features, stopwords, strip accents, mindf=2, ngram=(1,2) = .799723
 - **#2000 features, stopwords, strip accents, mindf=2, ngram=(1,2) = .80049 **best for overall model performance**
 
 **for 2000 - test against all the models, for the other two test against Logistic Regression and the top-performing black box model(GradientBoost)


In [20]:
best_cv = CountVectorizer(max_features=2000, ngram_range=(1,2), stop_words = 'english', min_df=2)
best_cv.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=2000, min_df=2,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [21]:
best = pd.DataFrame(best_cv.transform(X).todense(),
                   columns = best_cv.get_feature_names())
best.to_csv('./best_fit_cv.csv')

In [89]:
from sklearn.feature_extraction.text import CountVectorizer

In [93]:
X = df.posts

In [128]:
#3000 features, stopwords and strip accents = .796034
cv3 = CountVectorizer(max_features=3000, stop_words='english')
cv3.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [129]:
cv3 = pd.DataFrame(cv3.transform(X).todense(),
                   columns = cv3.get_feature_names())
cv3.shape

(8675, 3000)

In [130]:
cv3['target'] = df['type']
cv3['i_e'] = df.type.apply(lambda x: x[0])
cv3['s_n'] = df.type.apply(lambda x: x[1])
cv3['t_f'] = df.type.apply(lambda x: x[2])
cv3['j_p'] = df.type.apply(lambda x: x[3])

In [131]:
cv3['i_e'] = cv3['i_e'].apply(lambda x: 1 if x == 'E' else 0)
cv3['s_n'] = cv3.s_n.apply(lambda x: 1 if x == 'N' else 0)
cv3['t_f'] = cv3.t_f.apply(lambda x: 1 if x == 'F' else 0)
cv3['j_p'] = cv3.j_p.apply(lambda x: 1 if x == 'P' else 0)

In [132]:
cv3.to_csv('./countvectorized posts/cv3000.csv')

In [103]:
#2500 features, stopwords, strip accents, mindf=2, ngram=(1,2) = .799723

In [104]:
cv2500 = CountVectorizer(max_features=2500, strip_accents='unicode', stop_words='english', ngram_range=(1,2))
cv2500.fit(X)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=2500, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents='unicode', token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [105]:
cv2500 = pd.DataFrame(cv2500.transform(X).todense(),
                   columns = cv2500.get_feature_names())
cv2500.to_csv('./countvectorized posts/cv2500.csv')

# 4. Run the models against various CountVectorizers
- **best performers of overall models were: X1000(91), Xbest (90), cv3000(89.7), 500 (89.4) stop(89.4)**
- **best performers Log(interpretable): stop (89), cv3000(88), X500(87.96)**

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB,BernoulliNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [25]:
y_s_n

0       1
1       1
2       1
3       1
4       1
5       1
6       1
7       1
8       1
9       1
10      1
11      1
12      1
13      1
14      1
15      1
16      1
17      1
18      1
19      1
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      0
28      1
29      1
       ..
8645    1
8646    1
8647    1
8648    1
8649    1
8650    1
8651    0
8652    0
8653    1
8654    0
8655    1
8656    1
8657    1
8658    0
8659    1
8660    1
8661    1
8662    1
8663    1
8664    1
8665    1
8666    1
8667    1
8668    1
8669    1
8670    0
8671    1
8672    1
8673    1
8674    1
Name: s_n, Length: 8675, dtype: int64

In [26]:
X1000.head()

Unnamed: 0,10,100,12,15,16,20,30,50,ability,able,...,yesterday,young,younger,youtu,youtube,target,i_e,s_n,t_f,j_p
0,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,16,INFJ,0,1,1,0
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,ENTP,1,1,0,1
2,1,0,0,0,0,0,1,0,2,1,...,0,0,0,0,3,INTP,0,1,0,1
3,0,0,0,0,0,0,0,1,0,2,...,0,0,0,0,2,INTJ,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,1,ENTJ,1,1,0,0


In [None]:
#baseline for iNtuition vs. sensing

print "Baseline iNtuition/Sensing", (df.s_n.value_counts()[1])/float(len(df.s_n))

In [30]:
# Prepare classification models for iNtuition vs. sensing
X = X1000.drop(X1000[['target','i_e','s_n','t_f','j_p']], axis=1)
y_s_n = X1000['s_n']
X1000_train, X1000_test, y1000_n_train, y1000_n_test = train_test_split(X,y_s_n)

models = []
models.append(('LOR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RFRST', RandomForestClassifier()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('Gauss', GaussianNB()))
models.append(('Multi', MultinomialNB()))
models.append(('Bern',BernoulliNB()))

# Evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = StratifiedKFold()
    cv_results = cross_val_score(model, X1000_train, y1000_n_train, cv=kfold, scoring=scoring)
    
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LOR: 0.832001 (0.003022)
KNN: 0.850293 (0.002270)
CART: 0.824624 (0.006818)
RFRST: 0.863817 (0.003528)
GB: 0.889487 (0.003018)
ADA: 0.878727 (0.000639)
Gauss: 0.781738 (0.012289)
Multi: 0.843685 (0.012919)
Bern: 0.805102 (0.003587)


In [36]:
GB = GradientBoostingClassifier()

In [37]:
#best score: GB, logreg = LESS THAN baseline (can't use for)
GB.fit(X1000_train,y1000_n_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [38]:
GB.score(X1000_test,y1000_n_test)

0.9147072383586906

In [31]:
Xbest_train, Xbest_test, ybest_n_train, ybest_n_test = train_test_split(best,y_s_n)

models = []
models.append(('LOR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RFRST', RandomForestClassifier()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('Gauss', GaussianNB()))
models.append(('Multi', MultinomialNB()))
models.append(('Bern',BernoulliNB()))

# Evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = StratifiedKFold()
    cv_results = cross_val_score(model, Xbest_train, ybest_n_train, cv=kfold, scoring=scoring)
    #cv500_results = cross_val_score(model, X500_train, y500_e_train, cv=kfold, scoring=scoring)
    #cv1000_results = cross_val_score(model, X1000_train, y1000_e_train, cv=kfold, scoring=scoring)
    
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LOR: 0.864278 (0.003191)
KNN: 0.844296 (0.002885)
CART: 0.835532 (0.012307)
RFRST: 0.862897 (0.004685)
GB: 0.893944 (0.001370)
ADA: 0.886105 (0.001394)
Gauss: 0.785427 (0.010447)
Multi: 0.819088 (0.004774)
Bern: 0.806021 (0.011238)


In [39]:
GB.fit(Xbest_train,ybest_n_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [40]:
GB.score(Xbest_test,ybest_n_test)

0.9031811894882434

In [112]:
LOR.fit(Xbest_train,ybest_n_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [113]:
LOR.score(Xbest_test,ybest_n_test)

0.8621484555094514

In [32]:
X500_train, X500_test, y500_n_train, y500_n_test = train_test_split(X500s,y_s_n)
models = []
models.append(('LOR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RFRST', RandomForestClassifier()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('Gauss', GaussianNB()))
models.append(('Multi', MultinomialNB()))
models.append(('Bern',BernoulliNB()))

# Evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = StratifiedKFold()
    cv_results = cross_val_score(model, X500_train, y500_n_train, cv=kfold, scoring=scoring)
    
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LOR: 0.877807 (0.008858)
KNN: 0.860744 (0.002120)
CART: 0.834923 (0.010082)
RFRST: 0.877498 (0.002378)
GB: 0.895634 (0.002663)
ADA: 0.885183 (0.001984)
Gauss: 0.798494 (0.007143)
Multi: 0.828621 (0.005240)
Bern: 0.806947 (0.008728)


In [41]:
GB.fit(X500_train,y500_n_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [42]:
GB.score(X500_test,y500_n_test)

0.8944213923467036

In [44]:
LOR = LogisticRegression()

In [45]:
#testing LogReg
LOR.fit(X500_train,y500_n_train)
LOR.score(X500_test,y500_n_test)

0.8796680497925311

### Test GB & LOG on 3000 & 2500 (just to see if it's better)

In [46]:
Xstop_train, Xstop_test, ystop_n_train, ystop_n_test = train_test_split(stop,y_s_n)

models = []
models.append(('LOR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RFRST', RandomForestClassifier()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('Gauss', GaussianNB()))
models.append(('Multi', MultinomialNB()))
models.append(('Bern',BernoulliNB()))

# Evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = StratifiedKFold()
    cv_results = cross_val_score(model, Xstop_train, ystop_n_train, cv=kfold, scoring=scoring)
    
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LOR: 0.885952 (0.001914)


KeyboardInterrupt: 

In [47]:
#testingLogReg
LOR.fit(Xstop_train,ystop_n_train)
LOR.score(Xstop_test,ystop_n_test)

0.8879668049792531

In [48]:
#testingGB
GB.fit(Xstop_train,ystop_n_train)
GB.score(Xstop_test,ystop_n_test)

0.8944213923467036

In [108]:
X3000_train, X3000_test, y3000_n_train, y3000_n_test = train_test_split(cv3,y_s_n)
LOR.fit(X3000_train,y3000_n_train)
LOR.score(X3000_test,y3000_n_test)

0.88427846934071

In [109]:
GB.fit(X3000_train,y3000_n_train)
GB.score(X3000_test,y3000_n_test)

0.8971876440756109

In [110]:
X2500_train, X2500_test, y2500_n_train, y2500_n_test = train_test_split(cv2500,y_s_n)
LOR.fit(X2500_train,y2500_n_train)
LOR.score(X2500_test,y2500_n_test)

0.8653757491931766

In [111]:
GB.fit(X2500_train,y3000_n_train)
GB.score(X2500_test,y3000_n_test)

0.8662978331028124

# 5. Summary of Scores:
(measured by Sensing vs. iNtution b/c it had the highest baseline = 86)

### BEST OVERALL SCORE (GradientBoost):
- X1000: .91
- best: .90
- X3000: .897

### BEST INTERPRETABLE SCORE (LogisticRegression):
- stop: .8879
- X3000: .884
- X500: .87967

In [114]:
#moving forward I will use the X1000 to predict personality given text, and will use
#stop & X500 to see the best words (which gives me the best word)