### This script only contains the code to train a model that predicts Extraversion or Introversion for an individual.

### Similar code with slight changes can be used to predict (Sensing or Intuition), (Thinking or Feeling ) and (Judging or Perceiving)

### This script is the result of a collaboration with two other individuals

In [None]:
# Data Analysis
import pandas as pd
import numpy as np
from numpy import asarray
from numpy import savetxt
from numpy import loadtxt
import pickle as pkl
from scipy import sparse

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt


#Plotly
import plotly.express as px
import plotly.graph_objects as go

# Text Processing
import re
import itertools
import string
import collections
from collections import Counter
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

import spacy 

# Machine Learning packages
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import sklearn.cluster as cluster
from sklearn.manifold import TSNE

# Model training and evaluation
from sklearn.model_selection import train_test_split

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


#Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix, confusion_matrix
from sklearn.metrics import classification_report

# Ignore noise warning
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Loading the dataset

df = pd.read_csv('../input/original-mbti-data/mbti_1.csv')
print(df.shape, df.columns.to_list(),'\n')
df.head()

In [None]:
#To install if necessary

#conda install -c conda-forge wordcloud=1.6.0 
#conda install -c conda-forge xgboost
#pip install xgboost

#### Exploratory Analysis 

In [None]:
#Check for any null values

df.isnull().any()

In [None]:
#Size of the dataset

nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
df.dtypes

In [None]:
df.info()

* There are only 2 columns in the dataset
* Total no. of rows are 8675
* There are no null values present in the dataset
* One Disadvantage is that all values are textual, hence they have to be converted to numerical form to train the ML model

In [None]:
df.describe(include=['object'])

* There are 16 unique personality type indicators in the dataset
* INFP is the most frequently occurring personality type in our dataset (no. of occurrences is 1832)

* Lastly, there are no repeating posts in the dataset

In [None]:
types = np.unique(np.array(df['ptype']))
types

In [None]:
total = df.groupby(['ptype']).count()
total

In [None]:
fig = px.histogram(df, x="ptype",y="posts",histfunc = "count",
                   title='Total posts for each personality type',
                   labels={'ptype':'Personality types','posts':'No. of posts available'}, # can specify one label per df column
                   opacity=0.8,
                   color_discrete_sequence=['navy'] # color of histogram bars
                   )
fig.show()

In [None]:
px.histogram(df, x="ptype",y="posts",histfunc = "count", color="ptype", 
             title='Total posts for each personality type',
             labels={'ptype':'Personality types','posts':'of posts available'}, 
             color_discrete_sequence=px.colors.sequential.YlGnBu).update_xaxes(categoryorder="total descending")

In [None]:
# Spread of the per-post word counts for one user.
def var_row(row):
    """Return the variance of the word counts of the posts held in *row*.

    *row* is a single string in which the individual posts are separated by
    ``|||``; every post is split on whitespace to obtain its word count, and
    the variance of those counts is computed with ``np.var``.
    """
    word_counts = [len(post.split()) for post in row.split('|||')]
    return np.var(word_counts)

# Two engineered columns per row:
#   words_per_comment       - total whitespace-separated words divided by a
#                             hard-coded 50 (assumes 50 posts per row)
#   variance_of_word_counts - variance of the per-post word counts (var_row)
df['words_per_comment'] = df['posts'].apply(lambda x: len(x.split())/50)
df['variance_of_word_counts'] = df['posts'].apply(lambda x: var_row(x))

# Strip plot: one point per row, grouped and coloured by personality type;
# the variance column is shown on hover.
fig = px.strip(df, x='ptype', y='words_per_comment',hover_data=["variance_of_word_counts"],
              color="ptype",color_discrete_sequence=px.colors.sequential.Plasma_r)

# type 3 : boxplot with stripplot + color
#fig = px.box(df, x='type', y='words_per_comment', color='type', points="all",
#             color_discrete_sequence=px.colors.sequential.Plasma_r).update_xaxes(categoryorder="total descending")


# Style of the hover tooltip.
fig.update_layout(
    hoverlabel=dict(
        bgcolor="blue",
        font_size=12,
        font_family="Rockwell"
    )
)
fig.show()


This plot shows every observation together with a representation of the underlying distribution of our added features, and it makes clear that the dataset is imbalanced across personality types.

INFP is the most densely populated group, showing that this personality type has the largest number of entries.

In [None]:
# Scatter joint plot of the two engineered word-count features.
# sns.jointplot is a figure-level function that always creates its own figure,
# so no plt.figure() call is made beforehand (it would only leave an empty
# figure behind). x and y are passed by keyword because recent seaborn
# releases no longer accept them positionally.
sns.set(style="white", color_codes=True) # suitable theme for jointplot
sns.jointplot(x="variance_of_word_counts", y="words_per_comment", data=df, alpha=0.7)
plt.show()

In [None]:
fig = px.density_heatmap(df, x="variance_of_word_counts", y="words_per_comment", marginal_x="box", marginal_y="violin")
fig.show()

* The 2 marginal histograms show the (roughly bell-shaped, Gaussian-like) distributions of our two features: the no. of words per comment and the associated variance of word counts from our dataset.
* In the hexagonal plot, the hexagon with the most points gets the darkest color. Looking at the plot above, most users have a variance of word counts between 100 and 150, and most users write between 25 and 30 words per comment.
* We can see that there is no clear correlation between the variance of word counts and the words per comment.
* Instead, the observations are strongly concentrated where there are 25-30 words per comment and the variance of word counts is 100-150.
* This is also visible by analyzing the histogram plots on both the axis.

In [None]:
def plot_jointplot(mbti_type, axs, titles):
    """Draw a hexbin joint plot of the word-count features for one MBTI type.

    Parameters
    ----------
    mbti_type
        Value compared against ``df['ptype']`` to select the rows to plot.
    axs
        Ignored; kept so existing calls keep working. ``sns.jointplot`` is a
        figure-level function and always builds its own figure, so it cannot
        draw into a supplied axes.
    titles
        Text used as the title of the resulting figure.

    Returns
    -------
    The grid object returned by ``sns.jointplot``.
    """
    subset = df[df['ptype'] == mbti_type]
    grid = sns.jointplot(x="variance_of_word_counts", y="words_per_comment",
                         data=subset, kind="hex")
    # Title the figure as a whole; plt.title() would only title whichever
    # axes happens to be current.
    grid.fig.suptitle(titles)
    grid.fig.subplots_adjust(top=0.93)  # leave room for the title
    return grid


# One joint plot for every personality type present in the data. The earlier
# hard-coded 2 x 6 loop stopped after 12 types.
for mbti_type in df['ptype'].unique():
    plot_jointplot(mbti_type, None, mbti_type)
plt.show()

In [None]:
df["length_posts"] = df["posts"].apply(len)

sns.distplot(df["length_posts"]).set_title("Distribution of Lengths of all 50 Posts")

In [None]:
fig = px.histogram(df, x="length_posts", hover_data=df.columns, barmode="overlay",
                  title='Length of posts')
fig.show()

In [None]:
#If you need to install tabulate

#!pip install tabulate

In [None]:
#Finding the most common words in all posts.
# Flatten every whitespace-separated token of every row into a single list.
words = [token for post in df["posts"] for token in post.split()]

from tabulate import tabulate

# Print the 40 most frequent tokens as a two-column table.
print(tabulate(Counter(words).most_common(40), headers=['Word', 'Frequency']))

In [None]:
# WordCloud is used below but is not among the notebook's top-level imports.
from wordcloud import WordCloud

# Word cloud over every token collected in `words`, on a white background.
# collocations=False stops word pairs from being counted as separate entries,
# so the cloud doesn't appear as if it contains duplicate words.
wordcloud = WordCloud(width=1200, height=500, background_color="white",
                      collocations=False).generate(" ".join(words))
plt.figure(figsize=(25,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# WordCloud is used below but is not among the notebook's top-level imports.
from wordcloud import WordCloud

# One word cloud per personality type, laid out on a grid with 4 columns.
# The label column is 'ptype', as everywhere else in this notebook.
ptypes = df['ptype'].unique()
n_cols = 4
n_rows = -(-len(ptypes) // n_cols)  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
panels = ax.flatten()

# Hide the axis decorations of every panel, including any left unused.
for panel in panels:
    panel.axis("off")

for panel, ptype in zip(panels, ptypes):
    type_posts = df.loc[df['ptype'] == ptype, 'posts']
    # Join the raw texts; Series.to_string() would also render the row index
    # labels into the text fed to the word cloud.
    wordcloud = WordCloud(max_words=1628, relative_scaling=1, background_color="white",
                          normalize_plurals=False).generate(" ".join(type_posts))
    panel.imshow(wordcloud, interpolation='bilinear')
    panel.set_title(ptype)

* We can see there are a number of irrelevant words present in the dataset (e.g. ha, ar, Ti etc.) which will need to be removed
* Interestingly, among the most common words in the word clouds of the individual personality types are the names of the MBTI personality types themselves.

It would hence be necessary to clean our posts by removing these MBTI words from each of them as part of our pre-processing stage, before training the model for better evaluation results.

#### Counting the no. of users and posts in the given MBTI Kaggle dataset

In [None]:
def extract(posts, new_posts):
    """Append one ``(label, post)`` tuple to *new_posts* for every single post.

    ``posts[1]`` is split on the ``|||`` separator and each piece is paired
    with ``posts[0]``. *new_posts* is extended in place; nothing is returned.
    """
    pieces = posts[1].split("|||")
    label = posts[0]
    new_posts.extend((label, piece) for piece in pieces)

posts = []
df.apply(lambda x: extract(x, posts), axis=1)
print("Number of users", len(df))
print("Number of posts", len(posts))
#print("5 posts from start are:")
#posts[0:5]

* It can be inferred that a lot of hyperlinks are present in these posts
* It is safe to assume that URL links do not provide any real information about a user's personality; hence, we need to clean these from our dataset too.

This given sample dataset does not come from the entire Kaggle user population; rather, it comes from Kaggle users who leave comments; thus, our ML model's conclusion cannot be applied to all Kaggle users, only to those who leave comments.

Furthermore, with more data, more accurate models could be obtained. As a result, the model may fail to classify a personality at the lower end.

### Pre-Processing Stage

In [None]:
def preprocess_text(df, remove_special=True):
    """Return a copy of *df* whose ``posts`` column has been cleaned.

    The caller's frame is left untouched. Each entry of ``df['posts']`` goes
    through these steps, in order:

    1. ``|`` separators become spaces and http/https links are removed.
    2. ``.``, ``?`` and ``!`` are replaced by the marker words
       ``EOSTokenDot``, ``EOSTokenQuest`` and ``EOSTokenExs``.
    3. Every character that is not an ASCII letter or whitespace is dropped.
    4. The text is lower-cased.
    5. Words containing a letter repeated three or more times in a row
       (e.g. ``sooo``) are removed.
    6. Words of 1-3 letters and words of 30 or more letters are removed.
    7. If *remove_special* is true, the 16 MBTI type codes are removed
       wherever they occur, so the labels cannot leak into the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``posts`` column of strings.
    remove_special : bool, default True
        Whether to strip the MBTI type codes (step 7).

    Returns
    -------
    pandas.DataFrame
        A copy of *df* with the cleaned ``posts`` column.
    """
    mbti_pattern = None
    if remove_special:
        pers_types = ['INFP', 'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP', 'ISFP',
                      'ENTJ', 'ISTJ', 'ENFJ', 'ISFJ', 'ESTP', 'ESFP', 'ESFJ', 'ESTJ']
        mbti_pattern = re.compile("(" + "|".join(p.lower() for p in pers_types) + ")")

    def _clean(text):
        # Remove links (the trailing space lets a link at the very end match)
        text = re.sub(r'https?:\/\/.*?[\s+]', '', text.replace("|", " ") + " ")

        # Keep the End Of Sentence characters as marker words
        text = re.sub(r'\.', ' EOSTokenDot ', text)
        text = re.sub(r'\?', ' EOSTokenQuest ', text)
        text = re.sub(r'!', ' EOSTokenExs ', text)

        # Strip punctuation, digits and any other non-letter. This single pass
        # covers the former '[\.+]' and '[^\w\s]' passes as well: every
        # character they removed is also a non-letter.
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Convert posts to lowercase
        text = text.lower()

        # Remove words with a letter repeated 3+ times. The match is limited to
        # the word itself; the previous pattern ended in '[\s|\w]*', which at
        # this point matches everything up to the end of the text and so threw
        # away the remainder of the user's posts.
        text = re.sub(r'\b[a-z]*([a-z])\1{2,}[a-z]*\b', '', text)

        # Remove very short or long words
        text = re.sub(r'\b[a-z]{1,3}\b', '', text)
        text = re.sub(r'\b[a-z]{30,}\b', '', text)

        # Remove MBTI personality words - crucial in order to get a valid
        # model accuracy estimation for unseen data.
        if mbti_pattern is not None:
            text = mbti_pattern.sub('', text)
        return text

    cleaned = df.copy()
    cleaned["posts"] = cleaned["posts"].apply(_clean)
    return cleaned

#Preprocessing of entered Text
new_df = preprocess_text(df,remove_special=True)

In [None]:
new_df

In [None]:
new_df = new_df.drop(['words_per_comment','variance_of_word_counts', 'length_posts'],axis=1) 

In [None]:
new_df

In [None]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split *text* on whitespace and return the list of lemmatized tokens.

    Uses the notebook-level ``w_tokenizer`` and ``lemmatizer`` objects.
    """
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

In [None]:
new_df['posts'] = new_df.posts.apply(lemmatize_text)

In [None]:
new_df

In [None]:
#data = data.drop(columns=['words_per_comment','variance_of_word_counts','length_posts'])

In [None]:
new_df

### Feature Engineering

In [None]:
# Converting MBTI personality (or target or Y feature) into numerical form using Label Encoding
# encoding personality type
# Don't think this column is necessary

enc = LabelEncoder()
new_df['type of encoding'] = enc.fit_transform(new_df['ptype'])

target = new_df['type of encoding'] 

In [None]:
new_df.head(16)

### Four Classifiers across MBTI axis

In [None]:
def get_types(row):
    """Turn the 4-letter code in ``row['ptype']`` into four binary indicators.

    Returns a ``pd.Series`` with the keys ``IE``, ``NS``, ``TF`` and ``JP``.
    Each value is 1 when the corresponding letter is I, N, T or J and 0 when
    it is E, S, F or P. Any other letter prints a "not found" message and
    leaves the indicator at 0.
    """
    code = row['ptype']

    # (output key, letter encoded as 1, letter encoded as 0, message otherwise)
    axes = (
        ('IE', 'I', 'E', 'I-E not found'),
        ('NS', 'N', 'S', 'N-S not found'),
        ('TF', 'T', 'F', 'T-F not found'),
        ('JP', 'J', 'P', 'J-P not found'),
    )

    flags = {}
    for position, (key, one_letter, zero_letter, message) in enumerate(axes):
        letter = code[position]
        if letter == one_letter:
            flags[key] = 1
            continue
        if letter != zero_letter:
            print(message)
        flags[key] = 0
    return pd.Series(flags)

data = new_df.join(new_df.apply (lambda row: get_types (row),axis=1))
data.head(15)

In [None]:
data.head(10)

* Using the above code, if a person is I, N, T or J, the value on the corresponding MBTI axis (IE, NS, TF and JP respectively) will be 1; otherwise it will be 0.

This will help us calculate, for example, how many Introvert posts are present versus how many Extrovert posts are present among all the entries in our labelled Kaggle dataset. This is done in order to explore the dataset for each of the individual personality indices of MBTI.

###### Counting No. of posts in one class / Total no. of posts in the other class

In [None]:
# get_types encodes I/N/T/J as 1 and E/S/F/P as 0, and value_counts() is
# indexed by those labels. The count of label 1 is therefore printed first so
# that each number lines up with the trait named first in its caption.
for caption, column in (
    ("Introversion (I) /  Extroversion (E):\t", 'IE'),
    ("Intuition (N) / Sensing (S):\t\t", 'NS'),
    ("Thinking (T) / Feeling (F):\t\t", 'TF'),
    ("Judging (J) / Perceiving (P):\t\t", 'JP'),
):
    counts = data[column].value_counts()
    print(caption, counts[1], " / ", counts[0])

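* We infer that there is an unequal distribution even within each of the 4 axes in the entries of our dataset. When reading the counts, note that label 1 encodes I, N, T, J and label 0 encodes E, S, F, P; read that way, I is the majority on the IE axis and N is the majority on the NS axis, while TF and JP have relatively less difference between them.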

In [None]:
#Plotting the distribution of each personality type indicator
axis_columns = ['IE', 'NS', 'TF', 'JP']
N = len(axis_columns)
axis_counts = [data[column].value_counts() for column in axis_columns]

# get_types encodes I/N/T/J as 1 and E/S/F/P as 0, so the label-1 counts form
# the bottom segment (legend "I, N, T, J") and the label-0 counts the top one.
bottom = tuple(counts[1] for counts in axis_counts)
top = tuple(counts[0] for counts in axis_counts)

ind = np.arange(N)    # the x locations for the groups
# the width of the bars
width = 0.7           # or len(x) can also be used here

p1 = plt.bar(ind, bottom, width, label="I, N, T, J")
p2 = plt.bar(ind, top, width, bottom=bottom, label="E, S, F, P")

plt.title('Distribution across type indicators')
plt.ylabel('Count')
plt.xticks(ind, ('I / E',  'N / S', 'T / F', 'J / P',))
plt.legend()

plt.show()

In [None]:
axis = data[["IE", "NS","TF","JP"]].plot(kind="bar", stacked=True)

fig = axis.get_figure()

Fun fact: the results above can be compared with real-life findings by researchers across various personality and psychological studies.

For example, introverts are reported to be a minority, making up roughly 16 percent of people [1]. Even among introverts there are varying degrees, and Carl Jung said, “There is no such thing as a pure extrovert or a pure introvert”. Hence it is tricky to classify a person as exactly one type.

While the population is split roughly 50/50 on the other dimensions, a full 70% of people show a preference for Sensing over Intuition when taking a personality test. Because Intuitives are the minority, the onus is on them to adjust to the Sensor way of thinking.

The differences between Judging and Perceiving are probably the most marked differences of all the four preferences. People with strong Judging preferences might have a hard time accepting people with strong Perceiving preferences, and vice-versa. On the other hand, a "mixed" couple (one Perceiving and one Judging) can complement each other very well, if they have developed themselves enough to be able to accept each other's differences.

##### Features Correlation Analysis

In [None]:
data[['IE','NS','TF','JP']].corr()

In [None]:
#Stem the posts

from nltk.stem.porter import *
stemmer = PorterStemmer()

In [None]:
data['posts'] = data['posts'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

In [None]:
data.head(15)

In [None]:
#Remove stopwords from the posts

from nltk.corpus import stopwords
# A set gives constant-time membership tests; the list returned by NLTK would
# otherwise be scanned once for every token of every row.
stop_words = set(stopwords.words('english'))

# Each row is a list of tokens at this point; drop the stopwords and join the
# remaining tokens back into one space-separated string.
data['posts'] = data['posts'].apply(lambda x: ' '.join([word for word in x if word not in stop_words]))

In [None]:
data.head(15)

In [None]:
#Put the posts alone in a variable
PostsAlone = data['posts']

In [None]:
PostsAlone

In [None]:
#Converting the posts into text features using sklearn’s TF-IDF
#Might want to change the min_df and ngram_range
#Fit first then transform
#We will train our models on these transformations
# NOTE(review): the vectorizer is fitted on every row of PostsAlone, i.e. before
# the train/test split performed further down, so the vocabulary and IDF weights
# are learned from rows that later end up in the test set. Consider fitting on
# the training rows only.

from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=0.2 keeps only terms that occur in at least 20% of the documents;
# ngram_range=(1,3) builds unigrams, bigrams and trigrams.
vec = TfidfVectorizer(min_df=0.2, ngram_range=(1,3))
vec.fit(PostsAlone)
features = vec.transform(PostsAlone)

In [None]:
print(features)

In [None]:
#Train-test split for the logisitc regression model

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, data['IE'], 
                                                    train_size=0.8, random_state=1)
print('training data:', X_train.shape)

print('test data:', X_test.shape)

In [None]:
#Train a logistic regression model on the IE target
#(y_train was split from data['IE'], where 1 = Introversion and 0 = Extraversion)

log_reg = LogisticRegression(solver='lbfgs', max_iter=3000)

# fit the model to the training data; fit() returns the estimator itself,
# so clf and log_reg refer to the same fitted model
clf = log_reg.fit(X_train, y_train)

In [None]:
#Accuracies of the model

print('\ntraining accuracy: {}'.format(clf.score(X_train, y_train).round(3)))
print('test accuracy: {}'.format(clf.score(X_test, y_test).round(3)))

In [None]:
# Regularization w/ the C parameter (note: default C = 1)
# Best c's for test accuracy appear to be 0.001, 0.01, 0.1
# The sweep uses its own variable names so that the `log_reg` / `clf` model
# fitted earlier (and used later for prediction) is not silently replaced by
# whichever C value happens to be tried last.

cset = [.001, .01, .1, 1, 10]
for c_value in cset:
    print('C =', c_value)
    sweep_clf = LogisticRegression(solver='lbfgs', max_iter=1000, C=c_value).fit(X_train, y_train)
    print('training accuracy: {}'.format(sweep_clf.score(X_train, y_train).round(3)))
    print('test accuracy: {}'.format(sweep_clf.score(X_test, y_test).round(3)), '\n')

In [None]:
#Cross-validation w/ tuning regularization in logistic regression

for i in cset:
    print('C =', i)
    log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, C=i)
    scores = cross_val_score(log_reg, features, data['IE'], cv=5)
    print(scores)
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2), '\n')

In [None]:
#Logistic Regression with Grid search
#Had to use MaxAbScaler because I got an error w/MinMaxScaler. Not sure why
# NOTE(review): presumably the error is because X_train is a sparse TF-IDF
# matrix, which MinMaxScaler does not accept, whereas MaxAbsScaler does - confirm.
#Best parameter here is "C:0.1"

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler


# define x_scaler
scaler = MaxAbsScaler()

# NOTE(review): the scaler is fitted on all of X_train before the grid search,
# so each cross-validation fold is scaled with statistics from the full
# training set.
scaled_X = scaler.fit_transform(X_train)

# candidate values for the inverse regularization strength C
tuned_parameters = {'C': [0.1, 0.5, 1, 5, 10, 50, 100]}

# 3-fold cross-validated search scored on accuracy
grid = GridSearchCV(LogisticRegression(solver='liblinear'), tuned_parameters, cv=3, scoring="accuracy")

grid.fit(scaled_X, y_train)

# one entry per candidate C, in the order listed in tuned_parameters
print('mean of accuracies:', grid.cv_results_['mean_test_score'])
print('std dev of accuracies:', grid.cv_results_['std_test_score'])

# print best parameter after tuning 
print('best parameters:', grid.best_params_) 

# store the best estimator (it was trained on scaled_X, not on X_train)
best_logreg = grid.best_estimator_

In [None]:
#SVM model
#Didn't run this because it would take too long

svm = SVC(kernel = 'linear')

#fit the model to the training data
clf2 = svm.fit(X_train, y_train)

# get accuracy stats
print('training accuracy: {}'.format(clf2.score(X_train, y_train).round(3)))
print('test accuracy: {}'.format(clf2.score(X_test, y_test).round(3)))

In [None]:
#Grid Search with SVM Model
from sklearn.pipeline import Pipeline

# add in a pipeline to control data leakage
steps = [('scaler', MaxAbsScaler()), ('SVM', SVC())]

# define the pipeline object
pipeline = Pipeline(steps)

params = {'SVM__kernel': ['linear'], 'SVM__C': [0.1, 1, 10, 100]} 

# run grid search
grid = GridSearchCV(pipeline, param_grid=params, cv=3)
grid.fit(X_train, y_train)

# print mean and standard deviation of scores by iteration
print('mean of accuracies:', grid.cv_results_['mean_test_score'])
print('std dev of accuracies:', grid.cv_results_['std_test_score'])

# print best parameter after tuning 
print(grid.best_params_) 

# store the best estimator
best_svm = grid.best_estimator_

In [None]:
#Random Forests with grid search

#set-up grid of parameters to search
# max_samples is given as floats throughout: a float is the fraction of rows
# drawn for each tree, whereas the integer 1 would mean "draw exactly one row
# per tree" rather than "use all rows".
param_grid = {'n_estimators': [10, 100, 250], 'max_samples': [.25, .5, 1.0]}

# instantiate grid search object
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv = 3)

# fitting the model for grid search
grid.fit(X_train, y_train)

# print parameters, mean, and standard deviation of scores by iteration
results = grid.cv_results_
for params, mean_acc, std_acc in zip(results['params'],
                                     results['mean_test_score'],
                                     results['std_test_score']):
    print('\nparams:', params)
    print('mean of accuracies:', mean_acc)
    print('std dev of accuracies:', std_acc)

# print best parameter after tuning
print('best parameters:', grid.best_params_)
print('best score:', grid.best_score_)

# store the best estimator
best_rf = grid.best_estimator_

In [None]:
#Logistic Regression: 60.7 accuracy
#SVM: Around 60.2 accuracy
#Random Forests: 60.49

#Logistic the Regression is the best but not by much at all

In [None]:
#Bringing in Trump's tweets

mydata = pd.read_csv('../input/trump-tweets/Trump Tweets1.csv')
print(mydata.shape, mydata.columns.to_list(),'\n')
mydata

In [None]:
#Pre-process the new dataset

def preprocess_text2(df, remove_special=True):
    """Return a copy of *df* whose ``tweets`` column has been cleaned.

    Works on the frame passed in as *df* (it no longer reaches for the global
    ``mydata``) and leaves the caller's frame untouched. Each entry of
    ``df['tweets']`` goes through these steps, in order:

    1. ``|`` characters become spaces and http/https links are removed.
    2. ``.``, ``?`` and ``!`` are replaced by the marker words
       ``EOSTokenDot``, ``EOSTokenQuest`` and ``EOSTokenExs``.
    3. Every character that is not an ASCII letter or whitespace is dropped.
    4. The text is lower-cased.
    5. Words containing a letter repeated three or more times in a row
       (e.g. ``sooo``) are removed.
    6. Words of 1-3 letters and words of 30 or more letters are removed.
    7. If *remove_special* is true, the 16 MBTI type codes are removed
       wherever they occur.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweets`` column of strings.
    remove_special : bool, default True
        Whether to strip the MBTI type codes (step 7).

    Returns
    -------
    pandas.DataFrame
        A copy of *df* with the cleaned ``tweets`` column.
    """
    mbti_pattern = None
    if remove_special:
        pers_types = ['INFP', 'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP', 'ISFP',
                      'ENTJ', 'ISTJ', 'ENFJ', 'ISFJ', 'ESTP', 'ESFP', 'ESFJ', 'ESTJ']
        mbti_pattern = re.compile("(" + "|".join(p.lower() for p in pers_types) + ")")

    def _clean(text):
        # Remove links (the trailing space lets a link at the very end match)
        text = re.sub(r'https?:\/\/.*?[\s+]', '', text.replace("|", " ") + " ")

        # Keep the End Of Sentence characters as marker words
        text = re.sub(r'\.', ' EOSTokenDot ', text)
        text = re.sub(r'\?', ' EOSTokenQuest ', text)
        text = re.sub(r'!', ' EOSTokenExs ', text)

        # Strip punctuation, digits and any other non-letter. This single pass
        # covers the former '[\.+]' and '[^\w\s]' passes as well: every
        # character they removed is also a non-letter.
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Convert tweets to lowercase
        text = text.lower()

        # Remove words with a letter repeated 3+ times. The match is limited to
        # the word itself; the previous pattern ended in '[\s|\w]*', which at
        # this point matches everything up to the end of the text and so threw
        # away the remainder of the tweet.
        text = re.sub(r'\b[a-z]*([a-z])\1{2,}[a-z]*\b', '', text)

        # Remove very short or long words
        text = re.sub(r'\b[a-z]{1,3}\b', '', text)
        text = re.sub(r'\b[a-z]{30,}\b', '', text)

        # Remove MBTI personality words so the cleaning matches what the
        # training posts went through.
        if mbti_pattern is not None:
            text = mbti_pattern.sub('', text)
        return text

    cleaned = df.copy()
    cleaned["tweets"] = cleaned["tweets"].apply(_clean)
    return cleaned

#Preprocessing of entered Text
df8 = preprocess_text2(mydata,remove_special=True)

In [None]:
df8

In [None]:
#Lemmatize the new data

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split *text* on whitespace and return the list of lemmatized tokens.

    Re-definition with the same behaviour as the ``lemmatize_text`` defined
    earlier in this notebook; relies on the notebook-level ``w_tokenizer`` and
    ``lemmatizer`` objects.
    """
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

In [None]:
#Lemmatize the new data

df8['tweets'] = df8.tweets.apply(lemmatize_text)

In [None]:
#Stem the new data

from nltk.stem.porter import *
stemmer = PorterStemmer()

In [None]:
#Stem the new data

df8['tweets'] = df8['tweets'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.

In [None]:
#Remove stopwords from the new data

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

df8['tweets']= df8['tweets'].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))

In [None]:
df8

In [None]:
#Put the tweets in a list

List = df8['tweets'].to_list()

In [None]:
List

In [None]:
#Join this information

Joining = ' '.join(List)

In [None]:
Joining

In [None]:
# Change this join back to a list

Joining2 = [Joining]

In [None]:
#Vectorize and fit all of the tweets combined
features2 = vec.transform(Joining2)

In [None]:
print(features2)

In [None]:
# Predict these features using our old logistic regression model (stored as clf) 
# This is for predicting [I] or [E] or [1] or [0]

pred = clf.predict(features2)

In [None]:
print(pred)


#Trump's personality trait is ESFP, Here, our model predicts that he's 1 or I.

In [None]:
#Probability distribution of whether he's 0 or 1/E or I

pred2 = clf.predict_proba(features2)

In [None]:
#Probability distribution of whether he's 0 or 1/E or I

pred2