In [1]:
import pandas as pd

In [2]:
# reading data in csv file using pandas
df = pd.read_csv('filtered_tweets.csv')

In [3]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
1,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
2,0,1467813579,Mon Apr 06 22:20:31 PDT 2009,NO_QUERY,starkissed,@LettyA ahh ive always wanted to see rent lov...
3,0,1467818603,Mon Apr 06 22:21:49 PDT 2009,NO_QUERY,kennypham,"Sad, sad, sad. I don't know why but I hate thi..."
4,0,1467819650,Mon Apr 06 22:22:05 PDT 2009,NO_QUERY,antzpantz,@Viennah Yay! I'm happy for you with your job!...


In [4]:
# Remove mentions, emails and links
def removeAt(tweet):
    words = tweet.split(' ')
    return ' '.join(filter(lambda x: x and '@' not in x, words))

def removeLinks(tweet):
    words = tweet.split(' ')
    return ' '.join(filter(lambda x: not ('http://' in x.lower() or 'https://' in x.lower()), words))

In [5]:
# Apply formulas to clean data
df.text = df.text.apply(lambda x: removeAt(removeLinks(x.strip())))

In [6]:
# Cleaned dataframe 
df.drop_duplicates(subset='text', keep=False, inplace=True)

# sort dataframe by user 
df.sort_values(by='user', inplace=True)
df.head()

Unnamed: 0,target,id,date,flag,user,text
14524,0,1835705341,Mon May 18 06:34:45 PDT 2009,NO_QUERY,12gaBrowningGal,Blind. Definitely. They aren't the smartest bi...
68041,0,2211076285,Wed Jun 17 12:24:07 PDT 2009,NO_QUERY,12gaBrowningGal,"I'm killing me too! I'm hungry now- Pie, Outdo..."
85046,0,2299986642,Tue Jun 23 13:35:33 PDT 2009,NO_QUERY,12gaBrowningGal,My skeet game bites! I am not joking. I will h...
203184,4,2071364168,Sun Jun 07 19:16:31 PDT 2009,NO_QUERY,12gaBrowningGal,That's just because she trying to figure out w...
207475,4,2178187758,Mon Jun 15 07:19:40 PDT 2009,NO_QUERY,12gaBrowningGal,I am really not skilled enough to shoot trap. ...


In [7]:
# Create a dictionary consolidating users and tweets to send to LIWC 
user_list = df['user'].tolist()
tweets_list = df['text'].tolist()

user_tweet_dic = {}

for i in range(len(user_list)):
    user = user_list[i]
    tweet = tweets_list[i]
    
    if user not in user_tweet_dic:
        user_tweet_dic[user] = []
        user_tweet_dic[user].append(tweet)
    else:
        user_tweet_dic[user].append(tweet)
        
new_user_list = []
new_combined_tweets_list = []

for user, tweets in user_tweet_dic.items():
    combined_tweets = ' '.join(tweets)
    if len(combined_tweets.split(' ')) >= 100:
        new_user_list.append(user)
        new_combined_tweets_list.append(combined_tweets)
    
# Create a dataframe based on dictionary and exporting as a csv file
consolidated_tweets_df = pd.DataFrame({'user': new_user_list, 'tweets': new_combined_tweets_list})
consolidated_tweets_df.to_csv('consolidated_tweets_df.csv')

<div class='alert alert-info'>
    Imagine the group taking that csv file and putting it through the LIWC software to get the output! :D
    <br>
    The output will be saved in a file called LIWC2015 Results (consolidated_tweets_df.csv).csv that will be imported later
</div>

In [8]:
# Creating a dataframe for the first 500 users 
first_500_users = consolidated_tweets_df['user'].unique()[:500]
first_500_users_df = df[df['user'].isin(first_500_users)]

# Creating a dictionary of user and their tweets 
user_tweet_dictionary = {}

# Create a format that follows IBM Watson's format and saving it to the user_tweet_dictionary
for index, row in first_500_users_df.iterrows():
    user = row['user']
    text = row['text']
    
    tweet = {}
    
    tweet['content'] = text
    tweet['contenttype'] = 'text/plain'
    tweet['language'] = 'en'
    
    if user not in user_tweet_dictionary:
        user_tweet_dictionary[user] = {'contentItems': []}
        user_tweet_dictionary[user]['contentItems'].append(tweet)
    else:
        user_tweet_dictionary[user]['contentItems'].append(tweet)

<div class='alert alert-info'>
    The bottom few cells are commented out because we are using API calls to get the output, and after a certain number of calls, we will have to pay money!
    <br>
    Instead, we have saved the file called big5_df.csv that will also be imported later
</div>

In [38]:
# import json
# from ibm_watson import PersonalityInsightsV3
# from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
# from credentials import IBM_KEY

# #API KEY provided on the service page 
# KEY = IBM_KEY

# # Authentication via IBM's IAM (Identity and Access Management)
# authenticator = IAMAuthenticator(KEY)

# # Creating a service instance
# service = PersonalityInsightsV3(
#     version='2017-10-13',
#     authenticator=authenticator)

# # Setting service endpoint 
# service.set_service_url('https://gateway.watsonplatform.net/personality-insights/api')

In [10]:
# # Creating a dictionary to store results from IBM Watson
# results = {'User': [],
#           'Openness': [],
#           'Conscientiousness': [],
#           'Extraversion': [],
#           'Agreeableness': [],
#           'Emotional range': []}

# # creates profiles of users in user_tweet_dictionary and saves them to results
# for user, tweets in user_tweet_dictionary.items():
#     profile = service.profile(tweets, 'application/json', raw_scores=True, consumption_preferences=True).get_result()
    
#     results['User'].append(user)
#     results['Openness'].append(profile['personality'][0]['raw_score'])
#     results['Conscientiousness'].append(profile['personality'][1]['raw_score'])
#     results['Extraversion'].append(profile['personality'][2]['raw_score'])
#     results['Agreeableness'].append(profile['personality'][3]['raw_score'])
#     results['Emotional range'].append(profile['personality'][4]['raw_score'])

In [11]:
# # Create a dataframe from results dictionary and save csv to use
# big5_df = pd.DataFrame(results)
# big5_df.to_csv('big5_df.csv')

In [12]:
# Importing both the outputs of LIWC and IBM Watson (Big 5)
LIWC_df = pd.read_csv('LIWC2015 Results (consolidated_tweets_df.csv).csv')
LIWC_df.drop(0, inplace=True)
LIWC_df.drop(columns=['A', 'C'], inplace=True)
LIWC_df.rename(columns={'B': 'User'}, inplace=True)
big5_df = pd.read_csv('big5_df.csv').drop(columns=['Unnamed: 0'])

# Merge dataframes together
updated_df = pd.merge(big5_df, LIWC_df)
updated_df.head()

Unnamed: 0,User,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range,WC,Analytic,Clout,Authentic,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,12gaBrowningGal,0.764999,0.656144,0.548956,0.788804,0.439579,388,27.36,34.94,68.96,...,2.58,0.0,0.0,0.52,12.37,0.52,0.0,3.61,0.0,0.0
1,15Stepz,0.748124,0.690716,0.564859,0.82393,0.41077,254,26.07,70.94,84.8,...,9.45,0.39,0.0,1.18,0.39,1.57,0.0,4.33,1.57,0.0
2,16_MileyCyrus,0.672814,0.637773,0.525612,0.811025,0.563704,1164,21.04,64.42,58.8,...,3.26,0.52,0.0,2.15,13.57,0.43,0.0,1.37,0.26,1.03
3,18percentgrey,0.789412,0.607883,0.528168,0.772518,0.461528,931,47.25,53.87,58.83,...,4.83,0.64,0.75,1.61,2.26,1.93,0.0,3.22,1.29,2.26
4,19fischi75,0.706904,0.627109,0.447471,0.803232,0.518279,1707,48.43,48.83,63.95,...,0.7,0.23,0.0,3.1,1.64,3.87,0.0,0.0,2.58,3.81


In [13]:
def normalise(number):
    return round(number*100,2)

updated_df['Openness'] = updated_df['Openness'].apply(normalise)
updated_df['Conscientiousness'] = updated_df['Conscientiousness'].apply(normalise)
updated_df['Extraversion'] = updated_df['Extraversion'].apply(normalise)
updated_df['Agreeableness'] = updated_df['Agreeableness'].apply(normalise)
updated_df['Emotional range'] = updated_df['Emotional range'].apply(normalise)

In [14]:
def readability_score(row):
    sixletter = row['Sixltr']
    wps = row['WPS'] 
    wordcount = row['WC']
    
    readability = (-1 * sixletter) + wps - wordcount + 3
    return readability 

updated_df['readability'] = updated_df.apply(readability_score, axis=1)

In [15]:
def readability_percentile(score):
    readability_range = updated_df['readability'].max() - updated_df['readability'].min()
    return -score/readability_range*100

updated_df['readability'] = updated_df['readability'].apply(readability_percentile)

In [16]:
def composite_score(row):    
    #intermediate numbers
    sixletter = row['Sixltr']
    articles = row['article']
    pasttense = row['focuspast']
    presenttense = row['focuspresent']
    firstperson = 100 - row['i'] #inverse of first person singular pronoun
    discrepancy = row['discrep']
    wps = row['WPS'] #words per sentence
    wordcount = row['WC']
    
    #features
    #Distancing (0.19) - six letter words, articles, past tense, and the inverse of first person singular pronouns, present tense and discrepancy words
    
    Distancing = (sixletter + articles + pasttense + firstperson + presenttense + discrepancy)/6 #average of 6 intermediate numbers
    
    #Readability (-0.32) - multiplying negative one by six letter words, adding words per sentence, subtracting the amount of words recognized by the LIWC dictionary, and adding three
#     Readability = (-1 * sixletter) + wps - wordcount + 3
    Readability = row['readability']
    
    #Swear (0.31) 
    Swear = row['swear']
    
    #Anger (0.22)
    Anger = row['anger']
    
    #Agreeability (-0.43)
    Agreeability = row['Agreeableness']
        
    #Neuroticism (0.3)
    Neuroticism = row['Emotional range']
    
    #calculate composite score
    Total = 0.19 + 0.32 + 0.31 + 0.22 + 0.3 + 0.43
    Psychopathy_Score = 0.19/Total * Distancing - 0.32/Total * Readability + 0.31/Total * Swear + 0.22/Total * Anger - 0.43/Total * Agreeability + 0.3/Total * Neuroticism
    
    #write into dictionary
    return Psychopathy_Score

updated_df['composite score'] = updated_df.apply(composite_score, axis=1)

In [17]:
def score_percentile(score):
    max_score = updated_df['composite score'].max()
    min_score = updated_df['composite score'].min()
    score_range = max_score - min_score
    return (score-min_score)/score_range*100

updated_df['composite score percentile'] = updated_df['composite score'].apply(score_percentile)

In [18]:
updated_df = updated_df[['User', 'Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Emotional range', 'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic', 'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family', 'friend', 'female', 'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achieve', 'power', 'reward', 'risk', 'focuspast', 'focuspresent', 'focusfuture', 'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home', 'money', 'relig', 'death', 'informal', 'swear', 'netspeak', 'assent', 'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP', 'readability', 'composite score', 'composite score percentile']]

In [19]:
updated_df = updated_df.sort_values(by='composite score', ascending=False).reset_index(drop=True)
updated_df.head(30)

Unnamed: 0,User,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range,WC,Analytic,Clout,Authentic,...,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP,readability,composite score,composite score percentile
0,Abby_ox,64.52,51.86,39.56,66.06,77.37,305,56.42,4.43,97.8,...,0.0,0.66,0.0,0.0,3.28,0.0,2.3,13.664762,-2.891426,100.0
1,BethanyAnn614,67.97,56.48,49.19,72.73,87.82,379,54.79,23.01,73.69,...,0.26,3.43,0.0,0.0,1.06,0.53,1.06,17.784407,-2.944168,99.754655
2,Benjimonicus,68.73,52.83,42.49,63.95,76.37,385,59.27,26.65,78.4,...,1.04,1.04,0.26,0.0,4.68,0.0,0.52,17.613205,-3.044657,99.2872
3,BiGVixXen,68.36,50.65,42.85,72.13,81.86,225,9.74,8.43,99.0,...,0.44,11.56,0.0,0.0,8.0,0.0,0.0,10.407696,-3.155676,98.770759
4,DaniScot,69.67,50.53,41.53,67.4,75.14,289,65.89,9.44,99.0,...,0.35,5.88,0.0,0.0,3.11,0.69,0.69,13.353784,-3.633829,96.546479
5,CoConutShelle,63.47,57.43,47.24,72.9,81.17,289,60.29,18.41,53.02,...,2.08,7.61,0.35,0.0,2.08,0.0,0.0,13.433522,-3.821409,95.673893
6,BonjourHoney,75.07,50.75,43.4,74.96,72.85,171,6.03,59.24,50.35,...,1.75,0.0,0.0,0.0,5.85,0.0,0.0,7.616863,-4.538181,92.339604
7,Bastante_P,69.27,49.89,46.49,68.52,68.92,305,43.39,14.7,80.82,...,0.66,3.93,0.0,0.0,4.92,0.33,0.66,13.180705,-4.573028,92.177502
8,ACsBarbieGirl69,66.75,52.23,44.4,70.72,70.38,306,33.8,13.32,62.28,...,0.98,14.71,0.33,0.0,0.0,0.0,0.98,14.223398,-4.757017,91.321616
9,Beccixo,68.74,55.63,47.5,67.37,65.93,269,83.28,27.61,81.57,...,0.0,0.0,0.37,0.0,0.37,0.37,0.74,10.827494,-4.775903,91.233765


In [20]:
# boy_kill_boy and up are psychopaths
updated_df['psychopath'] = list(range(500))
updated_df['psychopath'][0:22] = 1
updated_df['psychopath'][22:] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [21]:
# replace features by text of user 
updated_df = updated_df[['User','psychopath']]
updated_df = pd.merge(updated_df,consolidated_tweets_df.rename(columns={'user':'User'}),how='left')[['User','tweets','psychopath']]

In [22]:
updated_df.head()

Unnamed: 0,User,tweets,psychopath
0,Abby_ox,wooo!! finished my lamp for Dt #LoveEverybody ...,1
1,BethanyAnn614,ugh. I don't like today.i much prefer tomorrow...,1
2,Benjimonicus,Walking to stow This family guy is OLD Morning...,1
3,BiGVixXen,no I didn't have time! I rushed outta the hous...,1
4,DaniScot,depends on the kind of video.... I'm pretty su...,1


In [23]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from collections import defaultdict
from sklearn import model_selection, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import preprocessor as p

In [24]:
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

word_Lemmatized = WordNetLemmatizer()

def preprocessing(text):
    final_words = []
    # lowercase
    text = p.clean(text)
    text = text.lower()
    # tokenize
    text = word_tokenize(text)
    
    for word, tag in pos_tag(text):
        # remove stopwords and only keep alphabets
        if word not in stopwords.words('english') and word.isalpha():
            # lemmatize words
            word_final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            # append final words to final_words list
            final_words.append(word_final)
            
    # return final string
    return ' '.join(final_words)

In [25]:
updated_df['cleaned_text'] = updated_df['tweets'].apply(preprocessing)

In [26]:
updated_df.head(10)

Unnamed: 0,User,tweets,psychopath,cleaned_text
0,Abby_ox,wooo!! finished my lamp for Dt #LoveEverybody ...,1,wooo finish lamp dt ca believe today really ni...
1,BethanyAnn614,ugh. I don't like today.i much prefer tomorrow...,1,ugh like much prefer tomorrow boo leave work h...
2,Benjimonicus,Walking to stow This family guy is OLD Morning...,1,walk stow family guy old morning get sleep lli...
3,BiGVixXen,no I didn't have time! I rushed outta the hous...,1,time rush outta house hair ruin rain already p...
4,DaniScot,depends on the kind of video.... I'm pretty su...,1,depends kind video pretty sure whole tonight c...
5,CoConutShelle,I want more followers... Follow me please! x i...,1,want follower follow please x rele want lipsy ...
6,BonjourHoney,"No, I just heard about it from someone Who bat...",1,hear someone bath puppy toilet jaw hurt like h...
7,Bastante_P,u forgot to add I'm also confined to a very sm...,1,u forget add also confine small part small as ...
8,ACsBarbieGirl69,ugh that sucks hun it does indeed!!!! See ya h...,1,ugh suck hun indeed see ya hun ill call u poin...
9,Beccixo,i really hate revising it is so boring and dos...,1,really hate revise boring dosnt go ur head bea...


In [27]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    updated_df['cleaned_text'],updated_df['psychopath'],test_size=0.3, random_state=42)

In [28]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(updated_df['cleaned_text'])
Train_X_Tfidf = Tfidf_vect.transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

In [29]:
cv = CountVectorizer(binary=True)
cv.fit(updated_df['cleaned_text'])
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

In [30]:
# logistic regression & cv
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, lr.predict(X_test)))

Accuracy: 0.9466666666666667




In [31]:
# logistic regression & Tf-idf
lr = LogisticRegression()
lr.fit(Train_X_Tfidf, y_train)
print("Accuracy:", accuracy_score(y_test, lr.predict(Test_X_Tfidf)))

Accuracy: 0.9466666666666667


In [32]:
# naive bayes
nb = naive_bayes.MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), lr.coef_[0]
    )
}

print('Logistic Regression & cv\n')
print('Words that predict psychopathy\n')
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:10]:
    print (best_positive)
    
print('======================================')
print('Words that predict non-psychopathy\n')
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:10]:
    print (best_negative)

Logistic Regression & cv

Words that predict psychopathy

('sausage', 0.6277481174226662)
('tai', 0.4634724226341537)
('ruin', 0.3667804121958295)
('bye', 0.2891842181374162)
('ugh', 0.28741289347150106)
('fuk', 0.28369162306603196)
('fukin', 0.28369162306603196)
('argh', 0.2826925270853313)
('bacon', 0.2781631249721203)
('brain', 0.27778190910892475)
Words that predict non-psychopathy

('good', -0.501469942780011)
('lol', -0.4145327869797684)
('work', -0.39758410450642157)
('quot', -0.39049547576999843)
('love', -0.38471334190137174)
('thanks', -0.3234263535041009)
('well', -0.3128893019731597)
('would', -0.2884208311964785)
('amp', -0.2875480584555549)
('one', -0.2786353644233663)


In [34]:
feature_to_coef = {
    word: coef for word, coef in zip(
        Tfidf_vect.get_feature_names(), lr.coef_[0]
    )
}

print('Logistic Regression & tf-idf\n')
print('Words that predict psychopathy\n')
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:10]:
    print (best_positive)
    
print('======================================')
print('Words that predict non-psychopathy\n')
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:10]:
    print (best_negative)

Logistic Regression & tf-idf

Words that predict psychopathy

('sausage', 0.6277481174226662)
('tai', 0.4634724226341537)
('ruin', 0.3667804121958295)
('bye', 0.2891842181374162)
('ugh', 0.28741289347150106)
('fuk', 0.28369162306603196)
('fukin', 0.28369162306603196)
('argh', 0.2826925270853313)
('bacon', 0.2781631249721203)
('brain', 0.27778190910892475)
Words that predict non-psychopathy

('good', -0.501469942780011)
('lol', -0.4145327869797684)
('work', -0.39758410450642157)
('quot', -0.39049547576999843)
('love', -0.38471334190137174)
('thanks', -0.3234263535041009)
('well', -0.3128893019731597)
('would', -0.2884208311964785)
('amp', -0.2875480584555549)
('one', -0.2786353644233663)


In [35]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), nb.coef_[0]
    )
}

print('Naive Bayes\n')
print('Words that predict psychopathy\n')
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:10]:
    print (best_positive)
    
print('======================================')
print('Words that predict non-psychopathy\n')
    
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:10]:
    print (best_negative)

Naive Bayes

Words that predict psychopathy

('go', -6.911215950611117)
('get', -7.134359501925328)
('like', -7.221370878914957)
('day', -7.316681058719282)
('feel', -7.316681058719282)
('lol', -7.422041574377108)
('miss', -7.422041574377108)
('try', -7.422041574377108)
('wait', -7.422041574377108)
('ca', -7.539824610033492)
Words that predict non-psychopathy

('aa', -9.619266151713328)
('aaa', -9.619266151713328)
('aaaaaaaaaaaaaaaahhhhhhh', -9.619266151713328)
('aaaaaaaaaaaalbum', -9.619266151713328)
('aaaaaaaaah', -9.619266151713328)
('aaaaaaahhh', -9.619266151713328)
('aaaaaaaw', -9.619266151713328)
('aaaaaah', -9.619266151713328)
('aaaaah', -9.619266151713328)
('aaaah', -9.619266151713328)


In [36]:
def predict_psychopath(input_string):
    
    lower_string = input_string.lower()
    token_string = word_tokenize(lower_string)
    
    final_words = []
    for word, tag in pos_tag(token_string):
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            final_words.append(word_final)
            
    transformed_string = Tfidf_vect.transform(final_words)
    
    if lr.predict(transformed_string)[0] == 1:
        print(lr.predict_proba(transformed_string)[0][1])
        return "Psychopathic!!!"
    else:
        print(lr.predict_proba(transformed_string)[0][1])
        return "You are normal"

predict_psychopath("gah. can't sleep. super anxious about getting Lolas shots done in the morning I hate watching my babies go through it! ugh its the worst")

0.08056563689431563


'You are normal'

In [37]:
# Save files
import pickle
pickle.dump(lr, open('logistic_regression_psychopath_model.sav', 'wb'))
pickle.dump(cv, open('text_transformation_model.sav', 'wb'))